From 743dc9084508f9042d347d93ccf791595601f6e3 Mon Sep 17 00:00:00 2001 From: Liam Keegan Date: Fri, 16 Feb 2024 17:20:42 +0100 Subject: [PATCH] Add max_distance (#124) - distance functions have a `max_distance` argument - default value is max value for that type of int (i.e. unchanged) - CPU distance implementations now return early if the distance reached max_distance - for max_distance < 255 we do more intermediate summing of distances to allow early return -> slight reduction in performance if no early return - add benchmarks and tests - bump version - resolves #117 --- .pre-commit-config.yaml | 4 +- CMakeLists.txt | 2 +- README.md | 10 +- ext/CMakeLists.txt | 1 + include/hamming/distance_avx2.hh | 4 +- include/hamming/distance_avx512.hh | 4 +- include/hamming/distance_cuda.hh | 12 ++- include/hamming/distance_neon.hh | 4 +- include/hamming/distance_sse2.hh | 4 +- include/hamming/hamming.hh | 33 ++++--- include/hamming/hamming_impl.hh | 37 ++++--- pyproject.toml | 2 +- python/hammingdist.cc | 15 +-- python/tests/test_hammingdist.py | 22 +++-- src/bench.cc | 7 +- src/bench.hh | 3 +- src/distance_avx2.cc | 14 ++- src/distance_avx2_t.cc | 32 +++--- src/distance_avx512.cc | 14 ++- src/distance_avx512_t.cc | 32 +++--- src/distance_cuda.cu | 37 ++++--- src/distance_cuda_t.cc | 32 +++--- src/distance_neon.cc | 14 ++- src/distance_neon_t.cc | 32 +++--- src/distance_sse2.cc | 14 ++- src/distance_sse2_t.cc | 32 +++--- src/hamming.cc | 11 ++- src/hamming_bench.cc | 22 ++++- src/hamming_impl.cc | 11 ++- src/hamming_impl_t.cc | 124 +++++++++++++---------- src/hamming_t.cc | 153 ++++++++++++++++------------- 31 files changed, 460 insertions(+), 278 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f1e217e..245182b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: - id: mixed-line-ending - repo: https://github.com/psf/black - rev: 23.12.1 + rev: 24.2.0 hooks: - id: black @@ -29,7 +29,7 @@ repos: - id: prettier - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.27.3 + rev: 0.28.0 hooks: - id: check-github-workflows - id: check-readthedocs diff --git a/CMakeLists.txt b/CMakeLists.txt index 69984d6..e45c6c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.23..3.27) project( hammingdist - VERSION 1.0.0 + VERSION 1.1.0 LANGUAGES CXX) include(CTest) diff --git a/README.md b/README.md index 061a51e..df914f8 100644 --- a/README.md +++ b/README.md @@ -89,10 +89,10 @@ import hammingdist sequence_indices = hammingdist.fasta_sequence_indices(fasta_file) ``` -## Large distance values +## Maximum distance values By default, the elements in the distances matrix returned by `hammingdist.from_fasta` have a maximum value of 255. - +You can also set a smaller maximum value using the `max_distance` argument. For distances larger than this `hammingdist.from_fasta_large` supports distances up to 65535 (but uses twice as much RAM) ## Distances from reference sequence @@ -122,12 +122,12 @@ On linux hammingdist is built with OpenMP (multithreading) support, and will aut ## CUDA on linux On linux hammingdist is also built with CUDA (Nvidia GPU) support. -To use the GPU instead of the CPU, set `use_gpu=True` when calling `from_fasta`: +To use the GPU instead of the CPU, set `use_gpu=True` when calling `from_fasta`. Here we also set the maximum distance to 2: ```python import hammingdist -data = hammingdist.from_fasta("example.fasta", use_gpu=True) +data = hammingdist.from_fasta("example.fasta", use_gpu=True, max_distance=2) ``` Additionally, the lower triangular matrix file can now be directly constructed from the fasta file @@ -138,7 +138,7 @@ which means it requires less RAM and runs faster. ```python import hammingdist -hammingdist.from_fasta_to_lower_triangular('input_fasta.txt', 'output_lower_triangular.txt', use_gpu=True) +hammingdist.from_fasta_to_lower_triangular('input_fasta.txt', 'output_lower_triangular.txt', use_gpu=True, max_distance=2) ``` ![overview](plots/speed.png) diff --git a/ext/CMakeLists.txt b/ext/CMakeLists.txt index 897253e..3bf8290 100644 --- a/ext/CMakeLists.txt +++ b/ext/CMakeLists.txt @@ -1,3 +1,4 @@ +set(FMT_INSTALL OFF) add_subdirectory(fmt) if(HAMMING_BUILD_PYTHON) diff --git a/include/hamming/distance_avx2.hh b/include/hamming/distance_avx2.hh index 7823a52..b63c8bb 100644 --- a/include/hamming/distance_avx2.hh +++ b/include/hamming/distance_avx2.hh @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include "hamming/hamming_impl_types.hh" @@ -8,6 +9,7 @@ namespace hamming { int distance_avx2(const std::vector &a, - const std::vector &b); + const std::vector &b, + int max_dist = std::numeric_limits::max()); } diff --git a/include/hamming/distance_avx512.hh b/include/hamming/distance_avx512.hh index a21d414..bd3a89e 100644 --- a/include/hamming/distance_avx512.hh +++ b/include/hamming/distance_avx512.hh @@ -4,10 +4,12 @@ #include #include "hamming/hamming_impl_types.hh" +#include namespace hamming { int distance_avx512(const std::vector &a, - const std::vector &b); + const std::vector &b, + int max_dist = std::numeric_limits::max()); } diff --git a/include/hamming/distance_cuda.hh b/include/hamming/distance_cuda.hh index c7c7e52..3f2ddf2 100644 --- a/include/hamming/distance_cuda.hh +++ b/include/hamming/distance_cuda.hh @@ -2,6 +2,7 @@ #include #include +#include #include #include "hamming/hamming_impl_types.hh" @@ -11,17 +12,20 @@ namespace hamming { bool distance_cuda_have_device(); int distance_cuda(const std::vector &a, - const std::vector &b); + const std::vector &b, + int max_dist = std::numeric_limits::max()); // for now explicit function def for each choice of integer type std::vector -distances_cuda_8bit(const std::vector> &data); +distances_cuda_8bit(const std::vector> &data, + uint8_t max_dist = std::numeric_limits::max()); std::vector -distances_cuda_16bit(const std::vector> &data); +distances_cuda_16bit(const std::vector> &data, + uint16_t max_dist = std::numeric_limits::max()); void distances_cuda_to_lower_triangular( const std::vector> &data, - const std::string &filename); + const std::string &filename, int max_distance); } // namespace hamming diff --git a/include/hamming/distance_neon.hh b/include/hamming/distance_neon.hh index 1cdebd6..c1d8193 100644 --- a/include/hamming/distance_neon.hh +++ b/include/hamming/distance_neon.hh @@ -4,10 +4,12 @@ #include #include "hamming/hamming_impl_types.hh" +#include namespace hamming { int distance_neon(const std::vector &a, - const std::vector &b); + const std::vector &b, + int max_dist = std::numeric_limits::max()); } diff --git a/include/hamming/distance_sse2.hh b/include/hamming/distance_sse2.hh index ac5d2c7..f3f9f08 100644 --- a/include/hamming/distance_sse2.hh +++ b/include/hamming/distance_sse2.hh @@ -4,10 +4,12 @@ #include #include "hamming/hamming_impl_types.hh" +#include namespace hamming { int distance_sse2(const std::vector &a, - const std::vector &b); + const std::vector &b, + int max_dist = std::numeric_limits::max()); } diff --git a/include/hamming/hamming.hh b/include/hamming/hamming.hh index 06f90a0..831e442 100644 --- a/include/hamming/hamming.hh +++ b/include/hamming/hamming.hh @@ -19,10 +19,12 @@ template struct DataSet { explicit DataSet(std::vector &data, bool include_x = false, bool clear_input_data = false, std::vector &&indices = {}, - bool use_gpu = false) + bool use_gpu = false, + int max_distance = std::numeric_limits::max()) : nsamples(data.size()), sequence_indices(std::move(indices)) { validate_data(data); - result = distances(data, include_x, clear_input_data, use_gpu); + result = distances(data, include_x, clear_input_data, use_gpu, + max_distance); } explicit DataSet(const std::string &filename) { @@ -99,9 +101,10 @@ template struct DataSet { std::vector sequence_indices{}; }; -DataSet from_stringlist(std::vector &data, - bool include_x = false, - bool use_gpu = false); +DataSet +from_stringlist(std::vector &data, bool include_x = false, + bool use_gpu = false, + int max_distance = std::numeric_limits::max()); DataSet from_csv(const std::string &filename); @@ -122,19 +125,21 @@ DataSet from_lower_triangular(const std::string &filename) { } template -DataSet from_fasta(const std::string &filename, - bool include_x = false, - bool remove_duplicates = false, - std::size_t n = 0, bool use_gpu = false) { +DataSet +from_fasta(const std::string &filename, bool include_x = false, + bool remove_duplicates = false, std::size_t n = 0, + bool use_gpu = false, + int max_distance = std::numeric_limits::max()) { auto [data, sequence_indices] = read_fasta(filename, remove_duplicates, n); return DataSet(data, include_x, true, - std::move(sequence_indices), use_gpu); + std::move(sequence_indices), use_gpu, + max_distance); } -void from_fasta_to_lower_triangular(const std::string &input_filename, - const std::string &output_filename, - bool remove_duplicates = false, - std::size_t n = 0, bool use_gpu = false); +void from_fasta_to_lower_triangular( + const std::string &input_filename, const std::string &output_filename, + bool remove_duplicates = false, std::size_t n = 0, bool use_gpu = false, + int max_distance = std::numeric_limits::max()); ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1, bool include_x = false); diff --git a/include/hamming/hamming_impl.hh b/include/hamming/hamming_impl.hh index 6f1814f..ad771de 100644 --- a/include/hamming/hamming_impl.hh +++ b/include/hamming/hamming_impl.hh @@ -28,25 +28,30 @@ inline bool cuda_gpu_available() { } typedef int (*distance_func_ptr)(const std::vector &, - const std::vector &); + const std::vector &, int); distance_func_ptr get_fastest_supported_distance_func(); std::array lookupTable(bool include_x = false); -template DistIntType safe_int_cast(int x) { - if (x > std::numeric_limits::max()) { - return std::numeric_limits::max(); +template +DistIntType +safe_int_cast(int x, + DistIntType max_x = std::numeric_limits::max()) { + if (x > max_x) { + return max_x; } return static_cast(x); } void validate_data(const std::vector &data); -int distance_sparse(const SparseData &a, const SparseData &b); +int distance_sparse(const SparseData &a, const SparseData &b, + int max_dist = std::numeric_limits::max()); int distance_cpp(const std::vector &a, - const std::vector &b); + const std::vector &b, + int max_dist = std::numeric_limits::max()); std::vector to_sparse_data(const std::vector &data, bool include_x); @@ -63,8 +68,9 @@ std::vector from_string(const std::string &str); template std::vector distances(std::vector &data, bool include_x, bool clear_input_data, - bool use_gpu) { + bool use_gpu, int max_distance) { std::vector result((data.size() - 1) * data.size() / 2, 0); + auto max_dist = safe_int_cast(max_distance); auto start_time = std::chrono::high_resolution_clock::now(); auto print_timing = [&start_time](const std::string &event, bool final = false) { @@ -118,13 +124,14 @@ std::vector distances(std::vector &data, } print_timing("pre-processing"); #ifdef HAMMING_WITH_OPENMP -#pragma omp parallel for default(none) shared(result, sparse, nsamples) +#pragma omp parallel for default(none) \ + shared(result, sparse, nsamples, max_dist) #endif for (std::size_t i = 0; i < nsamples; ++i) { std::size_t offset{i * (i - 1) / 2}; for (std::size_t j = 0; j < i; ++j) { - result[offset + j] = - safe_int_cast(distance_sparse(sparse[i], sparse[j])); + result[offset + j] = safe_int_cast( + distance_sparse(sparse[i], sparse[j], max_dist)); } } print_timing("distance calculation", true); @@ -142,9 +149,9 @@ std::vector distances(std::vector &data, std::cout << "# hammingdist :: Using GPU..." << std::endl; print_timing("pre-processing"); if constexpr (sizeof(DistIntType) == 1) { - return distances_cuda_8bit(dense); + return distances_cuda_8bit(dense, max_dist); } else if constexpr (sizeof(DistIntType) == 2) { - return distances_cuda_16bit(dense); + return distances_cuda_16bit(dense, max_dist); } else { throw std::runtime_error("No GPU implementation available"); } @@ -155,13 +162,13 @@ std::vector distances(std::vector &data, print_timing("pre-processing"); #ifdef HAMMING_WITH_OPENMP #pragma omp parallel for schedule(static, 1) default(none) \ - shared(result, dense, nsamples, distance_func) + shared(result, dense, nsamples, distance_func, max_dist) #endif for (std::size_t i = 0; i < nsamples; ++i) { std::size_t offset{i * (i - 1) / 2}; for (std::size_t j = 0; j < i; ++j) { - result[offset + j] = - safe_int_cast(distance_func(dense[i], dense[j])); + result[offset + j] = safe_int_cast( + distance_func(dense[i], dense[j], max_dist)); } } print_timing("distance calculation", true); diff --git a/pyproject.toml b/pyproject.toml index 6bcd607..c42da3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "hammingdist" -version = "1.0.0" +version = "1.1.0" description = "A fast tool to calculate Hamming distances" readme = "README.md" license = {text = "MIT"} diff --git a/python/hammingdist.cc b/python/hammingdist.cc index b78205f..76261d8 100644 --- a/python/hammingdist.cc +++ b/python/hammingdist.cc @@ -60,25 +60,28 @@ PYBIND11_MODULE(hammingdist, m) { m.def("from_fasta", &from_fasta, py::arg("filename"), py::arg("include_x") = false, py::arg("remove_duplicates") = false, py::arg("n") = 0, py::arg("use_gpu") = false, + py::arg("max_distance") = 255, "Creates a dataset by reading from a fasta file (assuming all " "sequences have equal length). Maximum value of an element in the " - "distances matrix: 255. Distances that would have been larger than " - "this value saturate at 255 - to support genomes with larger distances " - "than this see `from_fasta_large` instead."); + "distances matrix: max_distance or 255, whichever is lower." + "Distances that would have been larger than " + "this value instead saturate at this value - to support genomes with " + "larger distances than this see `from_fasta_large` instead."); m.def("from_fasta_large", &from_fasta, py::arg("filename"), py::arg("include_x") = false, py::arg("remove_duplicates") = false, py::arg("n") = 0, py::arg("use_gpu") = false, + py::arg("max_distance") = 65535, "Creates a dataset by reading from a fasta file (assuming all " "sequences have equal length). Maximum value of an element in the " - "distances matrix: 65535"); + "distances matrix: max_distance or 65535, whichever is lower"); m.def("from_fasta_to_lower_triangular", &from_fasta_to_lower_triangular, py::arg("fasta_filename"), py::arg("output_filename"), py::arg("remove_duplicates") = false, py::arg("n") = 0, - py::arg("use_gpu") = true, + py::arg("use_gpu") = true, py::arg("max_distance") = 65535, "Construct lower triangular distances matrix output file from the " "fasta file," "requires an NVIDIA GPU. Maximum value of an element in " - "the distances matrix: 65535"); + "the distances matrix: max_distance or 65535, whichever is lower"); m.def("from_lower_triangular", &from_lower_triangular, "Creates a dataset by reading already computed distances from lower " "triangular format. Maximum value of an element in the distances " diff --git a/python/tests/test_hammingdist.py b/python/tests/test_hammingdist.py index 581ba1f..f7704f7 100644 --- a/python/tests/test_hammingdist.py +++ b/python/tests/test_hammingdist.py @@ -95,14 +95,18 @@ def test_from_fasta(from_fasta_func, use_gpu, tmp_path): (["A", "C", "G", "T", "-", "X"], True), ], ) -def test_fasta_reference_distances(chars, include_x, tmp_path): +@pytest.mark.parametrize("max_distance", [0, 1, 2, 3, 89, 497, 9999999]) +def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path): # generate 50 sequences, each with 25 characters sequences = ["".join(random.choices(chars, k=25)) for i in range(50)] fasta_file = str(tmp_path / "fasta.txt") write_fasta_file(fasta_file, sequences) # calculate distances matrix data = hammingdist.from_fasta( - fasta_file, remove_duplicates=False, include_x=include_x + fasta_file, + remove_duplicates=False, + include_x=include_x, + max_distance=max_distance, ) # use each sequence in turn as the reference sequence & calculate reference distances for i, sequence in enumerate(sequences): @@ -115,7 +119,7 @@ def test_fasta_reference_distances(chars, include_x, tmp_path): for j, dist in enumerate(vec): # if x is not included, invalid chars have distance 1 but data[i,i] returns 0 by construction if include_x or i != j: - assert data[i, j] == dist + assert data[i, j] == min(max_distance, dist) # should also agree with output of distance function for these two sequences assert dist == hammingdist.distance( sequences[i], sequences[j], include_x=include_x @@ -136,17 +140,23 @@ def test_distance(): not hammingdist.cuda_gpu_available(), reason="No CUDA GPU available or hammingdist was compiled without CUDA support", ) -def test_from_fasta_to_lower_triangular(tmp_path): +@pytest.mark.parametrize("max_distance", [0, 1, 2, 3, 89, 497, 9999999]) +def test_from_fasta_to_lower_triangular(tmp_path, max_distance): sequences = [ "ACGTGTCGTGTCGACGTGTCGCAGGTGTCGACGTGTCGCAGGTGTCGACGTGTCGCAG", "CCGTGTCGTGTCGACGTGTCGC-GGTGTCGACGTGTCGCAGGTGTCGACGTGTCGCAG", "CAGTGT-GTGTCGACGTGTCGCAGGTGTCGACGTGTCGCAGGTGTCGACGTG--GCAG", ] - lower_triangular_dist = [[1], [2, 1]] + lower_triangular_dist = [ + [min(1, max_distance)], + [min(2, max_distance), min(1, max_distance)], + ] fasta_file = str(tmp_path / "fasta.txt") output_file = str(tmp_path / "out.txt") write_fasta_file(fasta_file, sequences) - hammingdist.from_fasta_to_lower_triangular(fasta_file, output_file) + hammingdist.from_fasta_to_lower_triangular( + fasta_file, output_file, max_distance=max_distance + ) with open(output_file) as f: data = f.read().splitlines() assert len(data) == 2 diff --git a/src/bench.cc b/src/bench.cc index c35a30c..30bffad 100644 --- a/src/bench.cc +++ b/src/bench.cc @@ -40,13 +40,14 @@ std::vector make_stringlist(int64_t n, int64_t n_samples, } void write_fasta(const std::string &filename, const std::string &seq, - std::size_t n_seq, std::mt19937 &gen) { + std::size_t n_seq, std::mt19937 &gen, + std::size_t randomise_every_n) { std::ofstream fs; fs.open(filename); for (std::size_t i = 0; i < n_seq; ++i) { auto randomized_seq{seq}; - // make ~0.5% of each sequence differ from seq - randomize_n(randomized_seq, seq.size() / 200, gen); + // randomly replace 1 in randomise_every_n elements of seq + randomize_n(randomized_seq, seq.size() / randomise_every_n, gen); fs << ">seq" << i << "\n" << randomized_seq << "\n"; } fs.close(); diff --git a/src/bench.hh b/src/bench.hh index 52498cb..39f2fcc 100644 --- a/src/bench.hh +++ b/src/bench.hh @@ -15,7 +15,8 @@ std::vector make_stringlist(int64_t n, int64_t n_samples, std::mt19937 &gen); void write_fasta(const std::string &filename, const std::string &seq, - std::size_t n_seq, std::mt19937 &gen); + std::size_t n_seq, std::mt19937 &gen, + std::size_t randomise_every_n = 200); template std::vector make_distances(int64_t n, std::mt19937 &gen) { diff --git a/src/distance_avx2.cc b/src/distance_avx2.cc index 7bf74b2..684e306 100644 --- a/src/distance_avx2.cc +++ b/src/distance_avx2.cc @@ -4,7 +4,7 @@ namespace hamming { int distance_avx2(const std::vector &a, - const std::vector &b) { + const std::vector &b, int max_dist) { // distance implementation using AVX2 simd intrinsics // a 256-bit register holds 32 GeneBlocks, i.e. 64 genes constexpr std::size_t n_geneblocks{32}; @@ -24,8 +24,11 @@ int distance_avx2(const std::vector &a, std::size_t n_iter{a.size() / n_geneblocks}; // each partial distance count is stored in a unit8, so max value = 255, // and the value can be increased by at most 2 with each iteration, - // so we do 127 inner iterations for a max value of 254 to avoid overflow - std::size_t n_inner{127}; + // so up to 127 inner iterations for a max value of 254 avoid overflow. + // if max_dist is large then we maximise the number of inner iterations, but + // if it is small then we do fewer inner iterations to allow more + // opportunities to return early if we have reached max_dist + std::size_t n_inner = max_dist >= 255 ? 127 : 16; std::size_t n_outer{1 + n_iter / n_inner}; for (std::size_t j = 0; j < n_outer; ++j) { std::size_t n{std::min((j + 1) * n_inner, n_iter)}; @@ -60,6 +63,9 @@ int distance_avx2(const std::vector &a, for (std::size_t i = 0; i < n_partialsums; ++i) { r += r_partial[i]; } + if (r >= max_dist) { + return max_dist; + } } // do last partial block without simd intrinsics for (std::size_t i = n_geneblocks * n_iter; i < a.size(); ++i) { @@ -67,7 +73,7 @@ int distance_avx2(const std::vector &a, r += static_cast((c & mask_gene0) == 0); r += static_cast((c & mask_gene1) == 0); } - return r; + return std::min(max_dist, r); } } // namespace hamming diff --git a/src/distance_avx2_t.cc b/src/distance_avx2_t.cc index 68d46d3..e026d7e 100644 --- a/src/distance_avx2_t.cc +++ b/src/distance_avx2_t.cc @@ -16,9 +16,12 @@ TEST_CASE("distance_avx2() returns all return zero for identical vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - REQUIRE(distance_avx2(g1, g1) == 0); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + REQUIRE(distance_avx2(g1, g1, max_dist) == 0); + } } } @@ -34,10 +37,13 @@ TEST_CASE("distance_avx2() all return n for n A's and n G's", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1 = from_string(std::string(n, 'A')); - auto g2 = from_string(std::string(n, 'G')); - REQUIRE(distance_avx2(g1, g2) == n); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1 = from_string(std::string(n, 'A')); + auto g2 = from_string(std::string(n, 'G')); + REQUIRE(distance_avx2(g1, g2, max_dist) == std::min(max_dist, n)); + } } } @@ -54,9 +60,13 @@ TEST_CASE("distance_avx2() returns same as distance_cpp() for random vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - auto g2{make_gene_vector(n, gen)}; - REQUIRE(distance_avx2(g1, g2) == distance_cpp(g1, g2)); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + auto g2{make_gene_vector(n, gen)}; + REQUIRE(distance_avx2(g1, g2, max_dist) == + distance_cpp(g1, g2, max_dist)); + } } } diff --git a/src/distance_avx512.cc b/src/distance_avx512.cc index 0694621..3e2d40f 100644 --- a/src/distance_avx512.cc +++ b/src/distance_avx512.cc @@ -4,7 +4,7 @@ namespace hamming { int distance_avx512(const std::vector &a, - const std::vector &b) { + const std::vector &b, int max_dist) { // distance implementation using AVX512 simd intrinsics // a 512-bit register holds 64 GeneBlocks, i.e. 128 genes constexpr std::size_t n_geneblocks{64}; @@ -26,8 +26,11 @@ int distance_avx512(const std::vector &a, std::size_t n_iter{a.size() / n_geneblocks}; // each partial distance count is stored in a unit8, so max value = 255, // and the value can be increased by at most 2 with each iteration, - // so we do 127 inner iterations for a max value of 254 to avoid overflow - std::size_t n_inner{127}; + // so up to 127 inner iterations for a max value of 254 avoid overflow. + // if max_dist is large then we maximise the number of inner iterations, but + // if it is small then we do fewer inner iterations to allow more + // opportunities to return early if we have reached max_dist + std::size_t n_inner = max_dist >= 255 ? 127 : 16; std::size_t n_outer{1 + n_iter / n_inner}; for (std::size_t j = 0; j < n_outer; ++j) { std::size_t n{std::min((j + 1) * n_inner, n_iter)}; @@ -59,6 +62,9 @@ int distance_avx512(const std::vector &a, for (std::size_t i = 0; i < n_partialsums; ++i) { r += r_partial[i]; } + if (r >= max_dist) { + return max_dist; + } } // do last partial block without simd intrinsics for (std::size_t i = n_geneblocks * n_iter; i < a.size(); ++i) { @@ -66,7 +72,7 @@ int distance_avx512(const std::vector &a, r += static_cast((c & mask_gene0) == 0); r += static_cast((c & mask_gene1) == 0); } - return r; + return std::min(max_dist, r); } } // namespace hamming diff --git a/src/distance_avx512_t.cc b/src/distance_avx512_t.cc index c067453..a7da8ce 100644 --- a/src/distance_avx512_t.cc +++ b/src/distance_avx512_t.cc @@ -17,9 +17,12 @@ TEST_CASE("distance_avx512() returns all return zero for identical vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - REQUIRE(distance_avx512(g1, g1) == 0); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + REQUIRE(distance_avx512(g1, g1, max_dist) == 0); + } } } @@ -35,10 +38,13 @@ TEST_CASE("distance_avx512() all return n for n A's and n G's", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1 = from_string(std::string(n, 'A')); - auto g2 = from_string(std::string(n, 'G')); - REQUIRE(distance_avx512(g1, g2) == n); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1 = from_string(std::string(n, 'A')); + auto g2 = from_string(std::string(n, 'G')); + REQUIRE(distance_avx512(g1, g2, max_dist) == std::min(max_dist, n)); + } } } @@ -55,9 +61,13 @@ TEST_CASE("distance_avx512() returns same as distance_cpp() for random vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - auto g2{make_gene_vector(n, gen)}; - REQUIRE(distance_avx512(g1, g2) == distance_cpp(g1, g2)); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + auto g2{make_gene_vector(n, gen)}; + REQUIRE(distance_avx512(g1, g2, max_dist) == + distance_cpp(g1, g2, max_dist)); + } } } diff --git a/src/distance_cuda.cu b/src/distance_cuda.cu index 9ddfcb5..8a1cc33 100644 --- a/src/distance_cuda.cu +++ b/src/distance_cuda.cu @@ -7,9 +7,10 @@ namespace hamming { template -__global__ void Dist(DistIntType *partial_distances, const std::uint8_t *genes, - std::uint64_t distances_offset, - unsigned int geneBlocksPerSample) { +__global__ void +Dist(DistIntType *partial_distances, const std::uint8_t *genes, + std::uint64_t distances_offset, unsigned int geneBlocksPerSample, + DistIntType max_dist = cuda::std::numeric_limits::max()) { // Calculates all gridDim.x entries of the partial_distances array. // // The full distances array is a flat nsamples * (nsamples - 1) / 2 element @@ -82,8 +83,7 @@ __global__ void Dist(DistIntType *partial_distances, const std::uint8_t *genes, sum += __shfl_down_sync(FULL_MASK, sum, offset); } if (threadIndex == 0) { - auto maxDist{cuda::std::numeric_limits::max()}; - partial_distances[distancesIndex] = sum > maxDist ? maxDist : sum; + partial_distances[distancesIndex] = sum > max_dist ? max_dist : sum; } } } @@ -91,7 +91,8 @@ __global__ void Dist(DistIntType *partial_distances, const std::uint8_t *genes, template std::vector distances_cuda(const std::vector> &data, - const std::string &filename = {}) { + const std::string &filename = {}, + DistIntType max_dist = std::numeric_limits::max()) { std::vector distances{}; std::size_t timing_gpu_ms = 0; std::size_t timing_io_ms = 0; @@ -150,7 +151,8 @@ distances_cuda(const std::vector> &data, // this call returns immediately and the kernel runs asynchronously on the // GPU Dist<<>>( - partial_distances, genes, distances_offset, geneBlocksPerSample); + partial_distances, genes, distances_offset, geneBlocksPerSample, + max_dist); cudaEventRecord(stop); if (auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); @@ -200,26 +202,31 @@ distances_cuda(const std::vector> &data, } std::vector -distances_cuda_8bit(const std::vector> &data) { - return distances_cuda(data, {}); +distances_cuda_8bit(const std::vector> &data, + uint8_t max_dist) { + return distances_cuda(data, {}, max_dist); } std::vector -distances_cuda_16bit(const std::vector> &data) { - return distances_cuda(data, {}); +distances_cuda_16bit(const std::vector> &data, + uint16_t max_dist) { + return distances_cuda(data, {}, max_dist); } void distances_cuda_to_lower_triangular( const std::vector> &data, - const std::string &filename) { - distances_cuda(data, filename); + const std::string &filename, int max_distance) { + uint16_t max_dist = max_distance > std::numeric_limits::max() + ? std::numeric_limits::max() + : static_cast(max_distance); + distances_cuda(data, filename, max_dist); } int distance_cuda(const std::vector &a, - const std::vector &b) { + const std::vector &b, int max_dist) { // wrapper for testing cuda kernel with existing distance API std::vector> data{a, b}; - return distances_cuda(data, {})[0]; + return distances_cuda(data, {}, max_dist)[0]; } bool distance_cuda_have_device() { diff --git a/src/distance_cuda_t.cc b/src/distance_cuda_t.cc index dbb5b55..3acea9e 100644 --- a/src/distance_cuda_t.cc +++ b/src/distance_cuda_t.cc @@ -16,9 +16,12 @@ TEST_CASE("distance_cuda() returns all return zero for identical vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - REQUIRE(distance_cuda(g1, g1) == 0); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + REQUIRE(distance_cuda(g1, g1, max_dist) == 0); + } } } @@ -34,10 +37,13 @@ TEST_CASE("distance_cuda() all return n for n A's and n G's", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1 = from_string(std::string(n, 'A')); - auto g2 = from_string(std::string(n, 'G')); - REQUIRE(distance_cuda(g1, g2) == n); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1 = from_string(std::string(n, 'A')); + auto g2 = from_string(std::string(n, 'G')); + REQUIRE(distance_cuda(g1, g2, max_dist) == std::min(max_dist, n)); + } } } @@ -54,9 +60,13 @@ TEST_CASE("distance_cuda() returns same as distance_cpp() for random vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - auto g2{make_gene_vector(n, gen)}; - REQUIRE(distance_cuda(g1, g2) == distance_cpp(g1, g2)); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + auto g2{make_gene_vector(n, gen)}; + REQUIRE(distance_cuda(g1, g2, max_dist) == + distance_cpp(g1, g2, max_dist)); + } } } diff --git a/src/distance_neon.cc b/src/distance_neon.cc index 8e8783e..eaccb5f 100644 --- a/src/distance_neon.cc +++ b/src/distance_neon.cc @@ -4,7 +4,7 @@ namespace hamming { int distance_neon(const std::vector &a, - const std::vector &b) { + const std::vector &b, int max_dist) { // distance implementation using NEON simd intrinsics // a 128-bit register holds 16 GeneBlocks, i.e. 32 genes constexpr std::size_t n_geneblocks{16}; @@ -24,8 +24,11 @@ int distance_neon(const std::vector &a, std::size_t n_iter{a.size() / n_geneblocks}; // each partial distance count is stored in a uint8, so max value = 255, // and the value can be increased by at most 2 with each iteration, - // so we do 127 inner iterations for a max value of 254 to avoid overflow - std::size_t n_inner{127}; + // so up to 127 inner iterations for a max value of 254 avoid overflow. + // if max_dist is large then we maximise the number of inner iterations, but + // if it is small then we do fewer inner iterations to allow more + // opportunities to return early if we have reached max_dist + std::size_t n_inner = max_dist >= 255 ? 127 : 16; std::size_t n_outer{1 + n_iter / n_inner}; for (std::size_t j = 0; j < n_outer; ++j) { std::size_t n{std::min((j + 1) * n_inner, n_iter)}; @@ -52,6 +55,9 @@ int distance_neon(const std::vector &a, } // sum the 16 distances in r_s & add to r r += vaddlvq_u8(r_s); + if (r >= max_dist) { + return max_dist; + } } // do last partial block without simd intrinsics for (std::size_t i = n_geneblocks * n_iter; i < a.size(); ++i) { @@ -59,7 +65,7 @@ int distance_neon(const std::vector &a, r += static_cast((c & mask_gene0) == 0); r += static_cast((c & mask_gene1) == 0); } - return r; + return std::min(max_dist, r); } } // namespace hamming diff --git a/src/distance_neon_t.cc b/src/distance_neon_t.cc index 0f3a7a8..3060170 100644 --- a/src/distance_neon_t.cc +++ b/src/distance_neon_t.cc @@ -16,9 +16,12 @@ TEST_CASE("distance_neon() returns all return zero for identical vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - REQUIRE(distance_neon(g1, g1) == 0); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + REQUIRE(distance_neon(g1, g1, max_dist) == 0); + } } } @@ -34,10 +37,13 @@ TEST_CASE("distance_neon() all return n for n A's and n G's", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1 = from_string(std::string(n, 'A')); - auto g2 = from_string(std::string(n, 'G')); - REQUIRE(distance_neon(g1, g2) == n); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1 = from_string(std::string(n, 'A')); + auto g2 = from_string(std::string(n, 'G')); + REQUIRE(distance_neon(g1, g2, max_dist) == std::min(max_dist, n)); + } } } @@ -54,9 +60,13 @@ TEST_CASE("distance_neon() returns same as distance_cpp() for random vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - auto g2{make_gene_vector(n, gen)}; - REQUIRE(distance_neon(g1, g2) == distance_cpp(g1, g2)); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + auto g2{make_gene_vector(n, gen)}; + REQUIRE(distance_neon(g1, g2, max_dist) == + distance_cpp(g1, g2, max_dist)); + } } } diff --git a/src/distance_sse2.cc b/src/distance_sse2.cc index abb8701..d733c86 100644 --- a/src/distance_sse2.cc +++ b/src/distance_sse2.cc @@ -4,7 +4,7 @@ namespace hamming { int distance_sse2(const std::vector &a, - const std::vector &b) { + const std::vector &b, int max_dist) { // distance implementation using SSE2 simd intrinsics // a 128-bit register holds 16 GeneBlocks, i.e. 32 genes constexpr std::size_t n_geneblocks{16}; @@ -24,8 +24,11 @@ int distance_sse2(const std::vector &a, std::size_t n_iter{a.size() / n_geneblocks}; // each partial distance count is stored in a unit8, so max value = 255, // and the value can be increased by at most 2 with each iteration, - // so we do 127 inner iterations for a max value of 254 to avoid overflow - std::size_t n_inner{127}; + // so up to 127 inner iterations for a max value of 254 avoid overflow. + // if max_dist is large then we maximise the number of inner iterations, but + // if it is small then we do fewer inner iterations to allow more + // opportunities to return early if we have reached max_dist + std::size_t n_inner = max_dist >= 255 ? 127 : 16; std::size_t n_outer{1 + n_iter / n_inner}; for (std::size_t j = 0; j < n_outer; ++j) { std::size_t n{std::min((j + 1) * n_inner, n_iter)}; @@ -60,6 +63,9 @@ int distance_sse2(const std::vector &a, for (std::size_t i = 0; i < n_partialsums; ++i) { r += r_partial[i]; } + if (r >= max_dist) { + return max_dist; + } } // do last partial block without simd intrinsics for (std::size_t i = n_geneblocks * n_iter; i < a.size(); ++i) { @@ -67,7 +73,7 @@ int distance_sse2(const std::vector &a, r += static_cast((c & mask_gene0) == 0); r += static_cast((c & mask_gene1) == 0); } - return r; + return std::min(max_dist, r); } } // namespace hamming diff --git a/src/distance_sse2_t.cc b/src/distance_sse2_t.cc index 3de54f0..8dfc4f5 100644 --- a/src/distance_sse2_t.cc +++ b/src/distance_sse2_t.cc @@ -16,9 +16,12 @@ TEST_CASE("distance_sse2() returns all return zero for identical vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - REQUIRE(distance_sse2(g1, g1) == 0); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + REQUIRE(distance_sse2(g1, g1, max_dist) == 0); + } } } @@ -34,10 +37,13 @@ TEST_CASE("distance_sse2() all return n for n A's and n G's", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1 = from_string(std::string(n, 'A')); - auto g2 = from_string(std::string(n, 'G')); - REQUIRE(distance_sse2(g1, g2) == n); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1 = from_string(std::string(n, 'A')); + auto g2 = from_string(std::string(n, 'G')); + REQUIRE(distance_sse2(g1, g2, max_dist) == std::min(max_dist, n)); + } } } @@ -54,9 +60,13 @@ TEST_CASE("distance_sse2() returns same as distance_cpp() for random vectors", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - auto g2{make_gene_vector(n, gen)}; - REQUIRE(distance_sse2(g1, g2) == distance_cpp(g1, g2)); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + auto g2{make_gene_vector(n, gen)}; + REQUIRE(distance_sse2(g1, g2, max_dist) == + distance_cpp(g1, g2, max_dist)); + } } } diff --git a/src/hamming.cc b/src/hamming.cc index 75b26a7..8984f66 100644 --- a/src/hamming.cc +++ b/src/hamming.cc @@ -14,8 +14,10 @@ namespace hamming { DataSet from_stringlist(std::vector &data, - bool include_x, bool use_gpu) { - return DataSet(data, include_x, false, {}, use_gpu); + bool include_x, bool use_gpu, + int max_distance) { + return DataSet(data, include_x, false, {}, use_gpu, + max_distance); } DataSet from_csv(const std::string &filename) { @@ -25,7 +27,7 @@ DataSet from_csv(const std::string &filename) { void from_fasta_to_lower_triangular(const std::string &input_filename, const std::string &output_filename, bool remove_duplicates, std::size_t n, - bool use_gpu) { + bool use_gpu, int max_distance) { if (use_gpu) { std::cout << "# hammingdist :: Using GPU..." << std::endl; } @@ -40,7 +42,8 @@ void from_fasta_to_lower_triangular(const std::string &input_filename, << " ms..." << std::endl; #ifdef HAMMING_WITH_CUDA if (use_gpu) { - distances_cuda_to_lower_triangular(dense_data, output_filename); + distances_cuda_to_lower_triangular(dense_data, output_filename, + max_distance); return; } #endif diff --git a/src/hamming_bench.cc b/src/hamming_bench.cc index 148b845..c4e253f 100644 --- a/src/hamming_bench.cc +++ b/src/hamming_bench.cc @@ -51,7 +51,7 @@ static void bench_from_fasta_to_lower_triangular_gpu(benchmark::State &state) { auto reference_seq{make_string(sampleLength, gen, true)}; write_fasta(fasta_file, reference_seq, state.range(0), gen); for (auto _ : state) { - from_fasta_to_lower_triangular(fasta_file, lt_file, false); + from_fasta_to_lower_triangular(fasta_file, lt_file, false, 0, true); } state.SetComplexityN(n); } @@ -67,6 +67,26 @@ static void bench_fasta_reference_distances(benchmark::State &state) { } } +static void bench_from_fasta_max_dist(benchmark::State &state) { +#ifdef HAMMING_WITH_OPENMP + omp_set_num_threads(1); +#endif + std::mt19937 gen(12345); + int64_t max_dist{state.range(0)}; + int64_t randomise_every_n{state.range(1)}; + std::string fasta_file{benchmark_tmp_input_file}; + auto reference_seq{make_string(sampleLength, gen, true)}; + write_fasta(fasta_file, reference_seq, 4096, gen, randomise_every_n); + for (auto _ : state) { + from_fasta(fasta_file, false, false, 0, false, + max_dist); + } +} + +BENCHMARK(bench_from_fasta_max_dist) + ->ArgsProduct({{2, 255}, + {10, 50, 100, 300, 500, 1000, 5000, 10000, 30000, + 10000000}}); BENCHMARK(bench_from_stringlist) ->RangeMultiplier(2) ->Range(16, 1024) diff --git a/src/hamming_impl.cc b/src/hamming_impl.cc index e2109c9..b1f5256 100644 --- a/src/hamming_impl.cc +++ b/src/hamming_impl.cc @@ -90,7 +90,7 @@ void validate_data(const std::vector &data) { } } -int distance_sparse(const SparseData &a, const SparseData &b) { +int distance_sparse(const SparseData &a, const SparseData &b, int max_dist) { int r{0}; std::size_t ia{0}; std::size_t ib{0}; @@ -113,6 +113,9 @@ int distance_sparse(const SparseData &a, const SparseData &b) { ia += 2; ib += 2; } + if (r >= max_dist) { + return max_dist; + } } while (ia < a.size()) { r += static_cast(a[ia + 1] != 0xff); @@ -122,18 +125,18 @@ int distance_sparse(const SparseData &a, const SparseData &b) { r += static_cast(b[ib + 1] != 0xff); ib += 2; } - return r; + return std::min(r, max_dist); } int distance_cpp(const std::vector &a, - const std::vector &b) { + const std::vector &b, int max_dist) { int r{0}; for (std::size_t i = 0; i < a.size(); ++i) { auto c{static_cast(a[i] & b[i])}; r += static_cast((c & mask_gene0) == 0) + static_cast((c & mask_gene1) == 0); } - return r; + return std::min(r, max_dist); } static std::string diff --git a/src/hamming_impl_t.cc b/src/hamming_impl_t.cc index 6a0cadc..eea34ca 100644 --- a/src/hamming_impl_t.cc +++ b/src/hamming_impl_t.cc @@ -16,9 +16,12 @@ TEST_CASE("distance_cpp() returns zero for identical vectors of valid chars", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1{make_gene_vector(n, gen)}; - REQUIRE(distance_cpp(g1, g1) == 0); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1{make_gene_vector(n, gen)}; + REQUIRE(distance_cpp(g1, g1, max_dist) == 0); + } } } @@ -33,10 +36,13 @@ TEST_CASE("distance_cpp() returns n for n A's and n G's", "[impl][distance]") { 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto g1 = from_string(std::string(n, 'A')); - auto g2 = from_string(std::string(n, 'G')); - REQUIRE(distance_cpp(g1, g2) == n); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto g1 = from_string(std::string(n, 'A')); + auto g2 = from_string(std::string(n, 'G')); + REQUIRE(distance_cpp(g1, g2, max_dist) == std::min(max_dist, n)); + } } } @@ -53,12 +59,15 @@ TEST_CASE("distance_sparse() returns zero for identical vectors of valid chars", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - for (bool include_x : {false, true}) { - auto g1{make_test_string(n, gen, include_x)}; - CAPTURE(include_x); - auto sparse = to_sparse_data({g1, g1}, include_x); - REQUIRE(distance_sparse(sparse[0], sparse[1]) == 0); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + for (bool include_x : {false, true}) { + auto g1{make_test_string(n, gen, include_x)}; + CAPTURE(include_x); + auto sparse = to_sparse_data({g1, g1}, include_x); + REQUIRE(distance_sparse(sparse[0], sparse[1], max_dist) == 0); + } } } } @@ -75,15 +84,19 @@ TEST_CASE("distance_sparse() returns n for n A's and n G's", 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - for (char c1 : {'A', 'X'}) { - for (char c2 : {'G', 'T'}) { - auto g1 = std::string(n, c1); - auto g2 = std::string(n, c2); - for (bool include_x : {false, true}) { - CAPTURE(include_x); - auto sparse = to_sparse_data({g1, g2}, include_x); - REQUIRE(distance_sparse(sparse[0], sparse[1]) == n); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + for (char c1 : {'A', 'X'}) { + for (char c2 : {'G', 'T'}) { + auto g1 = std::string(n, c1); + auto g2 = std::string(n, c2); + for (bool include_x : {false, true}) { + CAPTURE(include_x); + auto sparse = to_sparse_data({g1, g2}, include_x); + REQUIRE(distance_sparse(sparse[0], sparse[1], max_dist) == + std::min(max_dist, n)); + } } } } @@ -104,15 +117,19 @@ TEST_CASE("distance_sparse() returns same as distance_cpp() for random vectors " 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - auto s1{make_test_string(n, gen)}; - auto s2{make_test_string(n, gen)}; - auto g1{from_string(s1)}; - auto g2{from_string(s2)}; - for (bool include_x : {false, true}) { - CAPTURE(include_x); - auto sparse = to_sparse_data({s1, s2}, include_x); - REQUIRE(distance_sparse(sparse[0], sparse[1]) == distance_cpp(g1, g2)); + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + auto s1{make_test_string(n, gen)}; + auto s2{make_test_string(n, gen)}; + auto g1{from_string(s1)}; + auto g2{from_string(s2)}; + for (bool include_x : {false, true}) { + CAPTURE(include_x); + auto sparse = to_sparse_data({s1, s2}, include_x); + REQUIRE(distance_sparse(sparse[0], sparse[1], max_dist) == + distance_cpp(g1, g2, max_dist)); + } } } } @@ -131,30 +148,33 @@ TEST_CASE("distance_sparse() returns same as distance_cpp() for equal-distance " 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, 1048576, 1048577}) { - CAPTURE(n); - // calculate distance of strings with no X's - auto s1{make_test_string(n, gen)}; - auto s2{make_test_string(n, gen)}; - auto g1{from_string(s1)}; - auto g2{from_string(s2)}; - auto dist = distance_cpp(g1, g2); - // replace all A's with X's : sparse distance with X as valid char should be - // the same - for (char replace_char : {'A', 'C', 'G', 'T'}) { - auto s1_x = s1; - auto s2_x = s2; - for (auto &c : s1_x) { - if (c == replace_char) { - c = 'X'; + for (int max_dist : {0, 1, 2, 11, 999, 9876544}) { + CAPTURE(n); + CAPTURE(max_dist); + // calculate distance of strings with no X's + auto s1{make_test_string(n, gen)}; + auto s2{make_test_string(n, gen)}; + auto g1{from_string(s1)}; + auto g2{from_string(s2)}; + auto dist = distance_cpp(g1, g2, max_dist); + // replace all A's with X's : sparse distance with X as valid char should + // be the same + for (char replace_char : {'A', 'C', 'G', 'T'}) { + auto s1_x = s1; + auto s2_x = s2; + for (auto &c : s1_x) { + if (c == replace_char) { + c = 'X'; + } } - } - for (auto &c : s2_x) { - if (c == replace_char) { - c = 'X'; + for (auto &c : s2_x) { + if (c == replace_char) { + c = 'X'; + } } + auto sparse = to_sparse_data({s1_x, s2_x}, true); + REQUIRE(distance_sparse(sparse[0], sparse[1], max_dist) == dist); } - auto sparse = to_sparse_data({s1_x, s2_x}, true); - REQUIRE(distance_sparse(sparse[0], sparse[1]) == dist); } } } diff --git a/src/hamming_t.cc b/src/hamming_t.cc index 0b94e61..b595bb5 100644 --- a/src/hamming_t.cc +++ b/src/hamming_t.cc @@ -183,31 +183,34 @@ TEST_CASE("from_fasta single line sequences with duplicates", "[hamming]") { std::vector sequence_indices{0, 1, 1, 1, 0, 2}; for (auto use_gpu : valid_use_gpu_values()) { for (int n : {0, 6, 22}) { - for (bool include_x : {false, true}) { - CAPTURE(include_x); - CAPTURE(use_gpu); - if (use_gpu && include_x) { - // include_x cannot be used with use_gpu - REQUIRE_THROWS( - from_fasta(tmp_file_name, include_x, true, n, use_gpu)); - } else { - auto d = - from_fasta(tmp_file_name, include_x, true, n, use_gpu); - REQUIRE(d.nsamples == 3); - REQUIRE(d.sequence_indices == sequence_indices); - REQUIRE(d[{0, 0}] == 0); - REQUIRE(d[{0, 1}] == 2); - REQUIRE(d[{0, 2}] == 1); - REQUIRE(d[{1, 0}] == 2); - REQUIRE(d[{1, 1}] == 0); - REQUIRE(d[{1, 2}] == 2); - REQUIRE(d[{2, 0}] == 1); - REQUIRE(d[{2, 1}] == 2); - REQUIRE(d[{2, 2}] == 0); + for (int max_dist : {0, 1, 2, 3, 999999999}) { + for (bool include_x : {false, true}) { + CAPTURE(include_x); + CAPTURE(use_gpu); + if (use_gpu && include_x) { + // include_x cannot be used with use_gpu + REQUIRE_THROWS(from_fasta(tmp_file_name, include_x, true, + n, use_gpu, max_dist)); + } else { + auto d = from_fasta(tmp_file_name, include_x, true, n, + use_gpu, max_dist); + REQUIRE(d.nsamples == 3); + REQUIRE(d.sequence_indices == sequence_indices); + REQUIRE(d[{0, 0}] == 0); + REQUIRE(d[{0, 1}] == (max_dist < 2 ? max_dist : 2)); + REQUIRE(d[{0, 2}] == (max_dist < 1 ? max_dist : 1)); + REQUIRE(d[{1, 0}] == (max_dist < 2 ? max_dist : 2)); + REQUIRE(d[{1, 1}] == 0); + REQUIRE(d[{1, 2}] == (max_dist < 2 ? max_dist : 2)); + REQUIRE(d[{2, 0}] == (max_dist < 1 ? max_dist : 1)); + REQUIRE(d[{2, 1}] == (max_dist < 2 ? max_dist : 2)); + REQUIRE(d[{2, 2}] == 0); + } } } } } + std::remove(tmp_file_name); } @@ -295,18 +298,21 @@ TEST_CASE("from_csv reproduces correct data", "[hamming]") { TEMPLATE_TEST_CASE("distance integer saturates instead of overflowing", "[hamming]", uint8_t, uint16_t) { - auto n_max{static_cast(std::numeric_limits::max())}; + auto n_max{static_cast(std::numeric_limits::max())}; std::mt19937 gen(12345); std::vector data(2); for (auto use_gpu : valid_use_gpu_values()) { - for (auto n : {n_max, n_max + 1, n_max + 99}) { - CAPTURE(use_gpu); - CAPTURE(n); - data[0] = std::string(n, 'A'); - data[1] = std::string(n, 'T'); - DataSet dataSet(data, false, false, {}, use_gpu); - REQUIRE(dataSet[{0, 1}] == n_max); - REQUIRE(dataSet[{1, 0}] == n_max); + for (auto max_dist : {0, 1, 2, 3, 7, 21, 886, 2784904, 99999999}) { + for (auto n : {n_max, n_max + 1, n_max + 99}) { + CAPTURE(use_gpu); + CAPTURE(n); + CAPTURE(max_dist); + data[0] = std::string(n, 'A'); + data[1] = std::string(n, 'T'); + DataSet dataSet(data, false, false, {}, use_gpu, max_dist); + REQUIRE(dataSet[{0, 1}] == std::min(max_dist, n_max)); + REQUIRE(dataSet[{1, 0}] == std::min(max_dist, n_max)); + } } } } @@ -345,18 +351,20 @@ TEST_CASE("from_stringlist GPU and CPU implementations give consistent results", std::mt19937 gen(12345); for (bool include_x : {false}) { for (int n : {1, 5, 13, 32, 89, 185, 497, 1092}) { - for (int n_samples : {2, 3, 4, 7, 11, 32, 33, 257, 689}) { - std::vector stringlist; - stringlist.reserve(n_samples); - for (std::size_t i = 0; i < n_samples; ++i) { - stringlist.push_back(make_test_string(n, gen, include_x)); - } - CAPTURE(include_x); - auto d_cpu{from_stringlist(stringlist, include_x, false)}; - auto d_gpu{from_stringlist(stringlist, include_x, true)}; - for (std::size_t i = 0; i < n_samples; ++i) { - for (std::size_t j = 0; j < n_samples; ++j) { - REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]); + for (int max_dist : {0, 1, 2, 3, 89, 497, 9999999}) { + for (int n_samples : {2, 3, 4, 7, 11, 32, 33, 257, 689}) { + std::vector stringlist; + stringlist.reserve(n_samples); + for (std::size_t i = 0; i < n_samples; ++i) { + stringlist.push_back(make_test_string(n, gen, include_x)); + } + CAPTURE(include_x); + auto d_cpu{from_stringlist(stringlist, include_x, false, max_dist)}; + auto d_gpu{from_stringlist(stringlist, include_x, true, max_dist)}; + for (std::size_t i = 0; i < n_samples; ++i) { + for (std::size_t j = 0; j < n_samples; ++j) { + REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]); + } } } } @@ -377,16 +385,20 @@ TEMPLATE_TEST_CASE( for (bool remove_duplicates : {false, true}) { for (bool include_x : {false}) { for (int n : {17, 88, 381, 1023}) { - for (int n_samples : {2, 3, 4, 7, 11, 127, 128, 255, 256, 257, 703}) { - write_test_fasta(tmp_file_name, n, n_samples, gen, include_x); - CAPTURE(include_x); - auto d_cpu{from_fasta(tmp_file_name, include_x, - remove_duplicates, 0, false)}; - auto d_gpu{from_fasta(tmp_file_name, include_x, - remove_duplicates, 0, true)}; - for (std::size_t i = 0; i < n_samples; ++i) { - for (std::size_t j = 0; j < n_samples; ++j) { - REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]); + for (int max_dist : {0, 1, 2, 3, 89, 497, 9999999}) { + for (int n_samples : {2, 3, 4, 7, 11, 127, 128, 255, 256, 257, 703}) { + write_test_fasta(tmp_file_name, n, n_samples, gen, include_x); + CAPTURE(include_x); + auto d_cpu{from_fasta(tmp_file_name, include_x, + remove_duplicates, 0, false, + max_dist)}; + auto d_gpu{from_fasta(tmp_file_name, include_x, + remove_duplicates, 0, true, + max_dist)}; + for (std::size_t i = 0; i < n_samples; ++i) { + for (std::size_t j = 0; j < n_samples; ++j) { + REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]); + } } } } @@ -409,23 +421,26 @@ TEST_CASE("from_fasta_to_lower_triangular GPU consistent with CPU from_fasta", REQUIRE(std::tmpnam(tmp_lt_file_name) != nullptr); CAPTURE(tmp_lt_file_name); for (bool remove_duplicates : {false, true}) { - for (int n : {17, 88, 381, 1023}) { - for (int n_samples : - {2, 3, 4, 7, 11, 127, 128, 255, 256, 257, 703, 1012}) { - CAPTURE(remove_duplicates); - CAPTURE(n); - CAPTURE(n_samples); - write_test_fasta(tmp_fasta_file_name, n, n_samples, gen, false); - auto d_cpu{from_fasta(tmp_fasta_file_name, false, - remove_duplicates, 0, false)}; - from_fasta_to_lower_triangular(tmp_fasta_file_name, tmp_lt_file_name, - remove_duplicates, 0, true); - auto d_gpu{from_lower_triangular(tmp_lt_file_name)}; - for (std::size_t i = 0; i < n_samples; ++i) { - for (std::size_t j = 0; j < n_samples; ++j) { - CAPTURE(i); - CAPTURE(j); - REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]); + for (int n : {17, 88, 381}) { + for (int max_dist : {0, 1, 2, 3, 89, 9999999}) { + for (int n_samples : {2, 3, 4, 7, 11, 127, 128, 255, 256, 257, 703}) { + CAPTURE(remove_duplicates); + CAPTURE(n); + CAPTURE(max_dist); + CAPTURE(n_samples); + write_test_fasta(tmp_fasta_file_name, n, n_samples, gen, false); + auto d_cpu{from_fasta(tmp_fasta_file_name, false, + remove_duplicates, 0, false, + max_dist)}; + from_fasta_to_lower_triangular(tmp_fasta_file_name, tmp_lt_file_name, + remove_duplicates, 0, true, max_dist); + auto d_gpu{from_lower_triangular(tmp_lt_file_name)}; + for (std::size_t i = 0; i < n_samples; ++i) { + for (std::size_t j = 0; j < n_samples; ++j) { + CAPTURE(i); + CAPTURE(j); + REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]); + } } } }