diff --git a/CMakeLists.txt b/CMakeLists.txt index 181209d..94cbcfc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.11) project( hammingdist - VERSION 0.17.0 + VERSION 0.18.0 LANGUAGES CXX) include(CTest) diff --git a/README.md b/README.md index 037855d..a9f5a34 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,48 @@ data.dump_sequence_indices("indices.txt") data = hammingdist.from_stringlist(["ACGTACGT", "ACGTAGGT", "ATTTACGT"]) ``` +## Duplicates + +When `from_fasta` is called with the option `remove_duplicates=True`, duplicate sequences are removed before constructing the differences matrix. + +For example given this set of three input sequences: + +| Index | Sequence | +| ----- | -------- | +| 0 | ACG | +| 1 | ACG | +| 2 | TAG | + +The distances matrix would be a 2x2 matrix of distances between `ACG` and `TAT`: +| | ACG | TAT | +| --- | --- | --- | +| ACG | 0 | 2 | +| TAT | 2 | 0 | + +The row of the distances matrix corresponding to each index in the original sequence would be: + +| Index | Sequence | Row in distances matrix | +| ----- | -------- | ----------------------- | +| 0 | ACG | 0 | +| 1 | ACG | 0 | +| 2 | TAT | 1 | + +This last column is what is written to disk by `DataSet.dump_sequence_indices`. + +It can also be constructed (as a numpy array) without calculating the distances matrix by using `hammingdist.fasta_sequence_indices` + +```python +import hammingdist + +sequence_indices = hammingdist.fasta_sequence_indices(fasta_file) +``` + +## Large distance values + +By default, the elements in the distances matrix returned by `hammingdist.from_fasta` have a maximum value of 255. + +For distances larger than this `hammingdist.from_fasta_large` supports distances up to 65535 (but uses twice as much RAM) + ## Distances from reference sequence The distance of each sequence in a fasta file from a given reference sequence can be calculated using: diff --git a/distance.cc b/distance.cc index d2b38fb..66bd66e 100644 --- a/distance.cc +++ b/distance.cc @@ -9,7 +9,8 @@ int main(int argc, char *argv[]) { std::size_t nsamples = std::stoi(std::string(argv[2])); auto start = std::chrono::steady_clock::now(); - auto data = hamming::from_fasta(filename, nsamples); + auto data = + hamming::from_fasta(filename, nsamples); data.dump("distances.csv"); auto stop = std::chrono::steady_clock::now(); std::chrono::duration elapsed_seconds = stop - start; diff --git a/src/distance_avx2.hh b/include/hamming/distance_avx2.hh similarity index 85% rename from src/distance_avx2.hh rename to include/hamming/distance_avx2.hh index 1f7c483..8fc6cdc 100644 --- a/src/distance_avx2.hh +++ b/include/hamming/distance_avx2.hh @@ -4,7 +4,7 @@ #include #include -#include "hamming_impl_types.hh" +#include "hamming/hamming_impl_types.hh" namespace hamming { diff --git a/src/distance_avx512.hh b/include/hamming/distance_avx512.hh similarity index 85% rename from src/distance_avx512.hh rename to include/hamming/distance_avx512.hh index 624a191..919fd31 100644 --- a/src/distance_avx512.hh +++ b/include/hamming/distance_avx512.hh @@ -4,7 +4,7 @@ #include #include -#include "hamming_impl_types.hh" +#include "hamming/hamming_impl_types.hh" namespace hamming { diff --git a/src/distance_sse2.hh b/include/hamming/distance_sse2.hh similarity index 85% rename from src/distance_sse2.hh rename to include/hamming/distance_sse2.hh index a235dbe..6920226 100644 --- a/src/distance_sse2.hh +++ b/include/hamming/distance_sse2.hh @@ -4,7 +4,7 @@ #include #include -#include "hamming_impl_types.hh" +#include "hamming/hamming_impl_types.hh" namespace hamming { diff --git a/include/hamming/hamming.hh b/include/hamming/hamming.hh index f83d4ff..61af9b7 100644 --- a/include/hamming/hamming.hh +++ b/include/hamming/hamming.hh @@ -1,17 +1,183 @@ #ifndef _HAMMING_HH #define _HAMMING_HH +#include "hamming/hamming_impl.hh" #include "hamming/hamming_types.hh" +#include +#include +#include #include +#include #include namespace hamming { -DataSet from_stringlist(std::vector &); -DataSet from_csv(const std::string &); -DataSet from_fasta(const std::string &, bool include_x = false, - bool remove_duplicates = false, std::size_t n = 0); -DataSet from_lower_triangular(const std::string &); +inline std::size_t uint_sqrt(std::size_t x) { + return static_cast(std::round(std::sqrt(x))); +} + +template struct DataSet { + explicit DataSet(std::vector &data, bool include_x = false, + bool clear_input_data = false, + std::vector &&indices = {}) + : nsamples(data.size()), sequence_indices(std::move(indices)) { + validate_data(data); + result = distances(data, include_x, clear_input_data); + } + + explicit DataSet(const std::string &filename) { + // Determine correct dataset size + std::ifstream stream(filename); + std::string line; + nsamples = std::count(std::istreambuf_iterator(stream), + std::istreambuf_iterator(), '\n'); + result.resize(nsamples * (nsamples + 1) / 2); + + // Read the data + stream = std::ifstream(filename); + std::size_t i{0}; + std::size_t current{0}; + while (std::getline(stream, line)) { + std::istringstream s(line); + std::string d; + for (std::size_t j = 0; j < current; ++j) { + std::getline(s, d, ','); + result[i++] = std::stoi(d); + } + ++current; + } + } + + explicit DataSet(std::vector &&distances) + : result{std::move(distances)} { + // infer n from number of lower triangular matrix elements = n(n-1)/2 + nsamples = (uint_sqrt(8 * result.size() + 1) + 1) / 2; + } + + void dump(const std::string &filename) { + std::ofstream stream(filename); + for (std::size_t i = 0; i < nsamples; ++i) { + for (std::size_t j = 0; j < nsamples; ++j) { + stream << (*this)[{i, j}]; + if (j != nsamples - 1) + stream << ", "; + } + stream << std::endl; + } + } + + void dump_lower_triangular(const std::string &filename) { + std::ofstream stream(filename); +#ifdef HAMMING_WITH_OPENMP + std::size_t block_size = 200; +#pragma omp parallel for ordered schedule(static, 1) + for (std::size_t i_start = 1; i_start < nsamples; i_start += block_size) { + std::stringstream line; + std::size_t i_end = i_start + block_size; + if (i_end > nsamples) { + i_end = nsamples; + } + for (std::size_t i = i_start; i < i_end; ++i) { + std::size_t offset{i * (i - 1) / 2}; + for (std::size_t j = 0; j + 1 < i; ++j) { + line << static_cast(result[offset + j]) << ","; + } + line << static_cast(result[offset + i - 1]) << "\n"; + } +#pragma omp ordered + stream << line.str(); + } +#else + std::size_t k = 0; + for (std::size_t i = 1; i < nsamples; ++i) { + for (std::size_t j = 0; j + 1 < i; ++j) { + stream << static_cast(result[k++]) << ","; + } + stream << static_cast(result[k++]) << "\n"; + } +#endif + } + + void dump_sequence_indices(const std::string &filename) { + std::ofstream stream(filename); + if (sequence_indices.empty()) { + for (std::size_t i = 0; i < nsamples; ++i) { + stream << i << "\n"; + } + return; + } + for (auto sequence_index : sequence_indices) { + stream << sequence_index << "\n"; + } + } + + int operator[](const std::array &index) const { + auto i = index[0]; + auto j = index[1]; + if (i < j) + return result[j * (j - 1) / 2 + i]; + if (i > j) + return result[i * (i - 1) / 2 + j]; + // This is a diagonal entry + return 0; + } + + std::size_t nsamples; + std::vector result; + std::vector sequence_indices{}; +}; + +DataSet from_stringlist(std::vector &data); + +DataSet from_csv(const std::string &filename); + +DataSet from_lower_triangular(const std::string &filename); + +template +DataSet +from_fasta(const std::string &filename, bool include_x = false, + bool remove_duplicates = false, std::size_t n = 0) { + std::vector data; + data.reserve(n); + if (n == 0) { + n = std::numeric_limits::max(); + data.reserve(65536); + } + std::unordered_map map_seq_to_index; + std::vector sequence_indices{}; + // Initializing the stream + std::ifstream stream(filename); + std::size_t count = 0; + std::size_t count_unique = 0; + std::string line; + // skip first header + std::getline(stream, line); + while (count < n && !stream.eof()) { + std::string seq{}; + while (std::getline(stream, line) && line[0] != '>') { + seq.append(line); + } + if (remove_duplicates) { + auto result = map_seq_to_index.emplace(std::move(seq), count_unique); + if (result.second) { + ++count_unique; + } + sequence_indices.push_back(result.first->second); + } else { + data.push_back(std::move(seq)); + } + ++count; + } + if (remove_duplicates) { + // copy each unique sequence to the vector of strings + data.resize(count_unique); + for (auto &key_value_pair : map_seq_to_index) { + data[key_value_pair.second] = key_value_pair.first; + } + } + return DataSet(data, include_x, true, + std::move(sequence_indices)); +} ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1, bool include_x = false); diff --git a/include/hamming/hamming_impl.hh b/include/hamming/hamming_impl.hh new file mode 100644 index 0000000..e31e498 --- /dev/null +++ b/include/hamming/hamming_impl.hh @@ -0,0 +1,129 @@ +#ifndef _HAMMING_IMPL_HH +#define _HAMMING_IMPL_HH + +#include +#include +#include +#include +#include +#include +#ifdef HAMMING_WITH_OPENMP +#include +#endif +#ifdef HAMMING_WITH_SSE2 +#include "hamming/distance_sse2.hh" +#endif +#ifdef HAMMING_WITH_AVX2 +#include "hamming/distance_avx2.hh" +#endif +#ifdef HAMMING_WITH_AVX512 +#include "hamming/distance_avx512.hh" +#endif + +#include "hamming/hamming_impl_types.hh" +#include "hamming/hamming_types.hh" + +namespace hamming { + +std::array lookupTable(bool include_x = false); + +template DistIntType safe_int_cast(int x) { + if (x > std::numeric_limits::max()) { + return std::numeric_limits::max(); + } + return static_cast(x); +} + +void validate_data(const std::vector &data); + +int distance_sparse(const SparseData &a, const SparseData &b); + +int distance_cpp(const std::vector &a, + const std::vector &b); + +std::vector to_sparse_data(const std::vector &data, + bool include_x); + +std::vector> +to_dense_data(const std::vector &data); + +std::vector from_string(const std::string &str); + +template +std::vector distances(std::vector &data, + bool include_x, bool clear_input_data) { + std::vector result((data.size() - 1) * data.size() / 2, 0); + auto sparse = to_sparse_data(data, include_x); + std::size_t nsamples{data.size()}; + std::size_t sample_length{data[0].size()}; + + // if X is included, we have to use the sparse distance function + bool use_sparse = include_x; + // otherwise, use heuristic to choose distance function: if < 0.5% of values + // differ from reference genome, use sparse distance function + if (!include_x) { + constexpr double sparse_threshold{0.005}; + std::size_t n_diff{0}; + for (const auto &s : sparse) { + n_diff += s.size() / 2; + } + double frac_diff{static_cast(n_diff) / + static_cast(nsamples * sample_length)}; + use_sparse = frac_diff < sparse_threshold; + } + if (use_sparse) { + if (clear_input_data) { + data.clear(); + } +#ifdef HAMMING_WITH_OPENMP +#pragma omp parallel for +#endif + for (std::size_t i = 0; i < nsamples; ++i) { + std::size_t offset{i * (i - 1) / 2}; + for (std::size_t j = 0; j < i; ++j) { + result[offset + j] = + safe_int_cast(distance_sparse(sparse[i], sparse[j])); + } + } + return result; + } + + // otherwise use fastest supported dense distance function + auto dense = to_dense_data(data); + if (clear_input_data) { + data.clear(); + } + const auto features = cpu_features::GetX86Info().features; + int (*distance_func)(const std::vector &a, + const std::vector &b) = distance_cpp; +#ifdef HAMMING_WITH_SSE2 + if (features.sse2) { + distance_func = distance_sse2; + } +#endif +#ifdef HAMMING_WITH_AVX2 + if (features.avx2) { + distance_func = distance_avx2; + } +#endif +#ifdef HAMMING_WITH_AVX512 + if (features.avx512bw) { + distance_func = distance_avx512; + } +#endif + +#ifdef HAMMING_WITH_OPENMP +#pragma omp parallel for schedule(static, 1) +#endif + for (std::size_t i = 0; i < nsamples; ++i) { + std::size_t offset{i * (i - 1) / 2}; + for (std::size_t j = 0; j < i; ++j) + result[offset + j] = + safe_int_cast(distance_func(dense[i], dense[j])); + } + return result; +} + +} // namespace hamming + +#endif diff --git a/src/hamming_impl_types.hh b/include/hamming/hamming_impl_types.hh similarity index 100% rename from src/hamming_impl_types.hh rename to include/hamming/hamming_impl_types.hh diff --git a/include/hamming/hamming_types.hh b/include/hamming/hamming_types.hh index cb68f7e..8fdf069 100644 --- a/include/hamming/hamming_types.hh +++ b/include/hamming/hamming_types.hh @@ -1,31 +1,18 @@ #ifndef _HAMMING_TYPES_HH #define _HAMMING_TYPES_HH +#include #include +#include #include +#include #include #include namespace hamming { -using DistIntType = uint8_t; using ReferenceDistIntType = uint32_t; - -struct DataSet { - DataSet(std::vector &, bool include_x = false, - bool clear_input_data = false, - std::vector &&indices = {}); - DataSet(const std::string &); - DataSet(std::vector &&distances); - void dump(const std::string &); - void dump_lower_triangular(const std::string &); - void dump_sequence_indices(const std::string &); - int operator[](const std::array &) const; - - std::size_t nsamples; - std::vector result; - std::vector sequence_indices{}; -}; +using DefaultDistIntType = uint8_t; } // namespace hamming diff --git a/python/hammingdist.cc b/python/hammingdist.cc index c49276c..6c82f19 100644 --- a/python/hammingdist.cc +++ b/python/hammingdist.cc @@ -28,26 +28,49 @@ namespace hamming { PYBIND11_MODULE(hammingdist, m) { m.doc() = "Small tool to calculate Hamming distances between gene sequences"; - py::class_(m, "DataSet") - .def("dump", &DataSet::dump, "Dump distances matrix in csv format") - .def("dump_lower_triangular", &DataSet::dump_lower_triangular, + py::class_>(m, "DataSet") + .def("dump", &DataSet::dump, + "Dump distances matrix in csv format") + .def("dump_lower_triangular", + &DataSet::dump_lower_triangular, "Dump distances matrix in lower triangular format (comma-delimited, " "row-major)") - .def("dump_sequence_indices", &DataSet::dump_sequence_indices, + .def("dump_sequence_indices", + &DataSet::dump_sequence_indices, "Dump row index in distances matrix for each input sequence") - .def("__getitem__", &DataSet::operator[]) - .def_readonly("_distances", &DataSet::result); + .def("__getitem__", &DataSet::operator[]) + .def_readonly("_distances", &DataSet::result); + + py::class_>(m, "DataSetLarge") + .def("dump", &DataSet::dump, + "Dump distances matrix in csv format") + .def("dump_lower_triangular", &DataSet::dump_lower_triangular, + "Dump distances matrix in lower triangular format (comma-delimited, " + "row-major)") + .def("dump_sequence_indices", &DataSet::dump_sequence_indices, + "Dump row index in distances matrix for each input sequence") + .def("__getitem__", &DataSet::operator[]) + .def_readonly("_distances", &DataSet::result); m.def("from_stringlist", &from_stringlist, "Creates a dataset from a list of strings"); m.def("from_csv", &from_csv, "Creates a dataset by reading already computed distances from csv " "(full matrix expected)"); - m.def("from_fasta", &from_fasta, py::arg("filename"), + m.def("from_fasta", &from_fasta, py::arg("filename"), + py::arg("include_x") = false, py::arg("remove_duplicates") = false, + py::arg("n") = 0, + "Creates a dataset by reading from a fasta file (assuming all " + "sequences have equal length). Maximum value of an element in the " + "distances matrix: 255. Distances that would have been larger than " + "this value saturate at 255 - to support genomes with larger distances " + "than this see `from_fasta_large` instead."); + m.def("from_fasta_large", &from_fasta, py::arg("filename"), py::arg("include_x") = false, py::arg("remove_duplicates") = false, py::arg("n") = 0, "Creates a dataset by reading from a fasta file (assuming all " - "sequences have equal length)"); + "sequences have equal length). Maximum value of an element in the " + "distances matrix: 65535"); m.def("from_lower_triangular", &from_lower_triangular, "Creates a dataset by reading already computed distances from lower " "triangular format"); diff --git a/python/tests/test_hammingdist.py b/python/tests/test_hammingdist.py index 932fc32..2f3dfd9 100644 --- a/python/tests/test_hammingdist.py +++ b/python/tests/test_hammingdist.py @@ -32,7 +32,10 @@ def check_output_sizes(dat, n_in, n_out, tmp_out_file, fasta_sequence_indices=No assert np.allclose(indices, fasta_sequence_indices) -def test_from_fasta(tmp_path): +@pytest.mark.parametrize( + "from_fasta_func", [hammingdist.from_fasta, hammingdist.from_fasta_large] +) +def test_from_fasta(from_fasta_func, tmp_path): sequences = [ "ACGTGTCGTGTCGACGTGTCG", "ACGTGTCGTTTCGACGAGTCG", diff --git a/setup.py b/setup.py index 97d823b..d9f4765 100644 --- a/setup.py +++ b/setup.py @@ -93,7 +93,7 @@ def build_extension(self, ext): setup( name="hammingdist", - version="0.17.0", + version="0.18.0", author="Dominic Kempf, Liam Keegan", author_email="ssc@iwr.uni-heidelberg.de", description="A fast tool to calculate Hamming distances", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d18b82d..b7d4a21 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ # Build hamming library add_library(hamming STATIC hamming.cc hamming_impl.cc) target_include_directories(hamming PUBLIC ../include) -target_link_libraries(hamming PRIVATE CpuFeature::cpu_features) +target_link_libraries(hamming PUBLIC CpuFeature::cpu_features) if(HAMMING_WITH_OPENMP) find_package(OpenMP REQUIRED) target_compile_definitions(hamming PUBLIC HAMMING_WITH_OPENMP) @@ -14,7 +14,7 @@ if(HAMMING_WITH_SSE2) target_compile_definitions(hamming PUBLIC HAMMING_WITH_SSE2) add_library(distance_sse2 STATIC distance_sse2.cc) target_compile_options(distance_sse2 PRIVATE -msse2) - target_include_directories(distance_sse2 PUBLIC .) + target_include_directories(distance_sse2 PUBLIC ../include) target_link_libraries(hamming PRIVATE distance_sse2) endif() @@ -22,7 +22,7 @@ if(HAMMING_WITH_AVX2) target_compile_definitions(hamming PUBLIC HAMMING_WITH_AVX2) add_library(distance_avx2 STATIC distance_avx2.cc) target_compile_options(distance_avx2 PRIVATE -mavx2) - target_include_directories(distance_avx2 PUBLIC .) + target_include_directories(distance_avx2 PUBLIC ../include) target_link_libraries(hamming PRIVATE distance_avx2) endif() @@ -30,7 +30,7 @@ if(HAMMING_WITH_AVX512) target_compile_definitions(hamming PUBLIC HAMMING_WITH_AVX512) add_library(distance_avx512 STATIC distance_avx512.cc) target_compile_options(distance_avx512 PRIVATE -mavx512bw) - target_include_directories(distance_avx512 PUBLIC .) + target_include_directories(distance_avx512 PUBLIC ../include) target_link_libraries(hamming PRIVATE distance_avx512) endif() diff --git a/src/distance_avx2.cc b/src/distance_avx2.cc index b8cfbc5..7bf74b2 100644 --- a/src/distance_avx2.cc +++ b/src/distance_avx2.cc @@ -1,5 +1,4 @@ -#include "distance_avx2.hh" -#include "hamming_impl_types.hh" +#include "hamming/distance_avx2.hh" #include namespace hamming { diff --git a/src/distance_avx2_bench.cc b/src/distance_avx2_bench.cc index cdd41d4..d059d2f 100644 --- a/src/distance_avx2_bench.cc +++ b/src/distance_avx2_bench.cc @@ -1,7 +1,7 @@ #include "bench.hh" -#include "distance_avx2.hh" +#include "hamming/distance_avx2.hh" #include "hamming/hamming.hh" -#include "hamming_impl.hh" +#include "hamming/hamming_impl.hh" #ifdef HAMMING_WITH_OPENMP #include #endif diff --git a/src/distance_avx2_t.cc b/src/distance_avx2_t.cc index 8083d9f..68d46d3 100644 --- a/src/distance_avx2_t.cc +++ b/src/distance_avx2_t.cc @@ -1,4 +1,4 @@ -#include "distance_avx2.hh" +#include "hamming/distance_avx2.hh" #include "tests.hh" using namespace hamming; diff --git a/src/distance_avx512.cc b/src/distance_avx512.cc index e3e65d5..0694621 100644 --- a/src/distance_avx512.cc +++ b/src/distance_avx512.cc @@ -1,4 +1,4 @@ -#include "distance_avx512.hh" +#include "hamming/distance_avx512.hh" #include namespace hamming { diff --git a/src/distance_avx512_bench.cc b/src/distance_avx512_bench.cc index 8989970..4fa5344 100644 --- a/src/distance_avx512_bench.cc +++ b/src/distance_avx512_bench.cc @@ -1,7 +1,7 @@ #include "bench.hh" -#include "distance_avx512.hh" +#include "hamming/distance_avx512.hh" #include "hamming/hamming.hh" -#include "hamming_impl.hh" +#include "hamming/hamming_impl.hh" #ifdef HAMMING_WITH_OPENMP #include #endif diff --git a/src/distance_avx512_t.cc b/src/distance_avx512_t.cc index 38736ca..c067453 100644 --- a/src/distance_avx512_t.cc +++ b/src/distance_avx512_t.cc @@ -1,4 +1,4 @@ -#include "distance_avx512.hh" +#include "hamming/distance_avx512.hh" #include "tests.hh" using namespace hamming; diff --git a/src/distance_sse2.cc b/src/distance_sse2.cc index 6350d42..abb8701 100644 --- a/src/distance_sse2.cc +++ b/src/distance_sse2.cc @@ -1,4 +1,4 @@ -#include "distance_sse2.hh" +#include "hamming/distance_sse2.hh" #include namespace hamming { diff --git a/src/distance_sse2_bench.cc b/src/distance_sse2_bench.cc index b686b71..ddc4ab9 100644 --- a/src/distance_sse2_bench.cc +++ b/src/distance_sse2_bench.cc @@ -1,7 +1,7 @@ #include "bench.hh" -#include "distance_sse2.hh" +#include "hamming/distance_sse2.hh" #include "hamming/hamming.hh" -#include "hamming_impl.hh" +#include "hamming/hamming_impl.hh" #ifdef HAMMING_WITH_OPENMP #include #endif diff --git a/src/distance_sse2_t.cc b/src/distance_sse2_t.cc index 3bce35b..3de54f0 100644 --- a/src/distance_sse2_t.cc +++ b/src/distance_sse2_t.cc @@ -1,4 +1,4 @@ -#include "distance_sse2.hh" +#include "hamming/distance_sse2.hh" #include "tests.hh" using namespace hamming; diff --git a/src/hamming.cc b/src/hamming.cc index f1933ea..ff3c6aa 100644 --- a/src/hamming.cc +++ b/src/hamming.cc @@ -1,5 +1,5 @@ #include "hamming/hamming.hh" -#include "hamming_impl.hh" +#include "hamming/hamming_impl.hh" #include #include @@ -14,122 +14,16 @@ namespace hamming { -DataSet::DataSet(std::vector &data, bool include_x, - bool clear_input_data, std::vector &&indices) - : nsamples(data.size()), sequence_indices(std::move(indices)) { - validate_data(data); - result = distances(data, include_x, clear_input_data); +DataSet from_stringlist(std::vector &data) { + return DataSet(data); } -DataSet::DataSet(const std::string &filename) { - // Determine correct dataset size - std::ifstream stream(filename); - std::string line; - nsamples = std::count(std::istreambuf_iterator(stream), - std::istreambuf_iterator(), '\n'); - result.resize(nsamples * (nsamples + 1) / 2); - - // Read the data - stream = std::ifstream(filename); - std::size_t i{0}; - std::size_t current{0}; - while (std::getline(stream, line)) { - std::istringstream s(line); - std::string d; - for (std::size_t j = 0; j < current; ++j) { - std::getline(s, d, ','); - result[i++] = std::stoi(d); - } - ++current; - } -} - -static std::size_t uint_sqrt(std::size_t x) { - return static_cast(std::round(std::sqrt(x))); -} - -DataSet::DataSet(std::vector &&distances) - : result{std::move(distances)} { - // infer n from number of lower triangular matrix elements = n(n-1)/2 - nsamples = (uint_sqrt(8 * result.size() + 1) + 1) / 2; -} - -void DataSet::dump(const std::string &filename) { - std::ofstream stream(filename); - for (std::size_t i = 0; i < nsamples; ++i) { - for (std::size_t j = 0; j < nsamples; ++j) { - stream << (*this)[{i, j}]; - if (j != nsamples - 1) - stream << ", "; - } - stream << std::endl; - } +DataSet from_csv(const std::string &filename) { + return DataSet(filename); } -void DataSet::dump_lower_triangular(const std::string &filename) { - std::ofstream stream(filename); -#ifdef HAMMING_WITH_OPENMP - std::size_t block_size = 200; -#pragma omp parallel for ordered schedule(static, 1) - for (std::size_t i_start = 1; i_start < nsamples; i_start += block_size) { - std::stringstream line; - std::size_t i_end = i_start + block_size; - if (i_end > nsamples) { - i_end = nsamples; - } - for (std::size_t i = i_start; i < i_end; ++i) { - std::size_t offset{i * (i - 1) / 2}; - for (std::size_t j = 0; j + 1 < i; ++j) { - line << static_cast(result[offset + j]) << ","; - } - line << static_cast(result[offset + i - 1]) << "\n"; - } -#pragma omp ordered - stream << line.str(); - } -#else - std::size_t k = 0; - for (std::size_t i = 1; i < nsamples; ++i) { - for (std::size_t j = 0; j + 1 < i; ++j) { - stream << static_cast(result[k++]) << ","; - } - stream << static_cast(result[k++]) << "\n"; - } -#endif -} - -void DataSet::dump_sequence_indices(const std::string &filename) { - std::ofstream stream(filename); - if (sequence_indices.empty()) { - for (std::size_t i = 0; i < nsamples; ++i) { - stream << i << "\n"; - } - return; - } - for (auto sequence_index : sequence_indices) { - stream << sequence_index << "\n"; - } -} - -int DataSet::operator[](const std::array &index) const { - auto i = index[0]; - auto j = index[1]; - if (i < j) - return result[j * (j - 1) / 2 + i]; - if (i > j) - return result[i * (i - 1) / 2 + j]; - // This is a diagonal entry - return 0; -} - -DataSet from_stringlist(std::vector &data) { - return DataSet(data); -} - -DataSet from_csv(const std::string &filename) { return DataSet(filename); } - -DataSet from_lower_triangular(const std::string &filename) { - std::vector distances; +DataSet from_lower_triangular(const std::string &filename) { + std::vector distances; std::ifstream stream(filename); std::string line; while (std::getline(stream, line)) { @@ -137,53 +31,32 @@ DataSet from_lower_triangular(const std::string &filename) { std::string d; while (s.good()) { std::getline(s, d, ','); - distances.push_back(safe_int_cast(std::stoi(d))); + distances.push_back(safe_int_cast(std::stoi(d))); } } return DataSet(std::move(distances)); } -DataSet from_fasta(const std::string &filename, bool include_x, - bool remove_duplicates, std::size_t n) { - std::vector data; - data.reserve(n); - if (n == 0) { - n = std::numeric_limits::max(); - data.reserve(65536); - } - std::unordered_map map_seq_to_index; - std::vector sequence_indices{}; - // Initializing the stream - std::ifstream stream(filename); - std::size_t count = 0; - std::size_t count_unique = 0; - std::string line; - // skip first header - std::getline(stream, line); - while (count < n && !stream.eof()) { - std::string seq{}; - while (std::getline(stream, line) && line[0] != '>') { - seq.append(line); - } - if (remove_duplicates) { - auto result = map_seq_to_index.emplace(std::move(seq), count_unique); - if (result.second) { - ++count_unique; - } - sequence_indices.push_back(result.first->second); - } else { - data.push_back(std::move(seq)); - } - ++count; +ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1, + bool include_x) { + auto lookup{lookupTable(include_x)}; + ReferenceDistIntType distance{0}; + if (seq0.size() != seq1.size()) { + throw std::runtime_error( + "Error: Sequences do not all have the same length"); } - if (remove_duplicates) { - // copy each unique sequence to the vector of strings - data.resize(count_unique); - for (auto &key_value_pair : map_seq_to_index) { - data[key_value_pair.second] = key_value_pair.first; + for (std::size_t i = 0; i < seq0.size(); ++i) { + auto a{lookup[seq0[i]]}; + auto b{lookup[seq1[i]]}; + bool invalid{(a & b) == 0x00}; + bool differ{(seq0[i] != seq1[i])}; + if (invalid || differ) { + bool nodash{(a != 0xff) && (b != 0xff)}; + distance += + static_cast(invalid || (differ && nodash)); } } - return DataSet(data, include_x, true, std::move(sequence_indices)); + return distance; } std::vector fasta_sequence_indices(const std::string &fasta_file, @@ -253,26 +126,4 @@ fasta_reference_distances(const std::string &reference_sequence, return distances; } -ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1, - bool include_x) { - auto lookup{lookupTable(include_x)}; - ReferenceDistIntType distance{0}; - if (seq0.size() != seq1.size()) { - throw std::runtime_error( - "Error: Sequences do not all have the same length"); - } - for (std::size_t i = 0; i < seq0.size(); ++i) { - auto a{lookup[seq0[i]]}; - auto b{lookup[seq1[i]]}; - bool invalid{(a & b) == 0x00}; - bool differ{(seq0[i] != seq1[i])}; - if (invalid || differ) { - bool nodash{(a != 0xff) && (b != 0xff)}; - distance += - static_cast(invalid || (differ && nodash)); - } - } - return distance; -} - } // namespace hamming diff --git a/src/hamming_bench.cc b/src/hamming_bench.cc index 53f0b5d..2f4ff28 100644 --- a/src/hamming_bench.cc +++ b/src/hamming_bench.cc @@ -1,6 +1,6 @@ #include "bench.hh" #include "hamming/hamming.hh" -#include "hamming_impl.hh" +#include "hamming/hamming_impl.hh" #ifdef HAMMING_WITH_OPENMP #include #endif diff --git a/src/hamming_impl.cc b/src/hamming_impl.cc index 1b95c46..2ba768f 100644 --- a/src/hamming_impl.cc +++ b/src/hamming_impl.cc @@ -1,21 +1,6 @@ -#include "hamming_impl.hh" -#ifdef HAMMING_WITH_SSE2 -#include "distance_sse2.hh" -#endif -#ifdef HAMMING_WITH_AVX2 -#include "distance_avx2.hh" -#endif -#ifdef HAMMING_WITH_AVX512 -#include "distance_avx512.hh" -#endif +#include "hamming/hamming_impl.hh" #include -#include -#include #include -#ifdef HAMMING_WITH_OPENMP -#include -#endif -#include namespace hamming { // bit meaning: @@ -41,84 +26,17 @@ std::array lookupTable(bool include_x) { return lookup; } -DistIntType safe_int_cast(int x) { - if (x > std::numeric_limits::max()) { - return std::numeric_limits::max(); - } - return static_cast(x); -} - -std::vector distances(std::vector &data, - bool include_x, bool clear_input_data) { - std::vector result((data.size() - 1) * data.size() / 2, 0); - auto sparse = to_sparse_data(data, include_x); - std::size_t nsamples{data.size()}; - std::size_t sample_length{data[0].size()}; - - // if X is included, we have to use the sparse distance function - bool use_sparse = include_x; - // otherwise, use heuristic to choose distance function: if < 0.5% of values - // differ from reference genome, use sparse distance function - if (!include_x) { - constexpr double sparse_threshold{0.005}; - std::size_t n_diff{0}; - for (const auto &s : sparse) { - n_diff += s.size() / 2; - } - double frac_diff{static_cast(n_diff) / - static_cast(nsamples * sample_length)}; - use_sparse = frac_diff < sparse_threshold; +void validate_data(const std::vector &data) { + if (data.empty() || data[0].empty()) { + throw std::runtime_error("Error: Empty sequence"); } - if (use_sparse) { - if (clear_input_data) { - data.clear(); - } -#ifdef HAMMING_WITH_OPENMP -#pragma omp parallel for -#endif - for (std::size_t i = 0; i < nsamples; ++i) { - std::size_t offset{i * (i - 1) / 2}; - for (std::size_t j = 0; j < i; ++j) { - result[offset + j] = - safe_int_cast(distance_sparse(sparse[i], sparse[j])); - } + auto length{data[0].size()}; + for (const auto &d : data) { + if (d.size() != length) { + throw std::runtime_error( + "Error: Sequences do not all have the same length"); } - return result; - } - - // otherwise use fastest supported dense distance function - auto dense = to_dense_data(data); - if (clear_input_data) { - data.clear(); - } - const auto features = cpu_features::GetX86Info().features; - int (*distance_func)(const std::vector &a, - const std::vector &b) = distance_cpp; -#ifdef HAMMING_WITH_SSE2 - if (features.sse2) { - distance_func = distance_sse2; } -#endif -#ifdef HAMMING_WITH_AVX2 - if (features.avx2) { - distance_func = distance_avx2; - } -#endif -#ifdef HAMMING_WITH_AVX512 - if (features.avx512bw) { - distance_func = distance_avx512; - } -#endif - -#ifdef HAMMING_WITH_OPENMP -#pragma omp parallel for schedule(static, 1) -#endif - for (std::size_t i = 0; i < nsamples; ++i) { - std::size_t offset{i * (i - 1) / 2}; - for (std::size_t j = 0; j < i; ++j) - result[offset + j] = safe_int_cast(distance_func(dense[i], dense[j])); - } - return result; } int distance_sparse(const SparseData &a, const SparseData &b) { @@ -167,19 +85,6 @@ int distance_cpp(const std::vector &a, return r; } -void validate_data(const std::vector &data) { - if (data.empty() || data[0].empty()) { - throw std::runtime_error("Error: Empty sequence"); - } - auto length{data[0].size()}; - for (const auto &d : data) { - if (d.size() != length) { - throw std::runtime_error( - "Error: Sequences do not all have the same length"); - } - } -} - static std::string get_reference_expression(const std::vector &data, bool include_x) { std::string g0; diff --git a/src/hamming_impl.hh b/src/hamming_impl.hh deleted file mode 100644 index f1eb04f..0000000 --- a/src/hamming_impl.hh +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _HAMMING_IMPL_HH -#define _HAMMING_IMPL_HH - -#include -#include -#include -#include - -#include "hamming/hamming_types.hh" -#include "hamming_impl_types.hh" - -namespace hamming { - -std::array lookupTable(bool include_x = false); - -DistIntType safe_int_cast(int x); - -std::vector distances(std::vector &data, - bool include_x, bool clear_input_data); - -int distance_sparse(const SparseData &a, const SparseData &b); - -int distance_cpp(const std::vector &a, - const std::vector &b); - -void validate_data(const std::vector &data); - -std::vector to_sparse_data(const std::vector &data, - bool include_x); - -std::vector> -to_dense_data(const std::vector &data); - -std::vector from_string(const std::string &str); - -} // namespace hamming - -#endif diff --git a/src/hamming_impl_bench.cc b/src/hamming_impl_bench.cc index 2da7a84..7d23034 100644 --- a/src/hamming_impl_bench.cc +++ b/src/hamming_impl_bench.cc @@ -1,6 +1,6 @@ #include "bench.hh" #include "hamming/hamming.hh" -#include "hamming_impl.hh" +#include "hamming/hamming_impl.hh" #ifdef HAMMING_WITH_OPENMP #include #endif diff --git a/src/hamming_impl_t.cc b/src/hamming_impl_t.cc index 49eb1ae..6a0cadc 100644 --- a/src/hamming_impl_t.cc +++ b/src/hamming_impl_t.cc @@ -1,4 +1,4 @@ -#include "hamming_impl.hh" +#include "hamming/hamming_impl.hh" #include "tests.hh" using namespace hamming; diff --git a/src/hamming_t.cc b/src/hamming_t.cc index 5246a91..a88d6d9 100644 --- a/src/hamming_t.cc +++ b/src/hamming_t.cc @@ -110,9 +110,13 @@ TEST_CASE("two expressions with distance 2", "[hamming]") { for (auto &v : expr) { auto d = from_stringlist(v); REQUIRE(d[{0, 0}] == 0); + REQUIRE(distance(v[0], v[0]) == 0); REQUIRE(d[{0, 1}] == 2); + REQUIRE(distance(v[0], v[1]) == 2); REQUIRE(d[{1, 0}] == 2); + REQUIRE(distance(v[1], v[0]) == 2); REQUIRE(d[{1, 1}] == 0); + REQUIRE(distance(v[1], v[1]) == 0); } } @@ -131,7 +135,8 @@ TEST_CASE("from_fasta single line sequences", "[hamming]") { CAPTURE(include_x); CAPTURE(remove_duplicates); for (int n : {0, 2, 3, 8}) { - auto d = from_fasta(tmp_file_name, include_x, remove_duplicates, n); + auto d = + from_fasta(tmp_file_name, include_x, remove_duplicates, n); REQUIRE(d[{0, 0}] == 0); REQUIRE(d[{0, 1}] == 2); REQUIRE(d[{1, 0}] == 2); @@ -164,7 +169,7 @@ TEST_CASE("from_fasta single line sequences with duplicates", "[hamming]") { for (int n : {0, 6, 22}) { for (bool include_x : {false, true}) { CAPTURE(include_x); - auto d = from_fasta(tmp_file_name, include_x, true, n); + auto d = from_fasta(tmp_file_name, include_x, true, n); REQUIRE(d.nsamples == 3); REQUIRE(d.sequence_indices == sequence_indices); REQUIRE(d[{0, 0}] == 0); @@ -199,7 +204,8 @@ TEST_CASE("from_fasta multi-line sequences", "[hamming]") { for (bool include_x : {false, true}) { CAPTURE(include_x); for (int n : {0, 2, 3, 8}) { - auto d = from_fasta(tmp_file_name, include_x, remove_duplicates, 2); + auto d = + from_fasta(tmp_file_name, include_x, remove_duplicates, 2); REQUIRE(d[{0, 0}] == 0); REQUIRE(d[{0, 1}] == 2); REQUIRE(d[{1, 0}] == 2); @@ -240,7 +246,7 @@ TEST_CASE("from_csv reproduces correct data", "[hamming]") { for (auto &d : data) d = make_test_string(201, gen); - DataSet ref(data); + DataSet ref(data); char tmp_file_name[L_tmpnam]; REQUIRE(std::tmpnam(tmp_file_name) != nullptr); ref.dump(std::string(tmp_file_name)); @@ -255,15 +261,16 @@ TEST_CASE("from_csv reproduces correct data", "[hamming]") { std::remove(tmp_file_name); } -TEST_CASE("distance integer saturates instead of overflowing", "[hamming]") { - auto n_max{static_cast(std::numeric_limits::max())}; +TEMPLATE_TEST_CASE("distance integer saturates instead of overflowing", + "[hamming]", uint8_t, uint16_t) { + auto n_max{static_cast(std::numeric_limits::max())}; std::mt19937 gen(12345); std::vector data(2); for (auto n : {n_max, n_max + 1, n_max + 99}) { CAPTURE(n); data[0] = std::string(n, 'A'); data[1] = std::string(n, 'T'); - DataSet dataSet(data); + DataSet dataSet(data); REQUIRE(dataSet[{0, 1}] == n_max); REQUIRE(dataSet[{1, 0}] == n_max); } @@ -279,7 +286,7 @@ TEST_CASE("from_lower_triangular reproduces correct data", "[hamming]") { for (auto &d : data) d = make_test_string(24, gen); - DataSet ref(data); + DataSet ref(data); REQUIRE(ref.nsamples == n); ref.dump_lower_triangular(std::string(tmp_file_name)); diff --git a/src/tests.hh b/src/tests.hh index b515d9b..9999496 100644 --- a/src/tests.hh +++ b/src/tests.hh @@ -2,7 +2,7 @@ #define HAMMING_TESTS_HH #include "hamming/hamming.hh" -#include "hamming_impl.hh" +#include "hamming/hamming_impl.hh" #include #include #include