Merge pull request #75 from ssciwr/fix73_uint8_uint16_choice

allow runtime choice of uint8 or uint16 for distances matrix element type
ssciwr · Sep 5, 2022 · 13ef3ca · 13ef3ca
2 parents 0ea8ac9 + de2e3f5
commit 13ef3ca
Show file tree

Hide file tree

Showing 31 changed files with 457 additions and 382 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.11)
 
 project(
   hammingdist
-  VERSION 0.17.0
+  VERSION 0.18.0
   LANGUAGES CXX)
 
 include(CTest)

diff --git a/README.md b/README.md
@@ -53,6 +53,48 @@ data.dump_sequence_indices("indices.txt")
 data = hammingdist.from_stringlist(["ACGTACGT", "ACGTAGGT", "ATTTACGT"])
 ```
 
+## Duplicates
+
+When `from_fasta` is called with the option `remove_duplicates=True`, duplicate sequences are removed before constructing the differences matrix.
+
+For example given this set of three input sequences:
+
+| Index | Sequence |
+| ----- | -------- |
+| 0     | ACG      |
+| 1     | ACG      |
+| 2     | TAG      |
+
+The distances matrix would be a 2x2 matrix of distances between `ACG` and `TAT`:
+| | ACG | TAT |
+| --- | --- | --- |
+| ACG | 0 | 2 |
+| TAT | 2 | 0 |
+
+The row of the distances matrix corresponding to each index in the original sequence would be:
+
+| Index | Sequence | Row in distances matrix |
+| ----- | -------- | ----------------------- |
+| 0     | ACG      | 0                       |
+| 1     | ACG      | 0                       |
+| 2     | TAT      | 1                       |
+
+This last column is what is written to disk by `DataSet.dump_sequence_indices`.
+
+It can also be constructed (as a numpy array) without calculating the distances matrix by using `hammingdist.fasta_sequence_indices`
+
+```python
+import hammingdist
+
+sequence_indices = hammingdist.fasta_sequence_indices(fasta_file)
+```
+
+## Large distance values
+
+By default, the elements in the distances matrix returned by `hammingdist.from_fasta` have a maximum value of 255.
+
+For distances larger than this `hammingdist.from_fasta_large` supports distances up to 65535 (but uses twice as much RAM)
+
 ## Distances from reference sequence
 
 The distance of each sequence in a fasta file from a given reference sequence can be calculated using:

diff --git a/distance.cc b/distance.cc
@@ -9,7 +9,8 @@ int main(int argc, char *argv[]) {
   std::size_t nsamples = std::stoi(std::string(argv[2]));
 
   auto start = std::chrono::steady_clock::now();
-  auto data = hamming::from_fasta(filename, nsamples);
+  auto data =
+      hamming::from_fasta<hamming::DefaultDistIntType>(filename, nsamples);
   data.dump("distances.csv");
   auto stop = std::chrono::steady_clock::now();
   std::chrono::duration<double> elapsed_seconds = stop - start;

diff --git a/src/distance_avx2.hh → include/hamming/distance_avx2.hh b/src/distance_avx2.hh → include/hamming/distance_avx2.hh
@@ -4,7 +4,7 @@
 #include <cstdint>
 #include <vector>
 
-#include "hamming_impl_types.hh"
+#include "hamming/hamming_impl_types.hh"
 
 namespace hamming {
 

diff --git a/src/distance_avx512.hh → include/hamming/distance_avx512.hh b/src/distance_avx512.hh → include/hamming/distance_avx512.hh
@@ -4,7 +4,7 @@
 #include <cstdint>
 #include <vector>
 
-#include "hamming_impl_types.hh"
+#include "hamming/hamming_impl_types.hh"
 
 namespace hamming {
 

diff --git a/src/distance_sse2.hh → include/hamming/distance_sse2.hh b/src/distance_sse2.hh → include/hamming/distance_sse2.hh
@@ -4,7 +4,7 @@
 #include <cstdint>
 #include <vector>
 
-#include "hamming_impl_types.hh"
+#include "hamming/hamming_impl_types.hh"
 
 namespace hamming {
 

diff --git a/include/hamming/hamming.hh b/include/hamming/hamming.hh
@@ -1,17 +1,183 @@
 #ifndef _HAMMING_HH
 #define _HAMMING_HH
 
+#include "hamming/hamming_impl.hh"
 #include "hamming/hamming_types.hh"
+#include <cmath>
+#include <fstream>
+#include <sstream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace hamming {
 
-DataSet from_stringlist(std::vector<std::string> &);
-DataSet from_csv(const std::string &);
-DataSet from_fasta(const std::string &, bool include_x = false,
-                   bool remove_duplicates = false, std::size_t n = 0);
-DataSet from_lower_triangular(const std::string &);
+inline std::size_t uint_sqrt(std::size_t x) {
+  return static_cast<std::size_t>(std::round(std::sqrt(x)));
+}
+
+template <typename DistIntType> struct DataSet {
+  explicit DataSet(std::vector<std::string> &data, bool include_x = false,
+                   bool clear_input_data = false,
+                   std::vector<std::size_t> &&indices = {})
+      : nsamples(data.size()), sequence_indices(std::move(indices)) {
+    validate_data(data);
+    result = distances<DistIntType>(data, include_x, clear_input_data);
+  }
+
+  explicit DataSet(const std::string &filename) {
+    // Determine correct dataset size
+    std::ifstream stream(filename);
+    std::string line;
+    nsamples = std::count(std::istreambuf_iterator<char>(stream),
+                          std::istreambuf_iterator<char>(), '\n');
+    result.resize(nsamples * (nsamples + 1) / 2);
+
+    // Read the data
+    stream = std::ifstream(filename);
+    std::size_t i{0};
+    std::size_t current{0};
+    while (std::getline(stream, line)) {
+      std::istringstream s(line);
+      std::string d;
+      for (std::size_t j = 0; j < current; ++j) {
+        std::getline(s, d, ',');
+        result[i++] = std::stoi(d);
+      }
+      ++current;
+    }
+  }
+
+  explicit DataSet(std::vector<DistIntType> &&distances)
+      : result{std::move(distances)} {
+    // infer n from number of lower triangular matrix elements = n(n-1)/2
+    nsamples = (uint_sqrt(8 * result.size() + 1) + 1) / 2;
+  }
+
+  void dump(const std::string &filename) {
+    std::ofstream stream(filename);
+    for (std::size_t i = 0; i < nsamples; ++i) {
+      for (std::size_t j = 0; j < nsamples; ++j) {
+        stream << (*this)[{i, j}];
+        if (j != nsamples - 1)
+          stream << ", ";
+      }
+      stream << std::endl;
+    }
+  }
+
+  void dump_lower_triangular(const std::string &filename) {
+    std::ofstream stream(filename);
+#ifdef HAMMING_WITH_OPENMP
+    std::size_t block_size = 200;
+#pragma omp parallel for ordered schedule(static, 1)
+    for (std::size_t i_start = 1; i_start < nsamples; i_start += block_size) {
+      std::stringstream line;
+      std::size_t i_end = i_start + block_size;
+      if (i_end > nsamples) {
+        i_end = nsamples;
+      }
+      for (std::size_t i = i_start; i < i_end; ++i) {
+        std::size_t offset{i * (i - 1) / 2};
+        for (std::size_t j = 0; j + 1 < i; ++j) {
+          line << static_cast<int>(result[offset + j]) << ",";
+        }
+        line << static_cast<int>(result[offset + i - 1]) << "\n";
+      }
+#pragma omp ordered
+      stream << line.str();
+    }
+#else
+    std::size_t k = 0;
+    for (std::size_t i = 1; i < nsamples; ++i) {
+      for (std::size_t j = 0; j + 1 < i; ++j) {
+        stream << static_cast<int>(result[k++]) << ",";
+      }
+      stream << static_cast<int>(result[k++]) << "\n";
+    }
+#endif
+  }
+
+  void dump_sequence_indices(const std::string &filename) {
+    std::ofstream stream(filename);
+    if (sequence_indices.empty()) {
+      for (std::size_t i = 0; i < nsamples; ++i) {
+        stream << i << "\n";
+      }
+      return;
+    }
+    for (auto sequence_index : sequence_indices) {
+      stream << sequence_index << "\n";
+    }
+  }
+
+  int operator[](const std::array<std::size_t, 2> &index) const {
+    auto i = index[0];
+    auto j = index[1];
+    if (i < j)
+      return result[j * (j - 1) / 2 + i];
+    if (i > j)
+      return result[i * (i - 1) / 2 + j];
+    // This is a diagonal entry
+    return 0;
+  }
+
+  std::size_t nsamples;
+  std::vector<DistIntType> result;
+  std::vector<std::size_t> sequence_indices{};
+};
+
+DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data);
+
+DataSet<DefaultDistIntType> from_csv(const std::string &filename);
+
+DataSet<DefaultDistIntType> from_lower_triangular(const std::string &filename);
+
+template <typename DistIntType>
+DataSet<DistIntType>
+from_fasta(const std::string &filename, bool include_x = false,
+           bool remove_duplicates = false, std::size_t n = 0) {
+  std::vector<std::string> data;
+  data.reserve(n);
+  if (n == 0) {
+    n = std::numeric_limits<std::size_t>::max();
+    data.reserve(65536);
+  }
+  std::unordered_map<std::string, std::size_t> map_seq_to_index;
+  std::vector<std::size_t> sequence_indices{};
+  // Initializing the stream
+  std::ifstream stream(filename);
+  std::size_t count = 0;
+  std::size_t count_unique = 0;
+  std::string line;
+  // skip first header
+  std::getline(stream, line);
+  while (count < n && !stream.eof()) {
+    std::string seq{};
+    while (std::getline(stream, line) && line[0] != '>') {
+      seq.append(line);
+    }
+    if (remove_duplicates) {
+      auto result = map_seq_to_index.emplace(std::move(seq), count_unique);
+      if (result.second) {
+        ++count_unique;
+      }
+      sequence_indices.push_back(result.first->second);
+    } else {
+      data.push_back(std::move(seq));
+    }
+    ++count;
+  }
+  if (remove_duplicates) {
+    // copy each unique sequence to the vector of strings
+    data.resize(count_unique);
+    for (auto &key_value_pair : map_seq_to_index) {
+      data[key_value_pair.second] = key_value_pair.first;
+    }
+  }
+  return DataSet<DistIntType>(data, include_x, true,
+                              std::move(sequence_indices));
+}
 
 ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
                               bool include_x = false);