Skip to content

Commit

Permalink
Merge pull request #75 from ssciwr/fix73_uint8_uint16_choice
Browse files Browse the repository at this point in the history
allow runtime choice of uint8 or uint16 for distances matrix element type
  • Loading branch information
lkeegan authored Sep 5, 2022
2 parents 0ea8ac9 + de2e3f5 commit 13ef3ca
Show file tree
Hide file tree
Showing 31 changed files with 457 additions and 382 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.11)

project(
hammingdist
VERSION 0.17.0
VERSION 0.18.0
LANGUAGES CXX)

include(CTest)
Expand Down
42 changes: 42 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,48 @@ data.dump_sequence_indices("indices.txt")
data = hammingdist.from_stringlist(["ACGTACGT", "ACGTAGGT", "ATTTACGT"])
```

## Duplicates

When `from_fasta` is called with the option `remove_duplicates=True`, duplicate sequences are removed before constructing the differences matrix.

For example given this set of three input sequences:

| Index | Sequence |
| ----- | -------- |
| 0 | ACG |
| 1 | ACG |
| 2 | TAG |

The distances matrix would be a 2x2 matrix of distances between `ACG` and `TAT`:
| | ACG | TAT |
| --- | --- | --- |
| ACG | 0 | 2 |
| TAT | 2 | 0 |

The row of the distances matrix corresponding to each index in the original sequence would be:

| Index | Sequence | Row in distances matrix |
| ----- | -------- | ----------------------- |
| 0 | ACG | 0 |
| 1 | ACG | 0 |
| 2 | TAT | 1 |

This last column is what is written to disk by `DataSet.dump_sequence_indices`.

It can also be constructed (as a numpy array) without calculating the distances matrix by using `hammingdist.fasta_sequence_indices`

```python
import hammingdist

sequence_indices = hammingdist.fasta_sequence_indices(fasta_file)
```

## Large distance values

By default, the elements in the distances matrix returned by `hammingdist.from_fasta` have a maximum value of 255.

For distances larger than this `hammingdist.from_fasta_large` supports distances up to 65535 (but uses twice as much RAM)

## Distances from reference sequence

The distance of each sequence in a fasta file from a given reference sequence can be calculated using:
Expand Down
3 changes: 2 additions & 1 deletion distance.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ int main(int argc, char *argv[]) {
std::size_t nsamples = std::stoi(std::string(argv[2]));

auto start = std::chrono::steady_clock::now();
auto data = hamming::from_fasta(filename, nsamples);
auto data =
hamming::from_fasta<hamming::DefaultDistIntType>(filename, nsamples);
data.dump("distances.csv");
auto stop = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = stop - start;
Expand Down
2 changes: 1 addition & 1 deletion src/distance_avx2.hh → include/hamming/distance_avx2.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <cstdint>
#include <vector>

#include "hamming_impl_types.hh"
#include "hamming/hamming_impl_types.hh"

namespace hamming {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <cstdint>
#include <vector>

#include "hamming_impl_types.hh"
#include "hamming/hamming_impl_types.hh"

namespace hamming {

Expand Down
2 changes: 1 addition & 1 deletion src/distance_sse2.hh → include/hamming/distance_sse2.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <cstdint>
#include <vector>

#include "hamming_impl_types.hh"
#include "hamming/hamming_impl_types.hh"

namespace hamming {

Expand Down
176 changes: 171 additions & 5 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
@@ -1,17 +1,183 @@
#ifndef _HAMMING_HH
#define _HAMMING_HH

#include "hamming/hamming_impl.hh"
#include "hamming/hamming_types.hh"
#include <cmath>
#include <fstream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace hamming {

DataSet from_stringlist(std::vector<std::string> &);
DataSet from_csv(const std::string &);
DataSet from_fasta(const std::string &, bool include_x = false,
bool remove_duplicates = false, std::size_t n = 0);
DataSet from_lower_triangular(const std::string &);
inline std::size_t uint_sqrt(std::size_t x) {
return static_cast<std::size_t>(std::round(std::sqrt(x)));
}

template <typename DistIntType> struct DataSet {
explicit DataSet(std::vector<std::string> &data, bool include_x = false,
bool clear_input_data = false,
std::vector<std::size_t> &&indices = {})
: nsamples(data.size()), sequence_indices(std::move(indices)) {
validate_data(data);
result = distances<DistIntType>(data, include_x, clear_input_data);
}

explicit DataSet(const std::string &filename) {
// Determine correct dataset size
std::ifstream stream(filename);
std::string line;
nsamples = std::count(std::istreambuf_iterator<char>(stream),
std::istreambuf_iterator<char>(), '\n');
result.resize(nsamples * (nsamples + 1) / 2);

// Read the data
stream = std::ifstream(filename);
std::size_t i{0};
std::size_t current{0};
while (std::getline(stream, line)) {
std::istringstream s(line);
std::string d;
for (std::size_t j = 0; j < current; ++j) {
std::getline(s, d, ',');
result[i++] = std::stoi(d);
}
++current;
}
}

explicit DataSet(std::vector<DistIntType> &&distances)
: result{std::move(distances)} {
// infer n from number of lower triangular matrix elements = n(n-1)/2
nsamples = (uint_sqrt(8 * result.size() + 1) + 1) / 2;
}

void dump(const std::string &filename) {
std::ofstream stream(filename);
for (std::size_t i = 0; i < nsamples; ++i) {
for (std::size_t j = 0; j < nsamples; ++j) {
stream << (*this)[{i, j}];
if (j != nsamples - 1)
stream << ", ";
}
stream << std::endl;
}
}

void dump_lower_triangular(const std::string &filename) {
std::ofstream stream(filename);
#ifdef HAMMING_WITH_OPENMP
std::size_t block_size = 200;
#pragma omp parallel for ordered schedule(static, 1)
for (std::size_t i_start = 1; i_start < nsamples; i_start += block_size) {
std::stringstream line;
std::size_t i_end = i_start + block_size;
if (i_end > nsamples) {
i_end = nsamples;
}
for (std::size_t i = i_start; i < i_end; ++i) {
std::size_t offset{i * (i - 1) / 2};
for (std::size_t j = 0; j + 1 < i; ++j) {
line << static_cast<int>(result[offset + j]) << ",";
}
line << static_cast<int>(result[offset + i - 1]) << "\n";
}
#pragma omp ordered
stream << line.str();
}
#else
std::size_t k = 0;
for (std::size_t i = 1; i < nsamples; ++i) {
for (std::size_t j = 0; j + 1 < i; ++j) {
stream << static_cast<int>(result[k++]) << ",";
}
stream << static_cast<int>(result[k++]) << "\n";
}
#endif
}

void dump_sequence_indices(const std::string &filename) {
std::ofstream stream(filename);
if (sequence_indices.empty()) {
for (std::size_t i = 0; i < nsamples; ++i) {
stream << i << "\n";
}
return;
}
for (auto sequence_index : sequence_indices) {
stream << sequence_index << "\n";
}
}

int operator[](const std::array<std::size_t, 2> &index) const {
auto i = index[0];
auto j = index[1];
if (i < j)
return result[j * (j - 1) / 2 + i];
if (i > j)
return result[i * (i - 1) / 2 + j];
// This is a diagonal entry
return 0;
}

std::size_t nsamples;
std::vector<DistIntType> result;
std::vector<std::size_t> sequence_indices{};
};

DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data);

DataSet<DefaultDistIntType> from_csv(const std::string &filename);

DataSet<DefaultDistIntType> from_lower_triangular(const std::string &filename);

template <typename DistIntType>
DataSet<DistIntType>
from_fasta(const std::string &filename, bool include_x = false,
bool remove_duplicates = false, std::size_t n = 0) {
std::vector<std::string> data;
data.reserve(n);
if (n == 0) {
n = std::numeric_limits<std::size_t>::max();
data.reserve(65536);
}
std::unordered_map<std::string, std::size_t> map_seq_to_index;
std::vector<std::size_t> sequence_indices{};
// Initializing the stream
std::ifstream stream(filename);
std::size_t count = 0;
std::size_t count_unique = 0;
std::string line;
// skip first header
std::getline(stream, line);
while (count < n && !stream.eof()) {
std::string seq{};
while (std::getline(stream, line) && line[0] != '>') {
seq.append(line);
}
if (remove_duplicates) {
auto result = map_seq_to_index.emplace(std::move(seq), count_unique);
if (result.second) {
++count_unique;
}
sequence_indices.push_back(result.first->second);
} else {
data.push_back(std::move(seq));
}
++count;
}
if (remove_duplicates) {
// copy each unique sequence to the vector of strings
data.resize(count_unique);
for (auto &key_value_pair : map_seq_to_index) {
data[key_value_pair.second] = key_value_pair.first;
}
}
return DataSet<DistIntType>(data, include_x, true,
std::move(sequence_indices));
}

ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
bool include_x = false);
Expand Down
Loading

0 comments on commit 13ef3ca

Please sign in to comment.