-
Notifications
You must be signed in to change notification settings - Fork 196
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3844 from vgteam/haplotype-sampling
Haplotype sampling
- Loading branch information
Showing
15 changed files
with
2,871 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Submodule gbwtgraph
updated
12 files
+8 −1 | README.md | |
+71 −2 | include/gbwtgraph/algorithms.h | |
+9 −3 | include/gbwtgraph/gbwtgraph.h | |
+6 −4 | include/gbwtgraph/minimizer.h | |
+75 −3 | src/algorithms.cpp | |
+77 −0 | src/gbwtgraph.cpp | |
+19 −30 | src/gfa.cpp | |
+25 −25 | src/minimizer.cpp | |
+23 −0 | tests/gfas/for_subgraph.gfa | |
+29 −0 | tests/test_algorithms.cpp | |
+2 −0 | tests/test_gbwtgraph.cpp | |
+83 −2 | tests/test_gfa.cpp |
Submodule kff-cpp-api
added at
b174f7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
#include "kff.hpp" | ||
|
||
namespace vg { | ||
|
||
//------------------------------------------------------------------------------ | ||
|
||
// Encode up to 4 characters in one byte. | ||
uint8_t kff_encode(const std::string& kmer, size_t start, size_t limit, const uint8_t* encoding) { | ||
uint8_t val = 0; | ||
for (size_t i = start; i < limit; i++) { | ||
val <<= 2; | ||
auto packed = gbwtgraph::CHAR_TO_PACK[static_cast<uint8_t>(kmer[i])]; | ||
if (packed < 4) { | ||
val |= encoding[packed]; | ||
} | ||
} | ||
return val; | ||
} | ||
|
||
std::vector<uint8_t> kff_encode(const std::string& kmer, const uint8_t* encoding) { | ||
std::vector<uint8_t> result; | ||
result.reserve(kff_bytes(kmer.length())); | ||
|
||
// If k is not a multiple of 4, KFF adds the padding to the high-order bits | ||
// of the first byte. | ||
size_t remainder = kmer.length() & 3; | ||
if (remainder > 0) { | ||
result.push_back(kff_encode(kmer, 0, remainder, encoding)); | ||
} | ||
for (size_t i = remainder; i < kmer.length(); i += 4) { | ||
result.push_back(kff_encode(kmer, i, i + 4, encoding)); | ||
} | ||
|
||
return result; | ||
} | ||
|
||
//------------------------------------------------------------------------------ | ||
|
||
// Builds the inverse of a KFF encoding: a 4-character string in which the
// character at index encoding[i] is the i-th base of "ACGT". Positions not
// named by the encoding keep the initial space character.
std::string kff_invert(const uint8_t* encoding) {
    const char bases[4] = { 'A', 'C', 'G', 'T' };
    std::string table(4, ' ');
    for (size_t rank = 0; rank < 4; rank++) {
        table[encoding[rank]] = bases[rank];
    }
    return table;
}
|
||
// Appends `chars` characters decoded from the low-order 2 * chars bits of
// `byte` to `output`, starting from the highest of those bit pairs.
// Each 2-bit value is used as an index into `decoding`.
void kff_decode(uint8_t byte, size_t chars, const std::string& decoding, std::string& output) {
    for (size_t shift = 2 * chars; shift > 0; ) {
        shift -= 2;
        const size_t code = (byte >> shift) & 3;
        output.push_back(decoding[code]);
    }
}
|
||
std::string kff_decode(const uint8_t* kmer, size_t k, const std::string& decoding) { | ||
std::string result; | ||
result.reserve(k); | ||
|
||
size_t bytes = kff_bytes(k); | ||
size_t chars = k & 3; | ||
if (chars == 0) { | ||
chars = 4; | ||
} | ||
for (size_t i = 0; i < bytes; i++) { | ||
kff_decode(kmer[i], chars, decoding, result); | ||
chars = 4; | ||
} | ||
|
||
return result; | ||
} | ||
|
||
//------------------------------------------------------------------------------ | ||
|
||
// Re-encodes `chars` consecutive 2-bit symbols of a minimizer key into one
// byte. Reading starts at the bit pair just below bit 2 * k and moves towards
// the low-order end; each symbol is translated through `encoding`.
uint8_t kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, size_t chars, const uint8_t* encoding) {
    uint8_t packed_byte = 0;
    size_t shift = 2 * k;
    for (size_t remaining = chars; remaining > 0; remaining--) {
        shift -= 2;
        const auto symbol = (kmer >> shift) & 3;
        packed_byte = (packed_byte << 2) | encoding[symbol];
    }
    return packed_byte;
}
|
||
// Converts a minimizer key holding k symbols into KFF bytes.
// The first byte receives the leading (k mod 4) symbols when k is not a
// multiple of 4; each remaining byte receives the next 4 symbols.
std::vector<uint8_t> kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, const uint8_t* encoding) {
    std::vector<uint8_t> encoded;
    encoded.reserve(kff_bytes(k));

    const size_t head = k & 3;
    if (head != 0) {
        encoded.push_back(kff_recode(kmer, k, head, encoding));
    }
    // `left` is the number of symbols not yet written; it is a multiple of 4.
    for (size_t left = k - head; left > 0; left -= 4) {
        encoded.push_back(kff_recode(kmer, left, 4, encoding));
    }

    return encoded;
}
|
||
// Converts a KFF kmer of length k into a minimizer key value.
// Each 2-bit code is mapped to a character through `decoding`, and that
// character is mapped through CHAR_TO_PACK before being shifted into the key.
// The first byte holds (k mod 4) characters, or 4 when k is a multiple of 4.
gbwtgraph::Key64::value_type kff_recode(const uint8_t* kmer, size_t k, const std::string& decoding) {
    gbwtgraph::Key64::value_type key = 0;

    const size_t byte_count = kff_bytes(k);
    size_t width = k & 3;
    if (width == 0) {
        width = 4;
    }

    // Only the first byte can be partial; every later byte is full.
    for (size_t index = 0; index < byte_count; index++, width = 4) {
        const uint8_t packed = kmer[index];
        for (size_t shift = 2 * width; shift > 0; ) {
            shift -= 2;
            const unsigned char base = decoding[(packed >> shift) & 3];
            key = (key << 2) | gbwtgraph::CHAR_TO_PACK[base];
        }
    }

    return key;
}
|
||
std::vector<gbwtgraph::Key64::value_type> kff_recode(const uint8_t* kmers, size_t n, size_t k, const std::string& decoding) { | ||
std::vector<gbwtgraph::Key64::value_type> result; | ||
result.reserve(n); | ||
|
||
size_t total_chars = n + k - 1; | ||
size_t bytes = kff_bytes(total_chars); | ||
size_t chars = total_chars & 3; | ||
if (chars == 0) { | ||
chars = 4; | ||
} | ||
|
||
gbwtgraph::Key64::value_type curr = 0; | ||
for (size_t i = 0, processed = 0; i < bytes; i++) { | ||
size_t offset = 2 * chars; | ||
for (size_t j = 0; j < chars; j++) { | ||
offset -= 2; | ||
unsigned char c = decoding[(kmers[i] >> offset) & 3]; | ||
curr = (curr << 2) | gbwtgraph::CHAR_TO_PACK[c]; | ||
processed++; | ||
if (processed >= k) { | ||
result.push_back(curr & sdsl::bits::lo_set[2 * k]); | ||
} | ||
} | ||
chars = 4; | ||
} | ||
|
||
return result; | ||
} | ||
|
||
//------------------------------------------------------------------------------ | ||
|
||
// Returns the 2-bit value stored in slot i of a packed byte array.
// Slots are numbered from the high-order end of each byte, 4 per byte.
uint8_t kff_get(const uint8_t* kmer, size_t i) {
    const size_t slot_in_byte = i % 4;
    const size_t shift = 2 * (3 - slot_in_byte);
    return (kmer[i >> 2] >> shift) & 3;
}
|
||
// Stores a 2-bit value in slot i of a packed byte vector.
// Slots are numbered from the high-order end of each byte, 4 per byte.
// The slot is cleared before writing, so the result does not depend on its
// previous contents, and only the low 2 bits of `value` are used so that a
// larger value cannot spill into neighboring slots.
// The caller must ensure that byte i / 4 exists in `kmer`.
void kff_set(std::vector<uint8_t>& kmer, size_t i, uint8_t value) {
    const size_t byte = i / 4;
    const size_t shift = 2 * (3 - (i & 3));
    const unsigned slot_mask = 3u << shift;
    const unsigned cleared = kmer[byte] & ~slot_mask;
    kmer[byte] = static_cast<uint8_t>(cleared | ((value & 3u) << shift));
}
|
||
std::vector<uint8_t> kff_reverse_complement(const uint8_t* kmer, size_t k, const uint8_t* encoding) { | ||
uint8_t complement[4]; | ||
complement[encoding[0]] = encoding[3]; | ||
complement[encoding[1]] = encoding[2]; | ||
complement[encoding[2]] = encoding[1]; | ||
complement[encoding[3]] = encoding[0]; | ||
|
||
size_t offset = (4 - (k & 3)) & 3; | ||
std::vector<uint8_t> result(kff_bytes(k), 0); | ||
for (size_t i = 0; i < k; i++) { | ||
kff_set(result, 4 * result.size() - 1 - i, complement[kff_get(kmer, i + offset)]); | ||
} | ||
return result; | ||
} | ||
|
||
// Returns the reverse complement of a minimizer key holding k symbols.
// Symbols are consumed from the low-order end of the input and appended to
// the output, which reverses their order; each 2-bit symbol is complemented
// by flipping both of its bits.
gbwtgraph::Key64::value_type minimizer_reverse_complement(gbwtgraph::Key64::value_type kmer, size_t k) {
    gbwtgraph::Key64::value_type reversed = 0;
    for (size_t remaining = k; remaining > 0; remaining--) {
        const gbwtgraph::Key64::value_type flipped = (kmer & 3) ^ 3;
        reversed <<= 2;
        reversed |= flipped;
        kmer >>= 2;
    }
    return reversed;
}
|
||
//------------------------------------------------------------------------------ | ||
|
||
// Parses a big-endian unsigned integer from the first `bytes` bytes of `data`.
// Returns 0 when `bytes` is 0. When `bytes` exceeds 8, the value does not fit
// in 64 bits; the leading bytes are shifted out and the result holds the low
// 64 bits, instead of shifting by 64 or more, which is undefined behavior.
uint64_t kff_parse(const uint8_t* data, size_t bytes) {
    uint64_t value = 0;
    for (size_t i = 0; i < bytes; i++) {
        // Make room for the next byte; earlier bytes are more significant.
        value = (value << 8) | static_cast<uint64_t>(data[i]);
    }
    return value;
}
|
||
//------------------------------------------------------------------------------ | ||
|
||
} // namespace vg |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#ifndef VG_KFF_HPP_INCLUDED | ||
#define VG_KFF_HPP_INCLUDED | ||
|
||
/** \file | ||
* Tools for working with the Kmer File Format (KFF). | ||
*/ | ||
|
||
#include <kff_io.hpp> | ||
|
||
#include <gbwtgraph/minimizer.h> | ||
|
||
namespace vg { | ||
|
||
//------------------------------------------------------------------------------ | ||
|
||
/// Returns the number of bytes needed to store a kmer of length `k` in KFF
/// format, at 4 characters per byte with the last partial group rounded up.
inline size_t kff_bytes(size_t k) {
    const size_t rounded_up = k + 3;
    return rounded_up >> 2;
}
|
||
/// Encodes a kmer in KFF format according to the given encoding. | ||
/// Non-ACGT characters are encoded as 0s. | ||
std::vector<uint8_t> kff_encode(const std::string& kmer, const uint8_t* encoding); | ||
|
||
/// Inverts the KFF encoding into a packed -> char table. | ||
std::string kff_invert(const uint8_t* encoding); | ||
|
||
/// Decodes a kmer in KFF format according to the given encoding. | ||
std::string kff_decode(const uint8_t* kmer, size_t k, const std::string& decoding); | ||
|
||
/// Recodes a kmer from a minimizer index in KFF format according to the given encoding. | ||
std::vector<uint8_t> kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, const uint8_t* encoding); | ||
|
||
/// Recodes a KFF kmer in the minimizer index format according to the given encoding. | ||
/// Will fail silently if `k` is too large or `decoding` is not from `kff_invert()`. | ||
gbwtgraph::Key64::value_type kff_recode(const uint8_t* kmer, size_t k, const std::string& decoding); | ||
|
||
/// Recodes `n` KFF kmers in the minimizer index format according to the given encoding. | ||
/// Will fail silently if `k` is too large or `decoding` is not from `kff_invert()`. | ||
std::vector<gbwtgraph::Key64::value_type> kff_recode(const uint8_t* kmers, size_t n, size_t k, const std::string& decoding); | ||
|
||
/// Returns the reverse complement of a KFF kmer. | ||
std::vector<uint8_t> kff_reverse_complement(const uint8_t* kmer, size_t k, const uint8_t* encoding); | ||
|
||
/// Returns the reverse complement of a minimizer index kmer. | ||
gbwtgraph::Key64::value_type minimizer_reverse_complement(gbwtgraph::Key64::value_type kmer, size_t k); | ||
|
||
/// Parses a big-endian integer from KFF data. | ||
uint64_t kff_parse(const uint8_t* data, size_t bytes); | ||
|
||
//------------------------------------------------------------------------------ | ||
|
||
} // namespace vg | ||
|
||
#endif // VG_KFF_HPP_INCLUDED |
Oops, something went wrong.
8cbbe0f
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
vg CI tests complete for merge to master. View the full report here.
16 tests passed, 0 tests failed and 0 tests skipped in 9862 seconds