Skip to content

Commit

Permalink
Merge pull request #3844 from vgteam/haplotype-sampling
Browse files Browse the repository at this point in the history
Haplotype sampling
  • Loading branch information
jltsiren authored Feb 10, 2023
2 parents a6bef17 + 5207546 commit 8cbbe0f
Show file tree
Hide file tree
Showing 15 changed files with 2,871 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,6 @@
path = deps/tabixpp
url = https://github.com/ekg/tabixpp.git
branch = master
[submodule "deps/kff-cpp-api"]
path = deps/kff-cpp-api
url = https://github.com/Kmer-File-Format/kff-cpp-api.git
15 changes: 14 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ INCLUDE_FLAGS :=-I$(CWD)/$(INC_DIR) -isystem $(CWD)/$(INC_DIR) -I. -I$(CWD)/$(SR

# Define libraries to link vg against.
LD_LIB_DIR_FLAGS := -L$(CWD)/$(LIB_DIR)
LD_LIB_FLAGS := -lvcflib -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lzstd -lhandlegraph
LD_LIB_FLAGS := -lvcflib -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -lkff -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lzstd -lhandlegraph
# We omit Boost Program Options for now; we find it in a platform-dependent way.
# By default it has no suffix
BOOST_SUFFIX=""
Expand Down Expand Up @@ -315,6 +315,7 @@ SNAPPY_DIR:=deps/snappy
GCSA2_DIR:=deps/gcsa2
GBWT_DIR:=deps/gbwt
GBWTGRAPH_DIR=deps/gbwtgraph
KFF_DIR=deps/kff-cpp-api
PROGRESS_BAR_DIR:=deps/progress_bar
FASTAHACK_DIR:=deps/fastahack
FERMI_DIR:=deps/fermi-lite
Expand Down Expand Up @@ -362,6 +363,7 @@ LIB_DEPS += $(LIB_DIR)/libsnappy.a
LIB_DEPS += $(LIB_DIR)/libgcsa2.a
LIB_DEPS += $(LIB_DIR)/libgbwt.a
LIB_DEPS += $(LIB_DIR)/libgbwtgraph.a
LIB_DEPS += $(LIB_DIR)/libkff.a
LIB_DEPS += $(LIB_DIR)/libhts.a
LIB_DEPS += $(LIB_DIR)/libtabixpp.a
LIB_DEPS += $(LIB_DIR)/libvcflib.a
Expand Down Expand Up @@ -416,6 +418,7 @@ DEPS = $(LIB_DEPS)
DEPS += $(INC_DIR)/gcsa/gcsa.h
DEPS += $(INC_DIR)/gbwt/dynamic_gbwt.h
DEPS += $(INC_DIR)/gbwtgraph/gbwtgraph.h
DEPS += $(INC_DIR)/kff_io.hpp
DEPS += $(INC_DIR)/lru_cache.h
DEPS += $(INC_DIR)/dynamic/dynamic.hpp
DEPS += $(INC_DIR)/sparsehash/sparse_hash_map
Expand Down Expand Up @@ -558,6 +561,15 @@ else
+. ./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR)
endif

$(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a

$(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in
ifeq ($(shell uname -s),Darwin)
+. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR)
else
+. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR)
endif

$(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h
+cp $(BBHASH_DIR)/BooPHF.h $(CWD)/$(INC_DIR)

Expand Down Expand Up @@ -989,6 +1001,7 @@ clean: clean-vcflib
cd $(DEP_DIR) && cd gcsa2 && $(MAKE) clean
cd $(DEP_DIR) && cd gbwt && $(MAKE) clean
cd $(DEP_DIR) && cd gbwtgraph && $(MAKE) clean
cd $(DEP_DIR) && cd kff-cpp-api && rm -Rf build
cd $(DEP_DIR) && cd gssw && $(MAKE) clean
cd $(DEP_DIR) && cd ssw && cd src && $(MAKE) clean
cd $(DEP_DIR) && cd progress_bar && $(MAKE) clean
Expand Down
1 change: 1 addition & 0 deletions deps/kff-cpp-api
Submodule kff-cpp-api added at b174f7
11 changes: 11 additions & 0 deletions src/gbwtgraph_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,5 +182,16 @@ unordered_map<nid_t, pair<string, size_t>> load_translation_back_map(const gbwtg
return translation_back_map;
}

//------------------------------------------------------------------------------

std::string to_string_gbwtgraph(handle_t handle) {
return to_string_gbwtgraph(gbwtgraph::GBWTGraph::handle_to_node(handle));
}

std::string to_string_gbwtgraph(gbwt::node_type node) {
return std::string("(") + std::to_string(gbwt::Node::id(node)) + std::string(", ") + std::to_string(gbwt::Node::is_reverse(node)) + std::string(")");
}

//------------------------------------------------------------------------------

} // namespace vg
13 changes: 13 additions & 0 deletions src/gbwtgraph_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,19 @@ std::unordered_map<nid_t, std::pair<std::string, size_t>> load_translation_back_

//------------------------------------------------------------------------------

/// Returns an empty GBWTGraph handle corresponding to the GBWT endmarker.
inline handle_t empty_gbwtgraph_handle() {
return gbwtgraph::GBWTGraph::node_to_handle(0);
}

/// Returns a string representation of a GBWTGraph handle.
std::string to_string_gbwtgraph(handle_t handle);

/// Returns a string representation of a GBWTGraph node.
std::string to_string_gbwtgraph(gbwt::node_type node);

//------------------------------------------------------------------------------

} // namespace vg

#endif // VG_GBWTGRAPH_HELPER_HPP_INCLUDED
204 changes: 204 additions & 0 deletions src/kff.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
#include "kff.hpp"

namespace vg {

//------------------------------------------------------------------------------

// Encode up to 4 characters in one byte.
uint8_t kff_encode(const std::string& kmer, size_t start, size_t limit, const uint8_t* encoding) {
uint8_t val = 0;
for (size_t i = start; i < limit; i++) {
val <<= 2;
auto packed = gbwtgraph::CHAR_TO_PACK[static_cast<uint8_t>(kmer[i])];
if (packed < 4) {
val |= encoding[packed];
}
}
return val;
}

std::vector<uint8_t> kff_encode(const std::string& kmer, const uint8_t* encoding) {
std::vector<uint8_t> result;
result.reserve(kff_bytes(kmer.length()));

// If k is not a multiple of 4, KFF adds the padding to the high-order bits
// of the first byte.
size_t remainder = kmer.length() & 3;
if (remainder > 0) {
result.push_back(kff_encode(kmer, 0, remainder, encoding));
}
for (size_t i = remainder; i < kmer.length(); i += 4) {
result.push_back(kff_encode(kmer, i, i + 4, encoding));
}

return result;
}

//------------------------------------------------------------------------------

std::string kff_invert(const uint8_t* encoding) {
std::string result(4, ' ');
result[encoding[0]] = 'A';
result[encoding[1]] = 'C';
result[encoding[2]] = 'G';
result[encoding[3]] = 'T';
return result;
}

// Decode up to 4 characters from one byte
void kff_decode(uint8_t byte, size_t chars, const std::string& decoding, std::string& output) {
size_t offset = 2 * chars;
for (size_t i = 0; i < chars; i++) {
offset -= 2;
output.push_back(decoding[(byte >> offset) & 3]);
}
}

std::string kff_decode(const uint8_t* kmer, size_t k, const std::string& decoding) {
std::string result;
result.reserve(k);

size_t bytes = kff_bytes(k);
size_t chars = k & 3;
if (chars == 0) {
chars = 4;
}
for (size_t i = 0; i < bytes; i++) {
kff_decode(kmer[i], chars, decoding, result);
chars = 4;
}

return result;
}

//------------------------------------------------------------------------------

// Recode up to 4 characters in one byte.
uint8_t kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, size_t chars, const uint8_t* encoding) {
size_t offset = 2 * k;
uint8_t val = 0;
for (size_t i = 0; i < chars; i++) {
offset -= 2;
val = (val << 2) | encoding[(kmer >> offset) & 3];
}
return val;
}

std::vector<uint8_t> kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, const uint8_t* encoding) {
std::vector<uint8_t> result;
result.reserve(kff_bytes(k));

size_t remainder = k & 3;
if (remainder > 0) {
result.push_back(kff_recode(kmer, k, remainder, encoding));
}
for (size_t i = remainder; i < k; i += 4) {
result.push_back(kff_recode(kmer, k - i, 4, encoding));
}

return result;
}

gbwtgraph::Key64::value_type kff_recode(const uint8_t* kmer, size_t k, const std::string& decoding) {
gbwtgraph::Key64::value_type result = 0;

size_t bytes = kff_bytes(k);
size_t chars = k & 3;
if (chars == 0) {
chars = 4;
}
for (size_t i = 0; i < bytes; i++) {
size_t offset = 2 * chars;
for (size_t j = 0; j < chars; j++) {
offset -= 2;
unsigned char c = decoding[(kmer[i] >> offset) & 3];
result = (result << 2) | gbwtgraph::CHAR_TO_PACK[c];
}
chars = 4;
}

return result;
}

std::vector<gbwtgraph::Key64::value_type> kff_recode(const uint8_t* kmers, size_t n, size_t k, const std::string& decoding) {
std::vector<gbwtgraph::Key64::value_type> result;
result.reserve(n);

size_t total_chars = n + k - 1;
size_t bytes = kff_bytes(total_chars);
size_t chars = total_chars & 3;
if (chars == 0) {
chars = 4;
}

gbwtgraph::Key64::value_type curr = 0;
for (size_t i = 0, processed = 0; i < bytes; i++) {
size_t offset = 2 * chars;
for (size_t j = 0; j < chars; j++) {
offset -= 2;
unsigned char c = decoding[(kmers[i] >> offset) & 3];
curr = (curr << 2) | gbwtgraph::CHAR_TO_PACK[c];
processed++;
if (processed >= k) {
result.push_back(curr & sdsl::bits::lo_set[2 * k]);
}
}
chars = 4;
}

return result;
}

//------------------------------------------------------------------------------

uint8_t kff_get(const uint8_t* kmer, size_t i) {
size_t byte = i / 4;
size_t offset = 3 - (i & 3);
return (kmer[byte] >> (2 * offset)) & 3;
}

void kff_set(std::vector<uint8_t>& kmer, size_t i, uint8_t value) {
size_t byte = i / 4;
size_t offset = 3 - (i & 3);
kmer[byte] |= value << (2 * offset);
}

std::vector<uint8_t> kff_reverse_complement(const uint8_t* kmer, size_t k, const uint8_t* encoding) {
uint8_t complement[4];
complement[encoding[0]] = encoding[3];
complement[encoding[1]] = encoding[2];
complement[encoding[2]] = encoding[1];
complement[encoding[3]] = encoding[0];

size_t offset = (4 - (k & 3)) & 3;
std::vector<uint8_t> result(kff_bytes(k), 0);
for (size_t i = 0; i < k; i++) {
kff_set(result, 4 * result.size() - 1 - i, complement[kff_get(kmer, i + offset)]);
}
return result;
}

gbwtgraph::Key64::value_type minimizer_reverse_complement(gbwtgraph::Key64::value_type kmer, size_t k) {
gbwtgraph::Key64::value_type result = 0;
for (size_t i = 0; i < k; i++) {
result = (result << 2) | ((kmer & 3) ^ 3);
kmer >>= 2;
}
return result;
}

//------------------------------------------------------------------------------

uint64_t kff_parse(const uint8_t* data, size_t bytes) {
uint64_t value = 0;
size_t shift = 8 * bytes;
for (size_t i = 0; i < bytes; i++) {
shift -= 8;
value |= static_cast<uint64_t>(data[i]) << shift;
}
return value;
}

//------------------------------------------------------------------------------

} // namespace vg
55 changes: 55 additions & 0 deletions src/kff.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#ifndef VG_KFF_HPP_INCLUDED
#define VG_KFF_HPP_INCLUDED

/** \file
* Tools for working with the Kmer File Format (KFF).
*/

#include <kff_io.hpp>

#include <gbwtgraph/minimizer.h>

namespace vg {

//------------------------------------------------------------------------------

/// Returns the number of bytes required for a kmer in KFF format.
inline size_t kff_bytes(size_t k) {
return (k + 3) / 4;
}

/// Encodes a kmer in KFF format according to the given encoding.
/// Non-ACGT characters are encoded as 0s.
std::vector<uint8_t> kff_encode(const std::string& kmer, const uint8_t* encoding);

/// Inverts the KFF encoding into a packed -> char table.
std::string kff_invert(const uint8_t* encoding);

/// Decodes a kmer in KFF format according to the given encoding.
std::string kff_decode(const uint8_t* kmer, size_t k, const std::string& decoding);

/// Recodes a kmer from a minimizer index in KFF format according to the given encoding.
std::vector<uint8_t> kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, const uint8_t* encoding);

/// Recodes a KFF kmer in the minimizer index format according to the given encoding.
/// Will fail silently if `k` is too large or `decoding` is not from `kff_invert()`.
gbwtgraph::Key64::value_type kff_recode(const uint8_t* kmer, size_t k, const std::string& decoding);

/// Recodes `n` KFF kmers in the minimizer index format according to the given encoding.
/// Will fail silently if `k` is too large or `decoding` is not from `kff_invert()`.
std::vector<gbwtgraph::Key64::value_type> kff_recode(const uint8_t* kmers, size_t n, size_t k, const std::string& decoding);

/// Returns the reverse complement of a KFF kmer.
std::vector<uint8_t> kff_reverse_complement(const uint8_t* kmer, size_t k, const uint8_t* encoding);

/// Returns the reverse complement of a minimizer index kmer.
gbwtgraph::Key64::value_type minimizer_reverse_complement(gbwtgraph::Key64::value_type kmer, size_t k);

/// Parses a big-endian integer from KFF data.
uint64_t kff_parse(const uint8_t* data, size_t bytes);

//------------------------------------------------------------------------------

} // namespace vg

#endif // VG_KFF_HPP_INCLUDED
Loading

1 comment on commit 8cbbe0f

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 9862 seconds

Please sign in to comment.