From 762ea299ea841816544cc9c99b80d67a400df5c5 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 12 Aug 2021 15:51:59 -0400 Subject: [PATCH 1/2] add algorithm to zip up tails in multipath alignments --- src/aligner.cpp | 12 +- src/aligner.hpp | 7 +- src/multipath_alignment_graph.cpp | 502 ++++++++++++++++++++++++++++-- src/multipath_alignment_graph.hpp | 6 + src/position.cpp | 6 + src/position.hpp | 2 +- src/splicing.cpp | 80 +++-- 7 files changed, 536 insertions(+), 79 deletions(-) diff --git a/src/aligner.cpp b/src/aligner.cpp index 67081acc8c4..8f958e2f7cd 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -1559,17 +1559,17 @@ int32_t Aligner::score_full_length_bonus(bool left_side, const Alignment& alignm return full_length_bonus; } -int32_t Aligner::score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path, +int32_t Aligner::score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, string::const_iterator seq_begin, bool no_read_end_scoring) const { int32_t score = 0; string::const_iterator read_pos = seq_begin; bool in_deletion = false; for (size_t i = 0; i < path.mapping_size(); i++) { - const Mapping& mapping = path.mapping(i); + const auto& mapping = path.mapping(i); for (size_t j = 0; j < mapping.edit_size(); j++) { - const Edit& edit = mapping.edit(j); + const auto& edit = mapping.edit(j); if (edit.from_length() > 0) { if (edit.to_length() > 0) { @@ -2255,7 +2255,7 @@ int32_t QualAdjAligner::score_full_length_bonus(bool left_side, const Alignment& alignment.quality().begin()); } -int32_t QualAdjAligner::score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path, +int32_t QualAdjAligner::score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, string::const_iterator seq_begin, bool no_read_end_scoring) const { int32_t score = 0; @@ -2264,7 +2264,7 @@ int32_t QualAdjAligner::score_partial_alignment(const Alignment& alignment, cons bool in_deletion = false; for (size_t i = 0; i < path.mapping_size(); i++) { - const Mapping& mapping = path.mapping(i); + const auto& mapping = path.mapping(i); // get the sequence of this node on the proper strand string node_seq = graph.get_sequence(graph.get_handle(mapping.position().node_id(), @@ -2273,7 +2273,7 @@ int32_t QualAdjAligner::score_partial_alignment(const Alignment& alignment, cons string::const_iterator ref_pos = node_seq.begin() + mapping.position().offset(); for (size_t j = 0; j < mapping.edit_size(); j++) { - const Edit& edit = mapping.edit(j); + const auto& edit = mapping.edit(j); if (edit.from_length() > 0) { if (edit.to_length() > 0) { diff --git a/src/aligner.hpp b/src/aligner.hpp index 418422f7ce1..276acbf0169 100644 --- a/src/aligner.hpp +++ b/src/aligner.hpp @@ -14,6 +14,7 @@ #include "Variant.h" #include "Fasta.h" #include "handle.hpp" +#include "path.hpp" #include "dozeu_interface.hpp" #include "deletion_aligner.hpp" @@ -182,7 +183,7 @@ namespace vg { virtual int32_t score_full_length_bonus(bool left_side, const Alignment& alignment) const = 0; /// Compute the score of a path against the given range of subsequence with the given qualities. - virtual int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path, + virtual int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, string::const_iterator seq_begin, bool no_read_end_scoring = false) const = 0; /// Returns the score of an insert or deletion of the given length @@ -370,7 +371,7 @@ namespace vg { int32_t score_full_length_bonus(bool left_side, const Alignment& alignment) const; - int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path, + int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, string::const_iterator seq_begin, bool no_read_end_scoring = false) const; private: @@ -429,7 +430,7 @@ namespace vg { int32_t score_full_length_bonus(bool left_side, const Alignment& alignment) const; - int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path, + int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, string::const_iterator seq_begin, bool no_read_end_scoring = false) const; diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 2a224656632..48f49eb1bb0 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -4144,6 +4144,245 @@ namespace vg { align(alignment, align_graph, aligner, score_anchors_as_matches, max_alt_alns, dynamic_alt_alns, max_gap, pessimistic_tail_gap_multiplier, constant_padding, multipath_aln_out, allow_negative_scores); } + + pair MultipathAlignmentGraph::zip_alignments(vector>& alt_alns, bool from_left, + const Alignment& alignment, const HandleGraph& align_graph, + string::const_iterator begin, const GSSWAligner* aligner) { + pair return_val; + + int64_t i, j, incr; + auto& path_1 = alt_alns.front().first; + if (from_left) { + incr = 1; + i = 0; + } + else { + incr = -1; + i = path_1.mapping_size() - 1; + } + // past-the-last index of the longest match that concludes in a match/mismatvch + int64_t last_aligned_i, last_aligned_j; + + // walk the first path to check for full prefix/suffix matches + bool found_mismatch = false, in_indel = false; + while (i >= 0 && i < path_1.mapping_size()) { + const auto& mapping_1 = path_1.mapping(i); + // start from the appropriate side of the mapping + if (from_left) { + j = 0; + } + else { + j = mapping_1.edit_size() - 1; + } + while(j >= 0 && j < path_1.mapping(i).edit_size()) { + const auto& edit_1 = mapping_1.edit(j); + if (!in_indel) { + // we've matched up to here and the most recent match wasn't an indel + last_aligned_i = i; + last_aligned_j = j; + } + // check whether all of the other paths match the first path at the next edit + for (size_t k = 1; k < alt_alns.size() && !found_mismatch; ++k) { + auto& path_2 = alt_alns[k].first; + // find the corresponding mapping index + int64_t i2; + if (from_left) { + i2 = 0; + } + else { + i2 = path_2.mapping_size() - path_1.mapping_size() + i; + } + + if (i2 < 0 || i2 >= path_2.mapping_size()) { + found_mismatch = true; + break; + } + const auto& mapping_2 = path_2.mapping(i2); + // find the corresponding edit index + int64_t j2; + if (from_left) { + j2 = 0; + } + else { + j2 = mapping_2.edit_size() - mapping_1.edit_size() + j; + } + if (j2 < 0 || j2 >= mapping_2.edit_size()) { + found_mismatch = true; + break; + } + const auto& edit_2 = mapping_2.edit(j2); + found_mismatch = (edit_1 != edit_2); + } + if (found_mismatch) { + break; + } + else { + j += incr; + in_indel = (edit_1.from_length() == 0 || edit_1.to_length() == 0); + } + } + if (found_mismatch) { + break; + } + else { + i += incr; + } + } + + if (in_indel) { + // the match concluded in an insertion or deletion, we need to make sure it doesn't + // continue onto the next edit + bool all_nexts_aligned = true; + if (i >= 0 && i < path_1.mapping_size()) { + // we didn't match the entire first path, so we need to actually check + const auto& edit = path_1.mapping(i).edit(j); + all_nexts_aligned = (edit.from_length() != 0 && edit.to_length() != 0); + } + for (size_t k = 1; k < alt_alns.size() && all_nexts_aligned; ++k) { + auto& path_2 = alt_alns[k].first; + int64_t i2 = path_2.mapping_size() - path_1.mapping_size() + i; + if (i2 >= 0 && i2 < path_2.mapping_size()) { + // we didn't run through the full path + const auto& mapping_2 = path_2.mapping(i2); + int64_t j2 = mapping_2.edit_size() - path_1.mapping(i).edit_size() + j; + if (j2 >= 0 && j2 < mapping_2.edit_size()) { + // we didn't run through the full mapping + const auto& edit = mapping_2.edit(j2); + all_nexts_aligned = (edit.from_length() != 0 && edit.to_length() != 0); + } + } + } + if (!all_nexts_aligned) { + // we need to backtrack to the last aligned base + i = last_aligned_i; + j = last_aligned_j; + } + } + + if ((from_left && (i != 0 || j != 0)) || + (!from_left && (i != path_1.mapping_size() - 1 || j != path_1.mapping().back().edit_size() - 1))) { + + // we matched part of the path across all alternate alignments, we can zip it up + + // delete the corresponding parts from all the other paths + for (int64_t k = 1; k < alt_alns.size(); ++k) { + auto& path_2 = alt_alns[k].first; + if (from_left) { + // we have to remove part of the path from the beginning of the existing paths + int64_t num_full_mappings = i; + if (i < path_2.mapping_size()) { + auto mapping = path_2.mutable_mapping(i); + if (j < mapping->edit_size()) { + // we have to remove part of the mapping + int64_t from_len = 0; + for (int64_t l = 0; l < j; ++l) { + from_len += mapping->edit(l).from_length(); + } + mapping->mutable_edit()->erase(mapping->mutable_edit()->begin(), + mapping->mutable_edit()->begin() + j); + mapping->mutable_position()->set_offset(mapping->position().offset() + from_len); + } + else { + // we can actually remove the whole mapping + ++num_full_mappings; + } + } + // remove any full mappings that are shared + path_2.mutable_mapping()->erase(path_2.mutable_mapping()->begin(), + path_2.mutable_mapping()->begin() + i); + } + else { + // we remove from the back (easier, but need to translate indexes) + int64_t i2 = path_2.mapping_size() - path_1.mapping_size() + i; + path_2.mutable_mapping()->resize(i2 + 1); + const auto& mapping_1 = path_1.mapping(i); + auto mapping_2 = path_2.mutable_mapping(i2); + int64_t j2 = mapping_2->edit_size() - mapping_1.edit_size() + j; + mapping_2->mutable_edit()->resize(j2 + 1); + if (mapping_2->edit().empty()) { + // we actually removed the whole final mapping, get rid of it + path_2.mutable_mapping()->pop_back(); + } + } + } + + // figure out how much of the primary to move over to the zipped alignment + int64_t k, num_mappings, num_edits; + if (from_left) { + k = 0; + num_mappings = i; + num_edits = j; + if (i < path_1.mapping_size() && j >= path_1.mapping(i).edit_size()) { + ++num_mappings; + num_edits = 0; + } + } + else { + k = path_1.mapping_size() - 1; + num_mappings = k - i; + num_edits = 0; + if (j < 0) { + ++num_mappings; + } + else if (i >= 0) { + num_edits = path_1.mapping(i).edit_size() - j - 1; + } + } + // copy to the zipped alignment + for (int64_t m = 0; m < num_mappings; k += incr, ++m) { + *return_val.first.add_mapping() = move(*path_1.mutable_mapping(k)); + } + if (num_edits != 0) { + auto mapping = path_1.mutable_mapping(k); + auto pos = mapping->mutable_position(); + auto copy_mapping = return_val.first.add_mapping(); + auto copy_pos = copy_mapping->mutable_position(); + for (int64_t l = from_left ? 0 : mapping->edit_size() - 1, e = 0; e < num_edits; l += incr, ++e) { + auto edit = mapping->mutable_edit(l); + *copy_mapping->add_edit() = move(*edit); + } + if (from_left) { + mapping->mutable_edit()->erase(mapping->mutable_edit()->begin(), + mapping->mutable_edit()->begin() + num_edits); + *copy_pos = *pos; + pos->set_offset(pos->offset() + mapping_from_length(*copy_mapping)); + } + else { + // the edits were added in reverse order, switch them back + reverse(copy_mapping->mutable_edit()->begin(), copy_mapping->mutable_edit()->end()); + + mapping->mutable_edit()->resize(mapping->edit_size() - num_edits); + copy_pos->set_node_id(pos->node_id()); + copy_pos->set_is_reverse(pos->is_reverse()); + copy_pos->set_offset(pos->offset() + mapping_from_length(*copy_mapping)); + } + } + + string::const_iterator alignment_begin; + if (from_left) { + path_1.mutable_mapping()->erase(path_1.mutable_mapping()->begin(), + path_1.mutable_mapping()->begin() + num_mappings); + alignment_begin = begin; + } + else { + // the mappings were added in reverse order, switch them back + reverse(return_val.first.mutable_mapping()->begin(), return_val.first.mutable_mapping()->end()); + + path_1.mutable_mapping()->resize(path_1.mapping_size() - num_mappings); + alignment_begin = begin + path_to_length(path_1); + } + + // score the zipped alignment + return_val.second = aligner->score_partial_alignment(alignment, align_graph, return_val.first, alignment_begin); + + // remove this part of the score from the alt alignments + for (auto& alt_aln : alt_alns) { + alt_aln.second -= return_val.second; + } + } + + return return_val; + } void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, @@ -4159,6 +4398,7 @@ namespace vg { #ifdef debug_multipath_alignment cerr << "transferred over read information" << endl; #endif + // TODO: could I get away with moving the paths instead of copying them? // add a subpath for each of the exact match nodes if (score_anchors_as_matches) { @@ -4179,12 +4419,7 @@ namespace vg { PathNode& path_node = path_nodes.at(j); subpath_t* subpath = multipath_aln_out.add_subpath(); *subpath->mutable_path() = path_node.path; - - // TODO: should not leave it like this - Path proto_path; - to_proto_path(path_node.path, proto_path); - - subpath->set_score(aligner->score_partial_alignment(alignment, align_graph, proto_path, path_node.begin)); + subpath->set_score(aligner->score_partial_alignment(alignment, align_graph, path_node.path, path_node.begin)); } } @@ -4483,44 +4718,159 @@ namespace vg { auto deduplicated = deduplicate_alt_alns(alt_alignments, false, true); PathNode& path_node = path_nodes.at(j); + pos_t end_pos = final_position(path_node.path); - subpath_t* sink_subpath = multipath_aln_out.mutable_subpath(j); + size_t sink_idx = j; + auto sink_subpath = multipath_aln_out.mutable_subpath(j); - const path_mapping_t& final_mapping = path_node.path.mapping(path_node.path.mapping_size() - 1); + // zip together identical prefixes and suffixes of the tail alignment + pair left_zip_aln, right_zip_aln; + if (deduplicated.size() > 1) { + right_zip_aln = zip_alignments(deduplicated, false, alignment, align_graph, path_node.end, aligner); + left_zip_aln = zip_alignments(deduplicated, true, alignment, align_graph, path_node.end, aligner); + } - pos_t end_pos = final_position(path_node.path); + if (!left_zip_aln.first.mapping().empty()) { + // we were able to zip a prefix of the tails together + + sink_subpath->add_next(multipath_aln_out.subpath_size()); + sink_idx = multipath_aln_out.subpath_size(); + + sink_subpath = multipath_aln_out.add_subpath(); + *sink_subpath->mutable_path() = move(left_zip_aln.first); + sink_subpath->set_score(left_zip_aln.second); + + auto first_mapping = sink_subpath->mutable_path()->mutable_mapping(0); + if (first_mapping->position().node_id() == id(end_pos)) { + first_mapping->mutable_position()->set_offset(offset(end_pos)); + } + + // the zipped alignment is now the sink + end_pos = final_position(sink_subpath->path()); +#ifdef debug_multipath_alignment + cerr << "left zipped alignment from " << j << " to right tail:" << endl; + cerr << debug_string(*sink_subpath) << endl; +#endif + } + // add in the + bool zip_to_zip_connection = false; for (auto& tail_alignment : deduplicated) { + if (tail_alignment.first.mapping().empty()) { + // it's possible that an entire tail get's swallowed up in zipping + zip_to_zip_connection = true; + continue; + } + sink_subpath->add_next(multipath_aln_out.subpath_size()); - subpath_t* tail_subpath = multipath_aln_out.add_subpath(); + auto tail_subpath = multipath_aln_out.add_subpath(); *tail_subpath->mutable_path() = move(tail_alignment.first); tail_subpath->set_score(tail_alignment.second); // get the pointer again in case the vector reallocated - sink_subpath = multipath_aln_out.mutable_subpath(j); + sink_subpath = multipath_aln_out.mutable_subpath(sink_idx); - path_mapping_t* first_mapping = tail_subpath->mutable_path()->mutable_mapping(0); + auto first_mapping = tail_subpath->mutable_path()->mutable_mapping(0); - if (first_mapping->position().node_id() == final_mapping.position().node_id()) { + if (first_mapping->position().node_id() == id(end_pos)) { first_mapping->mutable_position()->set_offset(offset(end_pos)); } - else if (tail_subpath->path().mapping_size() == 1 && first_mapping->edit_size() == 1 - && first_mapping->edit(0).from_length() == 0 && first_mapping->edit(0).to_length() > 0 - && first_mapping->position().node_id() != final_mapping.position().node_id()) { + else if (right_zip_aln.first.mapping().empty() + && tail_subpath->path().mapping_size() == 1 && first_mapping->edit_size() == 1 + && first_mapping->edit(0).from_length() == 0 && first_mapping->edit(0).to_length() != 0 + && first_mapping->position().node_id() != id(end_pos)) { // this is a pure soft-clip on the beginning of the next node, we'll move it to the end // of the match node to match invariants expected by other parts of the code base position_t* pos = first_mapping->mutable_position(); - pos->set_node_id(final_mapping.position().node_id()); - pos->set_is_reverse(final_mapping.position().is_reverse()); - pos->set_offset(final_mapping.position().offset() + mapping_from_length(final_mapping)); + pos->set_node_id(id(end_pos)); + pos->set_is_reverse(is_rev(end_pos)); + pos->set_offset(offset(end_pos)); } #ifdef debug_multipath_alignment - cerr << "subpath from " << j << " to right tail:" << endl; + cerr << "subpath from " << sink_idx << " to right tail:" << endl; cerr << debug_string(*tail_subpath) << endl; #endif } + + if (!right_zip_aln.first.mapping().empty()) { + if (right_zip_aln.first.mapping_size() == 1 + && right_zip_aln.first.mapping(0).edit_size() == 1 + && right_zip_aln.first.mapping(0).edit(0).from_length() == 0 + && right_zip_aln.first.mapping(0).edit(0).to_length() != 0) { + + // this subpath is a soft-clip, to match the expectations of other code we need to + // relocate this edit onto the other subpaths (more complicated than in the tail + // alignment because there might be multiple previous nodes) + + auto edit = right_zip_aln.first.mutable_mapping(0)->mutable_edit(0); + + // add an edit to the tails + for (size_t i = sink_idx + 1; i < multipath_aln_out.subpath_size(); ++i) { + *multipath_aln_out.mutable_subpath(i)->mutable_path()->mutable_mapping()->back().add_edit() = *edit; + } + + if (zip_to_zip_connection) { + // we can't add this edit to the left zipped alignment because it has sucessor + // subpaths, need to add it as a separate subpath + sink_subpath->add_next(multipath_aln_out.subpath_size()); + + auto tail_subpath = multipath_aln_out.add_subpath(); + auto mapping = tail_subpath->mutable_path()->add_mapping(); + *mapping->add_edit() = move(*edit); + + auto pos = mapping->mutable_position(); + pos->set_node_id(id(end_pos)); + pos->set_is_reverse(is_rev(end_pos)); + pos->set_offset(offset(end_pos)); + sink_subpath->set_score(0); + +#ifdef debug_multipath_alignment + cerr << "right zipped softclip subpath from " << sink_idx << " to right tail:" << endl; + cerr << debug_string(*tail_subpath) << endl; +#endif + } + + // there's no sink anymore, it's a frayed end + sink_subpath = nullptr; + } + else { + // add edges from all of the alt alns to the zipped tail + for (size_t i = sink_idx + 1; i < multipath_aln_out.subpath_size(); ++i) { + multipath_aln_out.mutable_subpath(i)->add_next(multipath_aln_out.subpath_size()); + } + if (zip_to_zip_connection) { + multipath_aln_out.mutable_subpath(sink_idx)->add_next(multipath_aln_out.subpath_size()); + } + + // an arbitrary choice of a predecessor (this will only come into play + // if the previous paths end mid-node, in which case all of them must + // end on that node) + end_pos = final_position(multipath_aln_out.subpath().back().path()); + + auto tail_subpath = multipath_aln_out.add_subpath(); + *tail_subpath->mutable_path() = move(right_zip_aln.first); + tail_subpath->set_score(right_zip_aln.second); + + auto first_mapping = tail_subpath->mutable_path()->mutable_mapping(0); + if (first_mapping->position().node_id() == id(end_pos)) { + first_mapping->mutable_position()->set_offset(offset(end_pos)); + } + // note: don't need to check for softclips because that is handled separately above + + // the zipped end is now the sink + sink_subpath = tail_subpath; +#ifdef debug_multipath_alignment + cerr << "right zipped alignment from " << sink_idx << " to right tail:" << endl; + cerr << debug_string(*tail_subpath) << endl; +#endif + } + } + else { + // there's no sink anymore + sink_subpath = nullptr; + } } // Now handle the left tails. @@ -4534,27 +4884,125 @@ namespace vg { vector& alt_alignments = tail_alignments[false][j]; // remove alignments with the same path auto deduplicated = deduplicate_alt_alns(alt_alignments, true, false); + + pair left_zip_aln, right_zip_aln; + if (deduplicated.size() > 1) { + right_zip_aln = zip_alignments(deduplicated, false, alignment, align_graph, path_node.end, aligner); + left_zip_aln = zip_alignments(deduplicated, true, alignment, align_graph, path_node.end, aligner); + } - const path_mapping_t& first_mapping = path_node.path.mapping(0); + size_t source_idx = j; + auto first_mapping = &path_node.path.mapping().front(); + + if (!right_zip_aln.first.mapping().empty()) { + // we were able to zip a prefix of the tails together + + source_idx = multipath_aln_out.subpath_size(); + + auto zip_subpath = multipath_aln_out.add_subpath(); + *zip_subpath->mutable_path() = move(right_zip_aln.first); + zip_subpath->set_score(right_zip_aln.second); + + first_mapping = &zip_subpath->path().mapping().front(); + +#ifdef debug_multipath_alignment + cerr << "right zipped alignment from " << j << " to left tail:" << endl; + cerr << debug_string(*zip_subpath) << endl; +#endif + } + + bool zip_to_zip_connection = false; for (auto& tail_alignment : deduplicated) { - subpath_t* tail_subpath = multipath_aln_out.add_subpath(); + if (tail_alignment.first.mapping().empty()) { + // it's possible that an entire tail get's swallowed up in zipping + zip_to_zip_connection = true; + continue; + } + + if (!left_zip_aln.first.mapping().empty()) { + // the left part of these alignments hasn't been zipped, so this + // is a start + multipath_aln_out.add_start(multipath_aln_out.subpath_size()); + } + + auto tail_subpath = multipath_aln_out.add_subpath(); *tail_subpath->mutable_path() = move(tail_alignment.first); tail_subpath->set_score(tail_alignment.second); - tail_subpath->add_next(j); - multipath_aln_out.add_start(multipath_aln_out.subpath_size() - 1); + tail_subpath->add_next(source_idx); #ifdef debug_multipath_alignment - cerr << "subpath from " << j << " to left tail:" << endl; + cerr << "subpath from " << source_idx << " to left tail:" << endl; cerr << debug_string(*tail_subpath) << endl; #endif path_mapping_t* final_mapping = tail_subpath->mutable_path()->mutable_mapping(tail_subpath->path().mapping_size() - 1); if (tail_subpath->path().mapping_size() == 1 && final_mapping->edit_size() == 1 && final_mapping->edit(0).from_length() == 0 && final_mapping->edit(0).to_length() > 0 - && final_mapping->position().node_id() != first_mapping.position().node_id()) { + && final_mapping->position().node_id() != first_mapping->position().node_id()) { // this is a pure soft clip on the end of the previous node, so we move it over to the // beginning of the match node to match invariants in rest of code base - *final_mapping->mutable_position() = first_mapping.position(); + *final_mapping->mutable_position() = first_mapping->position(); + } + } + + if (!left_zip_aln.first.mapping().empty()) { + if (left_zip_aln.first.mapping_size() == 1 + && left_zip_aln.first.mapping(0).edit_size() == 1 + && left_zip_aln.first.mapping(0).edit(0).from_length() == 0 + && left_zip_aln.first.mapping(0).edit(0).to_length() != 0) { + + // this is a pure soft clip, we need to attach it to the adjacent nodes + // to maintain expected invariants (because they might start on different + // nodes) + + auto edit = &left_zip_aln.first.mutable_mapping()->front().mutable_edit()->front(); + + // add edit to subpaths and add starts + for (size_t i = source_idx + 1; i < multipath_aln_out.subpath_size(); ++i) { + auto edits = multipath_aln_out.mutable_subpath(i)->mutable_path()->mutable_mapping()->front().mutable_edit(); + edits->insert(edits->begin(), *edit); + multipath_aln_out.add_start(i); + } + + if (zip_to_zip_connection) { + // add subpath for soft clip (can't be added to existing path because it + // has sucessor subpaths, need to add it as a separate subpath + + multipath_aln_out.add_start(multipath_aln_out.subpath_size()); + + auto subpath = multipath_aln_out.add_subpath(); + subpath->add_next(source_idx); + + auto mapping = subpath->mutable_path()->add_mapping(); + *mapping->add_edit() = move(*edit); + + *mapping->mutable_position() = multipath_aln_out.subpath(source_idx).path().mapping().front().position(); + subpath->set_score(0); +#ifdef debug_multipath_alignment + cerr << "left zipped soft clip subpath from " << source_idx << " to left tail:" << endl; + cerr << debug_string(*subpath) << endl; +#endif + } + } + else { + + multipath_aln_out.add_start(multipath_aln_out.subpath_size()); + + auto tail_subpath = multipath_aln_out.add_subpath(); + *tail_subpath->mutable_path() = move(left_zip_aln.first); + tail_subpath->set_score(left_zip_aln.second); + + for (size_t i = source_idx + 1; i + 1 < multipath_aln_out.subpath_size(); ++i) { + tail_subpath->add_next(i); + } + + if (zip_to_zip_connection) { + tail_subpath->add_next(source_idx); + } +#ifdef debug_multipath_alignment + cerr << "left zipped subpath from " << source_idx << " to left tail:" << endl; + cerr << debug_string(*tail_subpath) << endl; +#endif } } } diff --git a/src/multipath_alignment_graph.hpp b/src/multipath_alignment_graph.hpp index d044afb24c3..81c95c0144e 100644 --- a/src/multipath_alignment_graph.hpp +++ b/src/multipath_alignment_graph.hpp @@ -305,6 +305,12 @@ namespace vg { size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t min_paths, unordered_set* sources = nullptr); + /// If a list of aligned subsequences are identifical in a prefix/suffix, remove that + /// prefix/suffix from all of the alignments and return it as a separate alignment. + /// If there is no shared prefix/suffix, returns an empty path with 0 score. + static pair zip_alignments(vector>& alt_alns, bool from_left, + const Alignment& alignment, const HandleGraph& align_graph, + string::const_iterator begin, const GSSWAligner* aligner); /// Memo for the transcendental pessimistic tail gap function (thread local to maintain thread-safety) static thread_local unordered_map> pessimistic_tail_gap_memo; diff --git a/src/position.cpp b/src/position.cpp index a2d530044bb..37cf737a4ca 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -75,4 +75,10 @@ string debug_string(const position_t& pos) { return to_return; } +void from_proto_position(const Position& from, position_t& to) { + to.set_node_id(from.node_id()); + to.set_offset(from.offset()); + to.set_is_reverse(from.is_reverse()); +} + } diff --git a/src/position.hpp b/src/position.hpp index 6df9d625bad..ec4953ab6d4 100644 --- a/src/position.hpp +++ b/src/position.hpp @@ -61,7 +61,7 @@ pair min_oriented_distances(const unordered_map > >& path_offsets2); string debug_string(const position_t& pos); - +void from_proto_position(const Position& from, position_t& to); /* * position_t diff --git a/src/splicing.cpp b/src/splicing.cpp index 6010ecc7dc8..3f6606b1470 100644 --- a/src/splicing.cpp +++ b/src/splicing.cpp @@ -623,9 +623,7 @@ multipath_alignment_t from_hit(const Alignment& alignment, const HandleGraph& gr edit->set_sequence(string(mem.end, alignment.sequence().end())); } } - Path proto_path; - to_proto_path(*path, proto_path); - subpath->set_score(scorer.score_partial_alignment(alignment, graph, proto_path, + subpath->set_score(scorer.score_partial_alignment(alignment, graph, *path, alignment.sequence().begin())); identify_start_subpaths(multipath_aln); @@ -640,7 +638,7 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo #endif const Path& path = aln.path(); - Path dummy_path; + path_t dummy_path; tuple return_val; @@ -663,7 +661,7 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo cerr << "after mapping " << i << ", remaining length " << len << endl; #endif - *dummy_path.add_mapping() = path.mapping(i); + from_proto_mapping(path.mapping(i), *dummy_path.add_mapping()); --i; } if (i < 0) { @@ -672,14 +670,15 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo #endif get<0>(return_val) = initial_position(path); get<1>(return_val) = path_to_length(path); - dummy_path = path; + dummy_path.clear_mapping(); + from_proto_path(path, dummy_path); copied_full_path = true; } else { const Mapping& mapping = path.mapping(i); int64_t j = mapping.edit_size() - 1; int64_t from_length = 0; - Mapping* dummy_mapping = nullptr; + path_mapping_t* dummy_mapping = nullptr; while (j >= 0 && (len > mapping.edit(j).to_length() || mapping.edit(j).from_length() == 0)) { auto to_length = mapping.edit(j).to_length(); @@ -693,7 +692,7 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo if (!dummy_mapping) { dummy_mapping = dummy_path.add_mapping(); } - *dummy_mapping->add_edit() = mapping.edit(j); + from_proto_edit(mapping.edit(j), *dummy_mapping->add_edit()); --j; } if (j >= 0 && len > 0) { @@ -708,7 +707,7 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo if (!dummy_mapping) { dummy_mapping = dummy_path.add_mapping(); } - Edit* dummy_edit = dummy_mapping->add_edit(); + auto* dummy_edit = dummy_mapping->add_edit(); dummy_edit->set_from_length(last_from_length); dummy_edit->set_to_length(len); if (!mapping.edit(j).sequence().empty()) { @@ -720,7 +719,7 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo get_is_rev(get<0>(return_val)) = position.is_reverse(); get_offset(get<0>(return_val)) = position.offset() + mapping_from_length(mapping) - from_length; if (dummy_mapping) { - *dummy_mapping->mutable_position() = make_position(get<0>(return_val)); + from_proto_position(mapping.position(), *dummy_mapping->mutable_position()); } } } @@ -741,7 +740,7 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo cerr << "after mapping " << i << ", remaining length " << len << endl; #endif - *dummy_path.add_mapping() = path.mapping(i); + from_proto_mapping(path.mapping(i), *dummy_path.add_mapping()); ++i; } if (i == path.mapping_size()) { @@ -750,14 +749,15 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo #endif get<0>(return_val) = final_position(path); get<1>(return_val) = path_to_length(path); - dummy_path = path; + dummy_path.clear_mapping(); + from_proto_path(path, dummy_path); copied_full_path = true; } else { const Mapping& mapping = path.mapping(i); int64_t j = 0; int64_t from_length = 0; - Mapping* dummy_mapping = nullptr; + path_mapping_t* dummy_mapping = nullptr; while (j < mapping.edit_size() && (len > mapping.edit(j).to_length() || mapping.edit(j).from_length() == 0)) { auto to_length = mapping.edit(j).to_length(); @@ -771,9 +771,9 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo if (!dummy_mapping) { dummy_mapping = dummy_path.add_mapping(); - *dummy_mapping->mutable_position() = mapping.position(); + from_proto_position(mapping.position(), *dummy_mapping->mutable_position()); } - *dummy_mapping->add_edit() = mapping.edit(j); + from_proto_edit(mapping.edit(j), *dummy_mapping->add_edit()); ++j; } if (j != mapping.edit_size() && len > 0) { @@ -787,9 +787,9 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo if (!dummy_mapping) { dummy_mapping = dummy_path.add_mapping(); - *dummy_mapping->mutable_position() = mapping.position(); + from_proto_position(mapping.position(), *dummy_mapping->mutable_position()); } - Edit* dummy_edit = dummy_mapping->add_edit(); + auto* dummy_edit = dummy_mapping->add_edit(); dummy_edit->set_from_length(last_from_length); dummy_edit->set_to_length(len); if (!mapping.edit(j).sequence().empty()) { @@ -815,7 +815,7 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo *dummy_path.mutable_mapping(dummy_path.mapping_size() - i - 1)); } // the final mapping was also built in reverse - Mapping* mapping = dummy_path.mutable_mapping(0); + auto* mapping = dummy_path.mutable_mapping(0); for (size_t i = 0, end = mapping->edit_size() / 2; i < end; ++i) { swap(*mapping->mutable_edit(i), *mapping->mutable_edit(mapping->edit_size() - i - 1)); @@ -919,13 +919,13 @@ bool trim_path(path_t* path, bool from_left, int64_t mapping_idx, int64_t edit_i return do_trim; } -pair, pair> split_splice_segment(const Alignment& splice_segment, - const tuple& left_trace, - const tuple& right_trace, - int64_t splice_junction_idx, - const GSSWAligner& scorer, - const HandleGraph& graph) { - +pair, pair> split_splice_segment(const Alignment& splice_segment, + const tuple& left_trace, + const tuple& right_trace, + int64_t splice_junction_idx, + const GSSWAligner& scorer, + const HandleGraph& graph) { + #ifdef debug_linker_split cerr << "splitting splice segment " << pb2json(splice_segment) << endl; cerr << "split is at index " << splice_junction_idx << endl; @@ -935,7 +935,7 @@ pair, pair> split_splice_segment(const Alignm // TODO: make the scoring robust to hanging indels at the trace positions - pair, pair> return_val; + pair, pair> return_val; auto& left_path = return_val.first.first; // walk the part of the splice segment before the trace on the left side @@ -946,7 +946,7 @@ pair, pair> split_splice_segment(const Alignm } // special logic to handle the mapping with the traced location if (get<0>(left_trace) < splice_junction_idx) { - Mapping* post_trace_mapping = nullptr; + path_mapping_t* post_trace_mapping = nullptr; size_t trace_leading_from_length = 0; const auto& trace_mapping = splice_segment.path().mapping(get<0>(left_trace)); for (int64_t j = 0; j < get<1>(left_trace); ++j) { @@ -976,7 +976,7 @@ pair, pair> split_splice_segment(const Alignm if (!post_trace_mapping) { post_trace_mapping = left_path.add_mapping(); } - *post_trace_mapping->add_edit() = trace_mapping.edit(j); + from_proto_edit(trace_mapping.edit(j), *post_trace_mapping->add_edit()); } if (post_trace_mapping) { const auto& trace_pos = trace_mapping.position(); @@ -994,11 +994,11 @@ pair, pair> split_splice_segment(const Alignm continue; } auto left_mapping = left_path.add_mapping(); - *left_mapping->mutable_position() = mapping.position(); + from_proto_position(mapping.position(), *left_mapping->mutable_position()); for (size_t j = 0; j < mapping.edit_size(); ++j) { const auto& edit = mapping.edit(j); if (edit.from_length() != 0 || edit.to_length() != 0) { - *left_mapping->add_edit() = edit; + from_proto_edit(edit, *left_mapping->add_edit()); left_to_length += edit.to_length(); } } @@ -1012,11 +1012,11 @@ pair, pair> split_splice_segment(const Alignm continue; } auto right_mapping = right_path.add_mapping(); - *right_mapping->mutable_position() = mapping.position(); + from_proto_position(mapping.position(), *right_mapping->mutable_position()); for (size_t j = 0; j < mapping.edit_size(); ++j) { const auto& edit = mapping.edit(j); if (edit.from_length() != 0 || edit.to_length() != 0) { - *right_mapping->add_edit() = edit; + from_proto_edit(edit, *right_mapping->add_edit()); } } } @@ -1027,9 +1027,9 @@ pair, pair> split_splice_segment(const Alignm (get<1>(right_trace) != 0 || get<2>(right_trace) != 0)) { auto pre_trace_mapping = right_path.add_mapping(); const auto& trace_mapping = splice_segment.path().mapping(get<0>(right_trace)); - *pre_trace_mapping->mutable_position() = trace_mapping.position(); + from_proto_position(trace_mapping.position(), *pre_trace_mapping->mutable_position()); for (int64_t j = 0; j < get<1>(right_trace); ++j) { - *pre_trace_mapping->add_edit() = trace_mapping.edit(j); + from_proto_edit(trace_mapping.edit(j), *pre_trace_mapping->add_edit()); } if (get<1>(right_trace) < trace_mapping.edit_size() && get<2>(right_trace) != 0) { const auto& trace_edit = trace_mapping.edit(get<1>(right_trace)); @@ -1287,9 +1287,7 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, auto path = left_mp_aln.mutable_subpath(i)->mutable_path(); bool trimmed = trim_path(path, false, m, e, b); if (trimmed) { - Path proto_path; - to_proto_path(*path, proto_path); - int32_t new_score = scorer.score_partial_alignment(alignment, graph, proto_path, + int32_t new_score = scorer.score_partial_alignment(alignment, graph, *path, alignment.sequence().begin() + left_to_length[i]); left_mp_aln.mutable_subpath(i)->set_score(new_score); } @@ -1366,7 +1364,7 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, auto subpath = left_mp_aln.add_subpath(); subpath->set_score(splice_segment_halves.first.second); - from_proto_path(splice_segment_halves.first.first, *subpath->mutable_path()); + *subpath->mutable_path() = move(splice_segment_halves.first.first); } if (splice_segment_halves.second.first.mapping_size() != 0) { @@ -1388,7 +1386,7 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, auto subpath = left_mp_aln.add_subpath(); subpath->set_score(splice_segment_halves.second.second); - from_proto_path(splice_segment_halves.second.first, *subpath->mutable_path()); + *subpath->mutable_path() = move(splice_segment_halves.second.first); } #ifdef debug_fusing @@ -1458,9 +1456,7 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, bool trimmed = trim_path(path, true, m, e, b); if (trimmed) { int64_t new_to_len = path_to_length(*path); - Path proto_path; - to_proto_path(*path, proto_path); - int32_t new_score = scorer.score_partial_alignment(alignment, graph, proto_path, + int32_t new_score = scorer.score_partial_alignment(alignment, graph, *path, (alignment.sequence().begin() + right_to_length[i] + to_len - new_to_len)); right_mp_aln.mutable_subpath(i)->set_score(new_score); From 97736bf11fbd7914ce2a9ba2aee7abdd72ef4b76 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Sun, 15 Aug 2021 11:29:04 -0400 Subject: [PATCH 2/2] debug tail zipping algorithm --- src/multipath_alignment_graph.cpp | 244 ++++++++++++++++++++++++------ src/multipath_mapper.cpp | 2 +- src/splicing.cpp | 27 ++-- src/subcommand/mpmap_main.cpp | 8 +- 4 files changed, 219 insertions(+), 62 deletions(-) diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 48f49eb1bb0..efd229e7f3e 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -4148,6 +4148,13 @@ namespace vg { pair MultipathAlignmentGraph::zip_alignments(vector>& alt_alns, bool from_left, const Alignment& alignment, const HandleGraph& align_graph, string::const_iterator begin, const GSSWAligner* aligner) { +#ifdef debug_multipath_alignment + cerr << "attempting to zip alignments from " << (from_left ? "left" : "right") << endl; + for (auto& alt_aln : alt_alns) { + cerr << debug_string(alt_aln.first) << endl; + } +#endif + pair return_val; int64_t i, j, incr; @@ -4155,10 +4162,17 @@ namespace vg { if (from_left) { incr = 1; i = 0; + j = 0; } else { incr = -1; i = path_1.mapping_size() - 1; + if (i >= 0) { + j = path_1.mapping(i).edit_size() - 1; + } + else { + j = 0; + } } // past-the-last index of the longest match that concludes in a match/mismatvch int64_t last_aligned_i, last_aligned_j; @@ -4166,34 +4180,68 @@ namespace vg { // walk the first path to check for full prefix/suffix matches bool found_mismatch = false, in_indel = false; while (i >= 0 && i < path_1.mapping_size()) { + // check to make sure the positions of all of the alts match at this mapping const auto& mapping_1 = path_1.mapping(i); - // start from the appropriate side of the mapping + // start from the appropriate side of the mapping and check for matches of indels if (from_left) { j = 0; } else { j = mapping_1.edit_size() - 1; } + const auto& pos_1 = mapping_1.position(); + auto offset_1 = from_left ? pos_1.offset() : pos_1.offset() + mapping_from_length(mapping_1); + for (size_t k = 1; k < alt_alns.size(); ++k) { + //cerr << "checking for matching position on aln " << k << endl; + const auto& path_2 = alt_alns[k].first; + int64_t i2; + if (from_left) { + i2 = i; + } + else { + i2 = path_2.mapping_size() - path_1.mapping_size() + i; + } + if (i2 < 0 || i2 >= path_2.mapping_size()) { + //cerr << "no corresponding mapping in alt aln " << k << endl; + found_mismatch = true; + break; + } + const auto& mapping_2 = path_2.mapping(i2); + const auto& pos_2 = mapping_2.position(); + if (pos_1.node_id() != pos_2.node_id() + || pos_2.is_reverse() != pos_2.is_reverse() + || (from_left ? pos_2.offset() : pos_2.offset() + mapping_from_length(mapping_2)) != offset_1) { + //cerr << "positions mismatch" << endl; + found_mismatch = true; + break; + } + } + if (found_mismatch) { + break; + } while(j >= 0 && j < path_1.mapping(i).edit_size()) { const auto& edit_1 = mapping_1.edit(j); + //cerr << "checking for matches to edit " << i << " " << j << ": " << debug_string(edit_1) << endl; if (!in_indel) { // we've matched up to here and the most recent match wasn't an indel last_aligned_i = i; last_aligned_j = j; } // check whether all of the other paths match the first path at the next edit + bool found_unmatchable = false; for (size_t k = 1; k < alt_alns.size() && !found_mismatch; ++k) { auto& path_2 = alt_alns[k].first; // find the corresponding mapping index int64_t i2; if (from_left) { - i2 = 0; + i2 = i; } else { i2 = path_2.mapping_size() - path_1.mapping_size() + i; } if (i2 < 0 || i2 >= path_2.mapping_size()) { + //cerr << "no corresponding mapping in alt aln " << k << endl; found_mismatch = true; break; } @@ -4201,17 +4249,24 @@ namespace vg { // find the corresponding edit index int64_t j2; if (from_left) { - j2 = 0; + j2 = j; } else { j2 = mapping_2.edit_size() - mapping_1.edit_size() + j; } if (j2 < 0 || j2 >= mapping_2.edit_size()) { + //cerr << "no corresponding edit in alt aln " << k << endl; found_mismatch = true; break; } const auto& edit_2 = mapping_2.edit(j2); + //cerr << "comparing to alt aln " << k << " edit " << i2 << " " << j2 << ": " << debug_string(edit_2) << endl; found_mismatch = (edit_1 != edit_2); + if ((from_left && j + 1 == mapping_1.edit_size()) || (!from_left && j == 0)) { + // check if there will be edits in this mapping that we won't hit by using the indexes + // of edits on path 1 + found_unmatchable = found_unmatchable || (mapping_1.edit_size() != mapping_2.edit_size()); + } } if (found_mismatch) { break; @@ -4219,6 +4274,10 @@ namespace vg { else { j += incr; in_indel = (edit_1.from_length() == 0 || edit_1.to_length() == 0); + if (found_unmatchable) { + found_mismatch = true; + break; + } } } if (found_mismatch) { @@ -4226,10 +4285,17 @@ namespace vg { } else { i += incr; + if (from_left || i < 0) { + j = 0; + } + else { + j = path_1.mapping(i).edit_size() - 1; + } } } if (in_indel) { + //cerr << "last match is in an indel" << endl; // the match concluded in an insertion or deletion, we need to make sure it doesn't // continue onto the next edit bool all_nexts_aligned = true; @@ -4259,49 +4325,121 @@ namespace vg { } } + if ((from_left && (i != 0 || j != 0)) || (!from_left && (i != path_1.mapping_size() - 1 || j != path_1.mapping().back().edit_size() - 1))) { - + //cerr << "matched up to " << i << " " << j << endl; // we matched part of the path across all alternate alignments, we can zip it up // delete the corresponding parts from all the other paths for (int64_t k = 1; k < alt_alns.size(); ++k) { + //cerr << "deleting from aln " << k << endl; auto& path_2 = alt_alns[k].first; if (from_left) { // we have to remove part of the path from the beginning of the existing paths - int64_t num_full_mappings = i; - if (i < path_2.mapping_size()) { - auto mapping = path_2.mutable_mapping(i); - if (j < mapping->edit_size()) { - // we have to remove part of the mapping - int64_t from_len = 0; - for (int64_t l = 0; l < j; ++l) { - from_len += mapping->edit(l).from_length(); - } - mapping->mutable_edit()->erase(mapping->mutable_edit()->begin(), - mapping->mutable_edit()->begin() + j); - mapping->mutable_position()->set_offset(mapping->position().offset() + from_len); + int64_t num_mappings, num_edits; + if (j == 0) { + if (path_1.mapping(i - 1).edit_size() == path_2.mapping(i - 1).edit_size()) { + // this is the past-the-last on the mapping + num_mappings = i; + num_edits = 0; + } + else { + // the full mapping on path 1 didn't use up the final mapping on path 2 + num_mappings = i - 1; + num_edits = path_1.mapping(i - 1).edit_size(); + } + } + else { + if (i < path_2.mapping_size() && j == path_2.mapping(i).edit_size()) { + // past-the-last beyond the end of this mapping + num_mappings = i + 1; + num_edits = 0; } else { - // we can actually remove the whole mapping - ++num_full_mappings; + // past-the-last in the middle of this mapping + num_mappings = i; + num_edits = j; + } + } + //cerr << "deleting " << num_mappings << " mappings and " << num_edits << " from the left" << endl; + + if (num_mappings < path_2.mapping_size() && num_edits != 0) { + // we have to remove part of the mapping + auto mapping = path_2.mutable_mapping(i); + int64_t from_len = 0; + for (int64_t l = 0; l < num_edits; ++l) { + from_len += mapping->edit(l).from_length(); } + mapping->mutable_edit()->erase(mapping->mutable_edit()->begin(), + mapping->mutable_edit()->begin() + num_edits); + mapping->mutable_position()->set_offset(mapping->position().offset() + from_len); } // remove any full mappings that are shared path_2.mutable_mapping()->erase(path_2.mutable_mapping()->begin(), - path_2.mutable_mapping()->begin() + i); + path_2.mutable_mapping()->begin() + num_mappings); } else { - // we remove from the back (easier, but need to translate indexes) + // we remove from the back, more complicated because of needing to translate indexes + int64_t i2 = path_2.mapping_size() - path_1.mapping_size() + i; - path_2.mutable_mapping()->resize(i2 + 1); - const auto& mapping_1 = path_1.mapping(i); - auto mapping_2 = path_2.mutable_mapping(i2); - int64_t j2 = mapping_2->edit_size() - mapping_1.edit_size() + j; - mapping_2->mutable_edit()->resize(j2 + 1); - if (mapping_2->edit().empty()) { - // we actually removed the whole final mapping, get rid of it - path_2.mutable_mapping()->pop_back(); + int64_t num_mappings, num_edits; + if (i2 < 0) { + // we delete all of path 2 + num_mappings = path_2.mapping_size(); + num_edits = 0; + } + else if (i < 0) { + // we ran through path 1 but not path 2 + if (path_1.mapping().front().edit_size() == path_2.mapping(i2 + 1).edit_size()) { + // we ran through the full mapping on path 2 + num_mappings = path_1.mapping_size(); + num_edits = 0; + } + else { + // we didn't run through the entire mapping on path 2 + num_mappings = path_1.mapping_size() - 1; + num_edits = path_1.mapping().front().edit_size(); + } + } + else { + // both of the ending mappings can be indexed + const auto& mapping_1 = path_1.mapping(i); + if (j == mapping_1.edit_size() - 1) { + // mismatch at the beginning of a mapping on path 1 + if (path_1.mapping(i + 1).edit_size() == path_2.mapping(i2 + 1).edit_size()) { + // the previous mappings on both paths end at the same edit + num_mappings = path_1.mapping_size() - i - 1; + num_edits = 0; + } + else { + // we didn't run through the entire mapping on path 2 + num_mappings = path_1.mapping_size() - i - 2; + num_edits = path_1.mapping(i + 1).edit_size(); + } + } + else { + // the mismatch happened after at least one edit in this mapping on path 1 + int64_t j2 = path_2.mapping(i2).edit_size() - mapping_1.edit_size() + j; + if (j2 < 0) { + // past-the-last beyond the end of this mapping + num_mappings = path_1.mapping_size() - i; + num_edits = 0; + } + else { + // past-the-last in the middle of this mapping + num_mappings = path_1.mapping_size() - i - 1; + num_edits = mapping_1.edit_size() - j - 1; + } + } + } + //cerr << "deleting " << num_mappings << " mappings and " << num_edits << " from the right" << endl; + + // delete the number of mappings and edits we decided + path_2.mutable_mapping()->resize(path_2.mapping_size() - num_mappings); + if (num_edits) { + auto& mapping_2 = path_2.mutable_mapping()->back(); + mapping_2.mutable_edit()->resize(mapping_2.edit_size() - num_edits); } } } @@ -4328,17 +4466,21 @@ namespace vg { num_edits = path_1.mapping(i).edit_size() - j - 1; } } + //cerr << "need to copy " << num_mappings << " full mappings and " << num_edits << " edits" << endl; // copy to the zipped alignment for (int64_t m = 0; m < num_mappings; k += incr, ++m) { + //cerr << "copy mapping " << debug_string(*path_1.mutable_mapping(k)) << endl; *return_val.first.add_mapping() = move(*path_1.mutable_mapping(k)); } if (num_edits != 0) { auto mapping = path_1.mutable_mapping(k); + int64_t mapping_len = mapping_from_length(*mapping); auto pos = mapping->mutable_position(); auto copy_mapping = return_val.first.add_mapping(); auto copy_pos = copy_mapping->mutable_position(); for (int64_t l = from_left ? 0 : mapping->edit_size() - 1, e = 0; e < num_edits; l += incr, ++e) { auto edit = mapping->mutable_edit(l); + //cerr << "copy edit " << debug_string(*edit) << endl; *copy_mapping->add_edit() = move(*edit); } if (from_left) { @@ -4354,7 +4496,7 @@ namespace vg { mapping->mutable_edit()->resize(mapping->edit_size() - num_edits); copy_pos->set_node_id(pos->node_id()); copy_pos->set_is_reverse(pos->is_reverse()); - copy_pos->set_offset(pos->offset() + mapping_from_length(*copy_mapping)); + copy_pos->set_offset(pos->offset() + (mapping_len - mapping_from_length(*copy_mapping))); } } @@ -4379,6 +4521,10 @@ namespace vg { for (auto& alt_aln : alt_alns) { alt_aln.second -= return_val.second; } +#ifdef debug_multipath_alignment + cerr << "successfully made zipped alignment with score " << return_val.second << endl; + cerr << debug_string(return_val.first) << endl; +#endif } return return_val; @@ -4680,7 +4826,7 @@ namespace vg { } #ifdef debug_multipath_alignment - cerr << "subpath from " << j << " to " << edge.first << ":" << endl; + cerr << "subpath from " << j << " to " << edge.first << " at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*connecting_subpath) << endl; #endif } @@ -4748,13 +4894,14 @@ namespace vg { // the zipped alignment is now the sink end_pos = final_position(sink_subpath->path()); #ifdef debug_multipath_alignment - cerr << "left zipped alignment from " << j << " to right tail:" << endl; + cerr << "left zipped alignment from " << j << " to right tail" << " at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*sink_subpath) << endl; #endif } // add in the bool zip_to_zip_connection = false; + size_t frayed_tips_begin = multipath_aln_out.subpath_size(); for (auto& tail_alignment : deduplicated) { if (tail_alignment.first.mapping().empty()) { @@ -4789,10 +4936,11 @@ namespace vg { pos->set_offset(offset(end_pos)); } #ifdef debug_multipath_alignment - cerr << "subpath from " << sink_idx << " to right tail:" << endl; + cerr << "subpath from " << sink_idx << " to right tail at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*tail_subpath) << endl; #endif } + size_t frayed_tips_end = multipath_aln_out.subpath_size(); if (!right_zip_aln.first.mapping().empty()) { if (right_zip_aln.first.mapping_size() == 1 @@ -4807,7 +4955,7 @@ namespace vg { auto edit = right_zip_aln.first.mutable_mapping(0)->mutable_edit(0); // add an edit to the tails - for (size_t i = sink_idx + 1; i < multipath_aln_out.subpath_size(); ++i) { + for (size_t i = frayed_tips_begin; i < frayed_tips_end; ++i) { *multipath_aln_out.mutable_subpath(i)->mutable_path()->mutable_mapping()->back().add_edit() = *edit; } @@ -4827,7 +4975,7 @@ namespace vg { sink_subpath->set_score(0); #ifdef debug_multipath_alignment - cerr << "right zipped softclip subpath from " << sink_idx << " to right tail:" << endl; + cerr << "right zipped softclip subpath from " << sink_idx << " to right tail at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*tail_subpath) << endl; #endif } @@ -4837,7 +4985,7 @@ namespace vg { } else { // add edges from all of the alt alns to the zipped tail - for (size_t i = sink_idx + 1; i < multipath_aln_out.subpath_size(); ++i) { + for (size_t i = frayed_tips_begin; i < frayed_tips_end; ++i) { multipath_aln_out.mutable_subpath(i)->add_next(multipath_aln_out.subpath_size()); } if (zip_to_zip_connection) { @@ -4847,7 +4995,7 @@ namespace vg { // an arbitrary choice of a predecessor (this will only come into play // if the previous paths end mid-node, in which case all of them must // end on that node) - end_pos = final_position(multipath_aln_out.subpath().back().path()); + end_pos = final_position(multipath_aln_out.subpath(frayed_tips_begin).path()); auto tail_subpath = multipath_aln_out.add_subpath(); *tail_subpath->mutable_path() = move(right_zip_aln.first); @@ -4862,7 +5010,7 @@ namespace vg { // the zipped end is now the sink sink_subpath = tail_subpath; #ifdef debug_multipath_alignment - cerr << "right zipped alignment from " << sink_idx << " to right tail:" << endl; + cerr << "right zipped alignment from " << sink_idx << " to right tail at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*tail_subpath) << endl; #endif } @@ -4887,8 +5035,8 @@ namespace vg { pair left_zip_aln, right_zip_aln; if (deduplicated.size() > 1) { - right_zip_aln = zip_alignments(deduplicated, false, alignment, align_graph, path_node.end, aligner); - left_zip_aln = zip_alignments(deduplicated, true, alignment, align_graph, path_node.end, aligner); + right_zip_aln = zip_alignments(deduplicated, false, alignment, align_graph, alignment.sequence().begin(), aligner); + left_zip_aln = zip_alignments(deduplicated, true, alignment, align_graph, alignment.sequence().begin(), aligner); } size_t source_idx = j; @@ -4904,14 +5052,16 @@ namespace vg { zip_subpath->set_score(right_zip_aln.second); first_mapping = &zip_subpath->path().mapping().front(); + zip_subpath->add_next(j); #ifdef debug_multipath_alignment - cerr << "right zipped alignment from " << j << " to left tail:" << endl; + cerr << "right zipped alignment from " << j << " to left tail at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*zip_subpath) << endl; #endif } bool zip_to_zip_connection = false; + size_t frayed_tips_begin = multipath_aln_out.subpath_size(); for (auto& tail_alignment : deduplicated) { if (tail_alignment.first.mapping().empty()) { // it's possible that an entire tail get's swallowed up in zipping @@ -4919,7 +5069,7 @@ namespace vg { continue; } - if (!left_zip_aln.first.mapping().empty()) { + if (left_zip_aln.first.mapping().empty()) { // the left part of these alignments hasn't been zipped, so this // is a start multipath_aln_out.add_start(multipath_aln_out.subpath_size()); @@ -4932,7 +5082,7 @@ namespace vg { tail_subpath->add_next(source_idx); #ifdef debug_multipath_alignment - cerr << "subpath from " << source_idx << " to left tail:" << endl; + cerr << "subpath from " << source_idx << " to left tail at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*tail_subpath) << endl; #endif path_mapping_t* final_mapping = tail_subpath->mutable_path()->mutable_mapping(tail_subpath->path().mapping_size() - 1); @@ -4944,6 +5094,7 @@ namespace vg { *final_mapping->mutable_position() = first_mapping->position(); } } + size_t frayed_tips_end = multipath_aln_out.subpath_size(); if (!left_zip_aln.first.mapping().empty()) { if (left_zip_aln.first.mapping_size() == 1 @@ -4958,7 +5109,7 @@ namespace vg { auto edit = &left_zip_aln.first.mutable_mapping()->front().mutable_edit()->front(); // add edit to subpaths and add starts - for (size_t i = source_idx + 1; i < multipath_aln_out.subpath_size(); ++i) { + for (size_t i = frayed_tips_begin; i < frayed_tips_end; ++i) { auto edits = multipath_aln_out.mutable_subpath(i)->mutable_path()->mutable_mapping()->front().mutable_edit(); edits->insert(edits->begin(), *edit); multipath_aln_out.add_start(i); @@ -4979,7 +5130,7 @@ namespace vg { *mapping->mutable_position() = multipath_aln_out.subpath(source_idx).path().mapping().front().position(); subpath->set_score(0); #ifdef debug_multipath_alignment - cerr << "left zipped soft clip subpath from " << source_idx << " to left tail:" << endl; + cerr << "left zipped soft clip subpath from " << source_idx << " to left tail at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*subpath) << endl; #endif } @@ -4992,15 +5143,14 @@ namespace vg { *tail_subpath->mutable_path() = move(left_zip_aln.first); tail_subpath->set_score(left_zip_aln.second); - for (size_t i = source_idx + 1; i + 1 < multipath_aln_out.subpath_size(); ++i) { + for (size_t i = frayed_tips_begin; i < frayed_tips_end; ++i) { tail_subpath->add_next(i); } - if (zip_to_zip_connection) { tail_subpath->add_next(source_idx); } #ifdef debug_multipath_alignment - cerr << "left zipped subpath from " << source_idx << " to left tail:" << endl; + cerr << "left zipped subpath from " << source_idx << " to left tail at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; cerr << debug_string(*tail_subpath) << endl; #endif } diff --git a/src/multipath_mapper.cpp b/src/multipath_mapper.cpp index 21d5f2554cc..9f931b856d9 100644 --- a/src/multipath_mapper.cpp +++ b/src/multipath_mapper.cpp @@ -54,7 +54,7 @@ namespace vg { void MultipathMapper::multipath_map_internal(const Alignment& alignment, MappingQualityMethod mapq_method, vector& multipath_alns_out) { - + #ifdef debug_multipath_mapper cerr << "multipath mapping read " << pb2json(alignment) << endl; cerr << "querying MEMs..." << endl; diff --git a/src/splicing.cpp b/src/splicing.cpp index 3f6606b1470..ba83a73f761 100644 --- a/src/splicing.cpp +++ b/src/splicing.cpp @@ -826,7 +826,7 @@ tuple trimmed_end(const Alignment& aln, int64_t len, bo begin = aln.sequence().begin(); } #ifdef debug_trimming - cerr << "scoring trimmed subpath " << pb2json(dummy_path) << ", with substring " << (begin - aln.sequence().begin()) << ":" << (begin - aln.sequence().begin()) + get<1>(return_val) << endl; + cerr << "scoring trimmed subpath " << debug_string(dummy_path) << ", with substring " << (begin - aln.sequence().begin()) << ":" << (begin - aln.sequence().begin()) + get<1>(return_val) << endl; #endif get<2>(return_val) = aligner.score_partial_alignment(aln, graph, dummy_path, begin); @@ -1055,8 +1055,8 @@ pair, pair> split_splice_segment(const Al true); #ifdef debug_linker_split - cerr << "left partial score " << return_val.first.second << ", partial path " << pb2json(return_val.first.first) << endl; - cerr << "right partial score " << return_val.second.second << ", partial path " << pb2json(return_val.second.first) << endl; + cerr << "left partial score " << return_val.first.second << ", partial path " << debug_string(return_val.first.first) << endl; + cerr << "right partial score " << return_val.second.second << ", partial path " << debug_string(return_val.second.first) << endl; #endif // deletions can span the splice junction, in which case they will have been scored incorrectly @@ -1104,8 +1104,9 @@ pair, pair> split_splice_segment(const Al // a little bit, but i think it's worth it to have a good alignment across the // splice junction) int32_t total_gap_score = scorer.score_gap(left_del_size + right_del_size); - return_val.first.second += (total_gap_score / 2 - scorer.score_gap(left_del_size)); - return_val.second.second += (total_gap_score - total_gap_score / 2 - scorer.score_gap(right_del_size)); + int32_t left_gap_score = (left_del_size * total_gap_score) / (left_del_size + right_del_size); + return_val.first.second += (left_gap_score - scorer.score_gap(left_del_size)); + return_val.second.second += (total_gap_score - left_gap_score - scorer.score_gap(right_del_size)); #ifdef debug_linker_split cerr << "re-divided deletion score, now left score " << return_val.first.second << ", right score " << return_val.second.second << endl; @@ -1350,13 +1351,17 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, #ifdef debug_fusing cerr << "split the linker:" << endl; - cerr << "left score " << splice_segment_halves.first.second << ", path " << pb2json(splice_segment_halves.first.first) << endl; - cerr << "right score " << splice_segment_halves.second.second << ", path " << pb2json(splice_segment_halves.second.first) << endl; + cerr << "left score " << splice_segment_halves.first.second << ", path " << debug_string(splice_segment_halves.first.first) << endl; + cerr << "right score " << splice_segment_halves.second.second << ", path " << debug_string(splice_segment_halves.second.first) << endl; #endif - if (splice_segment_halves.first.first.mapping_size() != 0) { + bool have_left_linker = !splice_segment_halves.first.first.mapping().empty(); + bool have_right_linker = !splice_segment_halves.second.first.mapping().empty(); + + if (have_left_linker) { for (const auto& left_loc : left_mp_aln_trace) { auto i = get<0>(left_loc) - left_removed_so_far[get<0>(left_loc)]; + // TODO: shouldn't this always be >= 0? the linker's attachment should be preserved... if (i < left_mp_aln.subpath_size()) { left_mp_aln.mutable_subpath(i)->add_next(left_mp_aln.subpath_size()); } @@ -1367,8 +1372,8 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, *subpath->mutable_path() = move(splice_segment_halves.first.first); } - if (splice_segment_halves.second.first.mapping_size() != 0) { - if (splice_segment_halves.first.first.mapping_size() == 0) { + if (have_right_linker) { + if (!have_left_linker) { // we skipped the left side of the splice, so connect to the left splice points for (const auto& left_loc : left_mp_aln_trace) { auto i = get<0>(left_loc) - left_removed_so_far[get<0>(left_loc)]; @@ -1473,7 +1478,7 @@ multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, } } } - else if (splice_segment_halves.second.first.mapping_size() == 0) { + else if (!have_right_linker) { // the splice segment on the right side is empty, make connection the the left side auto connection = left_mp_aln.mutable_subpath(right_subpaths_begin - 1)->add_connection(); connection->set_next(left_mp_aln.subpath_size()); diff --git a/src/subcommand/mpmap_main.cpp b/src/subcommand/mpmap_main.cpp index b01a4402e33..141cdcca168 100644 --- a/src/subcommand/mpmap_main.cpp +++ b/src/subcommand/mpmap_main.cpp @@ -1436,8 +1436,8 @@ int main_mpmap(int argc, char** argv) { time_t time_start; auto progress_boilerplate = [&]() { stringstream strm; - strm.precision(1); strm << fixed; + strm.precision(0); if (!clock_init) { time(&time_start); strm << 0.0 << " s"; @@ -1451,6 +1451,7 @@ int main_mpmap(int argc, char** argv) { strm << secs << " s"; } else { + strm.precision(1); double mins = secs / 60.0; if (mins <= 60.0) { strm << mins << " m"; @@ -2137,7 +2138,8 @@ int main_mpmap(int argc, char** argv) { // FASTQ input if (!fastq_name_1.empty()) { if (!suppress_progress) { - cerr << progress_boilerplate() << "Mapping reads from " << (fastq_name_1 == "-" ? "STDIN" : fastq_name_1) << (fastq_name_2.empty() ? "" : " and " + (fastq_name_2 == "-" ? "STDIN" : fastq_name_2)) << " using " << thread_count << " threads" << endl; + + cerr << progress_boilerplate() << "Mapping reads from " << (fastq_name_1 == "-" ? "STDIN" : fastq_name_1) << (fastq_name_2.empty() ? "" : " and " + (fastq_name_2 == "-" ? "STDIN" : fastq_name_2)) << " using " << thread_count << " thread" << (thread_count > 1 ? "s" : "") << endl; } if (interleaved_input) { @@ -2156,7 +2158,7 @@ int main_mpmap(int argc, char** argv) { // GAM input if (!gam_file_name.empty()) { if (!suppress_progress) { - cerr << progress_boilerplate() << "Mapping reads from " << (gam_file_name == "-" ? "STDIN" : gam_file_name) << " using " << thread_count << " threads" << endl; + cerr << progress_boilerplate() << "Mapping reads from " << (gam_file_name == "-" ? "STDIN" : gam_file_name) << " using " << thread_count << " thread" << (thread_count > 1 ? "s" : "") << endl; } function execute = [&](istream& gam_in) {