From 573a2ef1794055343acfc0e40f11e2681f06687c Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 11 Mar 2024 15:03:18 -0500 Subject: [PATCH 1/9] penalties 4,8,1 to be closer to anchorwave and minimap2 --- src/common/wflign/src/wflign.cpp | 4 ++-- src/interface/parse_args.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 90489365..fa84e219 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -338,7 +338,7 @@ void WFlign::wflign_affine_wavefront( } else { wfa_affine_penalties.match = 0; wfa_affine_penalties.mismatch = 4; - wfa_affine_penalties.gap_opening = 6; + wfa_affine_penalties.gap_opening = 8; wfa_affine_penalties.gap_extension = 1; /* if (mashmap_estimated_identity >= 0.80) { @@ -554,7 +554,7 @@ void WFlign::wflign_affine_wavefront( } else { wflambda_affine_penalties.match = 0; wflambda_affine_penalties.mismatch = 4; - wflambda_affine_penalties.gap_opening = 6; + wflambda_affine_penalties.gap_opening = 8; wflambda_affine_penalties.gap_extension = 1; } diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 5c3c2a74..37f9c644 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -105,11 +105,11 @@ void parse_args(int argc, {'O', "invert-filtering"}); args::ValueFlag wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"}); args::ValueFlag wfa_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 4,6,1]",//, if 4 values then gaps are affine, if 6 values then gaps are convex", + "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 4,8,1]",//, if 4 values then gaps are affine, if 6 values then gaps are convex", {'g', "wfa-params"}); //wflign parameters args::ValueFlag wflign_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",//, if 4 then gaps are affine, if 6 then gaps are convex [default: 1,4,6,2,26,1]", + "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,8,1]",//, if 4 then gaps are affine, if 6 then gaps are convex [default: 1,4,6,2,26,1]", {'G', "wflign-params"}); args::ValueFlag wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"}); args::ValueFlag wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"}); From 8f285757a0fd716fe4b84b640774f7ce56389012 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 11 Mar 2024 15:17:31 -0500 Subject: [PATCH 2/9] try to over estimate max_score --- src/common/wflign/src/wflign_patch.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 03840ddd..87de2417 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -203,7 +203,10 @@ void do_wfa_patch_alignment( (affine_penalties.gap_extension * std::min( (int)chain_gap, (int)std::max(target_length, query_length) - )); + )) + 32; + // + 32 because we are not really setting the MaxScore, but the MaxAlignmentSteps. + // AlignmentStep is usually > Score (they are the same with edit-distance) + wf_aligner.setMaxAlignmentSteps(max_score); const int status = wf_aligner.alignEnd2End(target + i,target_length,query + j,query_length); aln.ok = (status == WF_STATUS_ALG_COMPLETED); @@ -933,8 +936,8 @@ void write_merged_alignment( #ifdef WFA_PNG_AND_TSV if (emit_patching_tsv) { *out_patching_tsv - << query_name << ":" << query_pos << "-" << query_pos + query_delta << "\t" - << target_name << ":" << target_pos - target_pointer_shift << "-" << target_pos - target_pointer_shift + target_delta << "\t" + << query_name << "\t" << query_pos << "\t" << query_pos + query_delta << "\t" + << target_name << "\t" << (target_pos - target_pointer_shift) << "\t" << (target_pos - target_pointer_shift + target_delta) << "\t" << patch_aln.ok << std::endl; } #endif From 3fd7c6fc2a3d4d51760ee5e8e18895094c08354c Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 15 Mar 2024 19:34:15 -0500 Subject: [PATCH 3/9] convex penalties for patching --- src/align/include/align_parameters.hpp | 7 ++ src/align/include/computeAlignments.hpp | 5 + src/common/wflign/src/wflign.cpp | 112 ++++++++++++++------- src/common/wflign/src/wflign.hpp | 13 +++ src/common/wflign/src/wflign_alignment.hpp | 6 +- src/common/wflign/src/wflign_patch.cpp | 47 +++++---- src/common/wflign/src/wflign_patch.hpp | 8 +- src/interface/parse_args.hpp | 56 +++++++---- 8 files changed, 171 insertions(+), 83 deletions(-) diff --git a/src/align/include/align_parameters.hpp b/src/align/include/align_parameters.hpp index 6c7c455b..20b5bb7f 100644 --- a/src/align/include/align_parameters.hpp +++ b/src/align/include/align_parameters.hpp @@ -24,10 +24,17 @@ struct Parameters { //wflambda uint16_t wflambda_segment_length; //segment length for wflambda + int wfa_mismatch_score; int wfa_gap_opening_score; int wfa_gap_extension_score; + int wfa_patching_mismatch_score; + int wfa_patching_gap_opening_score1; + int wfa_patching_gap_extension_score1; + int wfa_patching_gap_opening_score2; + int wfa_patching_gap_extension_score2; + // wflign int wflign_mismatch_score; int wflign_gap_opening_score; diff --git a/src/align/include/computeAlignments.hpp b/src/align/include/computeAlignments.hpp index 58994a72..7bb127f0 100644 --- a/src/align/include/computeAlignments.hpp +++ b/src/align/include/computeAlignments.hpp @@ -524,6 +524,11 @@ namespace align param.wfa_mismatch_score, param.wfa_gap_opening_score, param.wfa_gap_extension_score, + param.wfa_patching_mismatch_score, + param.wfa_patching_gap_opening_score1, + param.wfa_patching_gap_extension_score1, + param.wfa_patching_gap_opening_score2, + param.wfa_patching_gap_extension_score2, currentRecord.mashmap_estimated_identity, param.wflign_mismatch_score, param.wflign_gap_opening_score, diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index fa84e219..d8f58b75 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -45,6 +45,11 @@ WFlign::WFlign( const int wfa_mismatch_score, const int wfa_gap_opening_score, const int wfa_gap_extension_score, + const int wfa_patching_mismatch_score, + const int wfa_patching_gap_opening_score1, + const int wfa_patching_gap_extension_score1, + const int wfa_patching_gap_opening_score2, + const int wfa_gap_extension_score2, const float mashmap_estimated_identity, const int wflign_mismatch_score, const int wflign_gap_opening_score, @@ -60,9 +65,17 @@ WFlign::WFlign( // Parameters this->segment_length = segment_length; this->min_identity = min_identity; + this->wfa_mismatch_score = wfa_mismatch_score; this->wfa_gap_opening_score = wfa_gap_opening_score; this->wfa_gap_extension_score = wfa_gap_extension_score; + + this->wfa_patching_mismatch_score = wfa_patching_mismatch_score; + this->wfa_patching_gap_opening_score1 = wfa_patching_gap_opening_score1; + this->wfa_patching_gap_extension_score1 = wfa_patching_gap_extension_score1; + this->wfa_patching_gap_opening_score2 = wfa_patching_gap_opening_score2; + this->wfa_patching_gap_extension_score2 = wfa_patching_gap_extension_score2; + this->mashmap_estimated_identity = mashmap_estimated_identity; this->wflign_mismatch_score = wflign_mismatch_score; this->wflign_gap_opening_score = wflign_gap_opening_score; @@ -328,18 +341,18 @@ void WFlign::wflign_affine_wavefront( const int minhash_kmer_size = std::max(8, std::min(17, (int)std::floor(1.0 / (1.0 - mashmap_estimated_identity)))); - // Set penalties + // Set penalties for wfa affine wflign_penalties_t wfa_affine_penalties; if (wfa_mismatch_score > 0 && wfa_gap_opening_score > 0 && wfa_gap_extension_score > 0){ wfa_affine_penalties.match = 0; wfa_affine_penalties.mismatch = wfa_mismatch_score; - wfa_affine_penalties.gap_opening = wfa_gap_opening_score; - wfa_affine_penalties.gap_extension = wfa_gap_extension_score; + wfa_affine_penalties.gap_opening1 = wfa_gap_opening_score; + wfa_affine_penalties.gap_extension1 = wfa_gap_extension_score; } else { wfa_affine_penalties.match = 0; - wfa_affine_penalties.mismatch = 4; - wfa_affine_penalties.gap_opening = 8; - wfa_affine_penalties.gap_extension = 1; + wfa_affine_penalties.mismatch = 6; + wfa_affine_penalties.gap_opening1 = 8; + wfa_affine_penalties.gap_extension1 = 1; /* if (mashmap_estimated_identity >= 0.80) { // Polynomial fitting @@ -359,6 +372,24 @@ void WFlign::wflign_affine_wavefront( */ } + // Set penalties for wfa convex + wflign_penalties_t wfa_convex_penalties; + if (wfa_patching_mismatch_score > 0 && wfa_patching_gap_opening_score1 > 0 && wfa_patching_gap_extension_score1 > 0 && wfa_patching_gap_opening_score2 > 0 && wfa_patching_gap_extension_score2 > 0){ + wfa_convex_penalties.match = 0; + wfa_convex_penalties.mismatch = wfa_patching_mismatch_score; + wfa_convex_penalties.gap_opening1 = wfa_patching_gap_opening_score1; + wfa_convex_penalties.gap_extension1 = wfa_patching_gap_extension_score1; + wfa_convex_penalties.gap_opening2 = wfa_patching_gap_opening_score2; + wfa_convex_penalties.gap_extension2 = wfa_patching_gap_extension_score2; + } else { + wfa_convex_penalties.match = 0; + wfa_convex_penalties.mismatch = 6; + wfa_convex_penalties.gap_opening1 = 8; + wfa_convex_penalties.gap_extension1 = 2; + wfa_convex_penalties.gap_opening2 = 26; + wfa_convex_penalties.gap_extension2 = 1; + } + // heuristic bound on the max mash dist, adaptive based on estimated // identity the goal here is to sparsify the set of alignments in the // wflambda layer we then patch up the gaps between them @@ -405,11 +436,14 @@ void WFlign::wflign_affine_wavefront( (mashmap_estimated_identity >= 0.99 && query_length <= MAX_LEN_FOR_STANDARD_WFA && target_length <= MAX_LEN_FOR_STANDARD_WFA) ) { - wfa::WFAlignerGapAffine* wf_aligner = - new wfa::WFAlignerGapAffine( - wfa_affine_penalties.mismatch, - wfa_affine_penalties.gap_opening, - wfa_affine_penalties.gap_extension, + wfa::WFAlignerGapAffine2Pieces* wf_aligner = + new wfa::WFAlignerGapAffine2Pieces( + 0, + wfa_convex_penalties.mismatch, + wfa_convex_penalties.gap_opening1, + wfa_convex_penalties.gap_extension1, + wfa_convex_penalties.gap_opening2, + wfa_convex_penalties.gap_extension2, wfa::WFAligner::Alignment, wfa::WFAligner::MemoryUltralow); wf_aligner->setHeuristicNone(); @@ -466,12 +500,16 @@ void WFlign::wflign_affine_wavefront( delete wf_aligner; // use biWFA for all patching - wf_aligner = new wfa::WFAlignerGapAffine( - wfa_affine_penalties.mismatch, - wfa_affine_penalties.gap_opening, - wfa_affine_penalties.gap_extension, - wfa::WFAligner::Alignment, - wfa::WFAligner::MemoryUltralow); + wf_aligner = + new wfa::WFAlignerGapAffine2Pieces( + 0, + wfa_convex_penalties.mismatch, + wfa_convex_penalties.gap_opening1, + wfa_convex_penalties.gap_extension1, + wfa_convex_penalties.gap_opening2, + wfa_convex_penalties.gap_extension2, + wfa::WFAligner::Alignment, + wfa::WFAligner::MemoryUltralow); wf_aligner->setHeuristicNone(); // write a merged alignment @@ -479,7 +517,7 @@ void WFlign::wflign_affine_wavefront( *out, trace, *wf_aligner, - wfa_affine_penalties, + wfa_convex_penalties, emit_md_tag, paf_format_else_sam, no_seq_in_sam, @@ -549,29 +587,29 @@ void WFlign::wflign_affine_wavefront( if (wflign_mismatch_score > 0 && wflign_gap_opening_score > 0 && wflign_gap_extension_score > 0){ wflambda_affine_penalties.match = 0; wflambda_affine_penalties.mismatch = wflign_mismatch_score; - wflambda_affine_penalties.gap_opening = wflign_gap_opening_score; - wflambda_affine_penalties.gap_extension = wflign_gap_extension_score; + wflambda_affine_penalties.gap_opening1 = wflign_gap_opening_score; + wflambda_affine_penalties.gap_extension1 = wflign_gap_extension_score; } else { wflambda_affine_penalties.match = 0; wflambda_affine_penalties.mismatch = 4; - wflambda_affine_penalties.gap_opening = 8; - wflambda_affine_penalties.gap_extension = 1; + wflambda_affine_penalties.gap_opening1 = 6; + wflambda_affine_penalties.gap_extension1 = 1; } //std::cerr << "wfa_affine_penalties.mismatch " << wfa_affine_penalties.mismatch << std::endl; //std::cerr << "wfa_affine_penalties.gap_opening " << wfa_affine_penalties.gap_opening << std::endl; //std::cerr << "wfa_affine_penalties.gap_extension " << wfa_affine_penalties.gap_extension << std::endl; //std::cerr << "wflambda_affine_penalties.mismatch " << wflambda_affine_penalties.mismatch << std::endl; - //std::cerr << "wflambda_affine_penalties.gap_opening " << wflambda_affine_penalties.gap_opening << std::endl; - //std::cerr << "wflambda_affine_penalties.gap_extension " << wflambda_affine_penalties.gap_extension << std::endl; + //std::cerr << "wflambda_affine_penalties.gap_opening1 " << wflambda_affine_penalties.gap_opening1 << std::endl; + //std::cerr << "wflambda_affine_penalties.gap_extension1 " << wflambda_affine_penalties.gap_extension1 << std::endl; //std::cerr << "max_mash_dist_to_evaluate " << max_mash_dist_to_evaluate << std::endl; // Configure the attributes of the wflambda-aligner wfa::WFAlignerGapAffine* wflambda_aligner = new wfa::WFAlignerGapAffine( wflambda_affine_penalties.mismatch, - wflambda_affine_penalties.gap_opening, - wflambda_affine_penalties.gap_extension, + wflambda_affine_penalties.gap_opening1, + wflambda_affine_penalties.gap_extension1, wfa::WFAligner::Alignment, wfa::WFAligner::MemoryUltralow); wflambda_aligner->setHeuristicNone(); // It should help @@ -591,8 +629,8 @@ void WFlign::wflign_affine_wavefront( wfa::WFAlignerGapAffine* wf_aligner = new wfa::WFAlignerGapAffine( wfa_affine_penalties.mismatch, - wfa_affine_penalties.gap_opening, - wfa_affine_penalties.gap_extension, + wfa_affine_penalties.gap_opening1, + wfa_affine_penalties.gap_extension1, wfa::WFAligner::Alignment, wfa::WFAligner::MemoryHigh); wf_aligner->setHeuristicNone(); @@ -923,12 +961,16 @@ void WFlign::wflign_affine_wavefront( delete wf_aligner; // use biWFA for all patching - wf_aligner = new wfa::WFAlignerGapAffine( - wfa_affine_penalties.mismatch, - wfa_affine_penalties.gap_opening, - wfa_affine_penalties.gap_extension, - wfa::WFAligner::Alignment, - wfa::WFAligner::MemoryUltralow); + wfa::WFAlignerGapAffine2Pieces* wf_aligner = + new wfa::WFAlignerGapAffine2Pieces( + 0, + wfa_convex_penalties.mismatch, + wfa_convex_penalties.gap_opening1, + wfa_convex_penalties.gap_extension1, + wfa_convex_penalties.gap_opening2, + wfa_convex_penalties.gap_extension2, + wfa::WFAligner::Alignment, + wfa::WFAligner::MemoryUltralow); wf_aligner->setHeuristicNone(); // write a merged alignment @@ -936,7 +978,7 @@ void WFlign::wflign_affine_wavefront( *out, trace, *wf_aligner, - wfa_affine_penalties, + wfa_convex_penalties, emit_md_tag, paf_format_else_sam, no_seq_in_sam, diff --git a/src/common/wflign/src/wflign.hpp b/src/common/wflign/src/wflign.hpp index 7e367778..3ff28f87 100644 --- a/src/common/wflign/src/wflign.hpp +++ b/src/common/wflign/src/wflign.hpp @@ -40,9 +40,17 @@ namespace wflign { uint16_t segment_length; float min_identity; int _minhash_kmer_size; + int wfa_mismatch_score; int wfa_gap_opening_score; int wfa_gap_extension_score; + + int wfa_patching_mismatch_score; + int wfa_patching_gap_opening_score1; + int wfa_patching_gap_extension_score1; + int wfa_patching_gap_opening_score2; + int wfa_patching_gap_extension_score2; + float mashmap_estimated_identity; // WFlign parameters int wflign_mismatch_score; @@ -90,6 +98,11 @@ namespace wflign { const int wfa_mismatch_score, const int wfa_gap_opening_score, const int wfa_gap_extension_score, + const int wfa_patching_mismatch_score, + const int wfa_patching_gap_opening_score1, + const int wfa_patching_gap_extension_score1, + const int wfa_patching_gap_opening_score2, + const int wfa_patching_gap_extension_score2, const float mashmap_estimated_identity, const int wflign_mismatch_score, const int wflign_gap_opening_score, diff --git a/src/common/wflign/src/wflign_alignment.hpp b/src/common/wflign/src/wflign_alignment.hpp index 53b54ed7..9bd0c52f 100644 --- a/src/common/wflign/src/wflign_alignment.hpp +++ b/src/common/wflign/src/wflign_alignment.hpp @@ -20,8 +20,10 @@ typedef struct { typedef struct { int match; int mismatch; - int gap_opening; - int gap_extension; + int gap_opening1; + int gap_extension1; + int gap_opening2; + int gap_extension2; } wflign_penalties_t; /* diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 87de2417..6777c3a7 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -191,20 +191,21 @@ void do_wfa_patch_alignment( const char* target, const uint64_t& i, const uint64_t& target_length, - wfa::WFAlignerGapAffine& wf_aligner, - const wflign_penalties_t& affine_penalties, + wfa::WFAlignerGapAffine2Pieces& wf_aligner, + const wflign_penalties_t& convex_penalties, alignment_t& aln, const int64_t& chain_gap, const int& max_patching_score) { const int max_score = max_patching_score ? max_patching_score : - affine_penalties.gap_opening + - (affine_penalties.gap_extension * std::min( + // gap_opening2/gap_extension1 is usually > gap_opening1/gap_extension2 + convex_penalties.gap_opening2 + + (convex_penalties.gap_extension1 * std::min( (int)chain_gap, (int)std::max(target_length, query_length) - )) + 32; - // + 32 because we are not really setting the MaxScore, but the MaxAlignmentSteps. + )) + 64; + // + 64 because we are not really setting the MaxScore, but the MaxAlignmentSteps. // AlignmentStep is usually > Score (they are the same with edit-distance) wf_aligner.setMaxAlignmentSteps(max_score); @@ -238,8 +239,8 @@ void do_wfa_patch_alignment( void write_merged_alignment( std::ostream &out, const std::vector &trace, - wfa::WFAlignerGapAffine& wf_aligner, - const wflign_penalties_t& affine_penalties, + wfa::WFAlignerGapAffine2Pieces& wf_aligner, + const wflign_penalties_t& convex_penalties, const bool& emit_md_tag, const bool& paf_format_else_sam, const bool& no_seq_in_sam, @@ -384,7 +385,7 @@ void write_merged_alignment( &wflign_max_len_minor, &distance_close_big_enough_indels, &min_wf_length, &max_dist_threshold, &wf_aligner, - &affine_penalties, + &convex_penalties, &chain_gap, &max_patching_score #ifdef WFA_PNG_AND_TSV , @@ -546,11 +547,14 @@ void write_merged_alignment( // } // std::cerr << std::endl; - wfa::WFAlignerGapAffine* wf_aligner_heads = - new wfa::WFAlignerGapAffine( - affine_penalties.mismatch, - affine_penalties.gap_opening, - affine_penalties.gap_extension, + wfa::WFAlignerGapAffine2Pieces* wf_aligner_heads = + new wfa::WFAlignerGapAffine2Pieces( + 0, + convex_penalties.mismatch, + convex_penalties.gap_opening1, + convex_penalties.gap_extension1, + convex_penalties.gap_opening2, + convex_penalties.gap_extension2, wfa::WFAligner::Alignment, wfa::WFAligner::MemoryMed); wf_aligner_heads->setHeuristicWFmash(min_wf_length,max_dist_threshold); @@ -878,7 +882,7 @@ void write_merged_alignment( do_wfa_patch_alignment( query, query_pos, query_delta, target - target_pointer_shift, target_pos, target_delta, - wf_aligner, affine_penalties, patch_aln, chain_gap, max_patching_score); + wf_aligner, convex_penalties, patch_aln, chain_gap, max_patching_score); if (patch_aln.ok) { // std::cerr << "got an ok patch aln" << // std::endl; @@ -1044,11 +1048,14 @@ void write_merged_alignment( // << target_delta_x // << std::endl; - wfa::WFAlignerGapAffine* wf_aligner_tails = - new wfa::WFAlignerGapAffine( - affine_penalties.mismatch, - affine_penalties.gap_opening, - affine_penalties.gap_extension, + wfa::WFAlignerGapAffine2Pieces* wf_aligner_tails = + new wfa::WFAlignerGapAffine2Pieces( + 0, + convex_penalties.mismatch, + convex_penalties.gap_opening1, + convex_penalties.gap_extension1, + convex_penalties.gap_opening2, + convex_penalties.gap_extension2, wfa::WFAligner::Alignment, wfa::WFAligner::MemoryMed); wf_aligner_tails->setHeuristicWFmash(min_wf_length,max_dist_threshold); diff --git a/src/common/wflign/src/wflign_patch.hpp b/src/common/wflign/src/wflign_patch.hpp index b39205c7..8304d45c 100644 --- a/src/common/wflign/src/wflign_patch.hpp +++ b/src/common/wflign/src/wflign_patch.hpp @@ -52,16 +52,16 @@ namespace wflign { const char* target, const uint64_t& i, const uint64_t& target_length, - wfa::WFAlignerGapAffine& _wf_aligner, - const wflign_penalties_t& affine_penalties, + wfa::WFAlignerGapAffine2Pieces& _wf_aligner, + const wflign_penalties_t& convex_penalties, alignment_t& aln, const int64_t& chain_gap, const int& max_patching_score); void write_merged_alignment( std::ostream &out, const std::vector &trace, - wfa::WFAlignerGapAffine& wf_aligner, - const wflign_penalties_t& affine_penalties, + wfa::WFAlignerGapAffine2Pieces& wf_aligner, + const wflign_penalties_t& convex_penalties, const bool& emit_md_tag, const bool& paf_format_else_sam, const bool& no_seq_in_sam, diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 37f9c644..a5331a6a 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -105,12 +105,15 @@ void parse_args(int argc, {'O', "invert-filtering"}); args::ValueFlag wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"}); args::ValueFlag wfa_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 4,8,1]",//, if 4 values then gaps are affine, if 6 values then gaps are convex", - {'g', "wfa-params"}); + "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", + {"wfa-params"}); + args::ValueFlag wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2", + "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 6,8,2,26,1]", + {"wfa-patching-params"}); //wflign parameters args::ValueFlag wflign_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,8,1]",//, if 4 then gaps are affine, if 6 then gaps are convex [default: 1,4,6,2,26,1]", - {'G', "wflign-params"}); + "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]", + {"wflign-params"}); args::ValueFlag wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"}); args::ValueFlag wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"}); args::ValueFlag wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"}); @@ -261,7 +264,7 @@ void parse_args(int argc, if (!args::get(wfa_score_params).empty()) { const std::vector params_str = skch::CommonFunc::split(args::get(wfa_score_params), ','); if (params_str.size() != 3) { - std::cerr << "[wfmash] ERROR error: 3 scoring parameters must be given to -g/--wfa-params"//either 3 or 5 scoring parameters must be given to -g/--wflamda-params + std::cerr << "[wfmash] ERROR error: 3 scoring parameters must be given to --wfa-params" << std::endl; exit(1); } @@ -273,24 +276,41 @@ void parse_args(int argc, align_parameters.wfa_mismatch_score = params[0]; align_parameters.wfa_gap_opening_score = params[1]; align_parameters.wfa_gap_extension_score = params[2]; - - /*if (params.size() == 6) { - align_parameters.wflambda_mismatch_score = params[0]; - align_parameters.wflambda_gap_opening_score = params[1]; - align_parameters.wflambda_gap_extension_score = params[2]; - xx = params[4]; - xx = params[5]; - }*/ } else { align_parameters.wfa_mismatch_score = -1; align_parameters.wfa_gap_opening_score = -1; align_parameters.wfa_gap_extension_score = -1; } + if (!args::get(wfa_patching_score_params).empty()) { + const std::vector params_str = skch::CommonFunc::split(args::get(wfa_patching_score_params), ','); + if (params_str.size() != 5) { + std::cerr << "[wfmash] ERROR error: 5 scoring parameters must be given to --wfa-patching-params" + << std::endl; + exit(1); + } + + std::vector params(params_str.size()); + std::transform(params_str.begin(), params_str.end(), params.begin(), + [](const std::string &s) { return std::stoi(s); }); + + align_parameters.wfa_patching_mismatch_score = params[0]; + align_parameters.wfa_patching_gap_opening_score1 = params[1]; + align_parameters.wfa_patching_gap_extension_score1 = params[2]; + align_parameters.wfa_patching_gap_opening_score2 = params[3]; + align_parameters.wfa_patching_gap_extension_score2 = params[4]; + } else { + align_parameters.wfa_patching_mismatch_score = -1; + align_parameters.wfa_patching_gap_opening_score1 = -1; + align_parameters.wfa_patching_gap_extension_score1 = -1; + align_parameters.wfa_patching_gap_opening_score2 = -1; + align_parameters.wfa_patching_gap_extension_score2 = -1; + } + if (!args::get(wflign_score_params).empty()) { const std::vector params_str = skch::CommonFunc::split(args::get(wflign_score_params), ','); if (params_str.size() != 3) { - std::cerr << "[wfmash] ERROR error: 3 scoring parameters must be given to -G/--wflign-params."//either 3 or 5 scoring parameters must be given to -G/--wflign-params + std::cerr << "[wfmash] ERROR error: 3 scoring parameters must be given to --wflign-params." << std::endl; exit(1); } @@ -302,14 +322,6 @@ void parse_args(int argc, align_parameters.wflign_mismatch_score = params[0]; align_parameters.wflign_gap_opening_score = params[1]; align_parameters.wflign_gap_extension_score = params[2]; - - /*if (params.size() == 6) { - align_parameters.wflign_gap_opening_score = params[0]; - align_parameters.wflign_gap_extension_score = params[1]; - align_parameters.wflign_gap_extension_score = params[2]; - xx = params[4]; - xx = params[5]; - }*/ } else { align_parameters.wflign_mismatch_score = -1; align_parameters.wflign_gap_opening_score = -1; From fce0722401a4b424497641b813a937891f0ec6af Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 17 Mar 2024 10:49:59 -0500 Subject: [PATCH 4/9] fix typo and little decrease in patch_mismatch_pen --- src/common/wflign/src/wflign.cpp | 4 ++-- src/interface/parse_args.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index d8f58b75..51cc14a5 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -49,7 +49,7 @@ WFlign::WFlign( const int wfa_patching_gap_opening_score1, const int wfa_patching_gap_extension_score1, const int wfa_patching_gap_opening_score2, - const int wfa_gap_extension_score2, + const int wfa_patching_gap_extension_score2, const float mashmap_estimated_identity, const int wflign_mismatch_score, const int wflign_gap_opening_score, @@ -383,7 +383,7 @@ void WFlign::wflign_affine_wavefront( wfa_convex_penalties.gap_extension2 = wfa_patching_gap_extension_score2; } else { wfa_convex_penalties.match = 0; - wfa_convex_penalties.mismatch = 6; + wfa_convex_penalties.mismatch = 5; wfa_convex_penalties.gap_opening1 = 8; wfa_convex_penalties.gap_extension1 = 2; wfa_convex_penalties.gap_opening2 = 26; diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index a5331a6a..e665aef9 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -108,7 +108,7 @@ void parse_args(int argc, "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", {"wfa-params"}); args::ValueFlag wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2", - "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 6,8,2,26,1]", + "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,26,1]", {"wfa-patching-params"}); //wflign parameters args::ValueFlag wflign_score_params(alignment_opts, "mismatch,gap1,ext1", From ba2228782515d93fc7d2d6b8f576549f48c22185 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Tue, 19 Mar 2024 09:23:41 -0500 Subject: [PATCH 5/9] update parameter and cleaning --- src/common/wflign/src/wflign.cpp | 2 +- src/common/wflign/src/wflign_patch.cpp | 102 +------------------------ src/interface/parse_args.hpp | 2 +- 3 files changed, 4 insertions(+), 102 deletions(-) diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 51cc14a5..820c2ff3 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -386,7 +386,7 @@ void WFlign::wflign_affine_wavefront( wfa_convex_penalties.mismatch = 5; wfa_convex_penalties.gap_opening1 = 8; wfa_convex_penalties.gap_extension1 = 2; - wfa_convex_penalties.gap_opening2 = 26; + wfa_convex_penalties.gap_opening2 = 49; wfa_convex_penalties.gap_extension2 = 1; } diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 6777c3a7..25b3df65 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -319,63 +319,6 @@ void write_merged_alignment( // gaps in query and ref and adding our results to the final trace as we // go -#define MAX_NUM_INDELS_TO_LOOK_AT 4 - auto distance_close_big_enough_indels = - [](const uint32_t indel_len, auto iterator, - const std::vector &trace) { - const uint32_t min_indel_len_to_find = indel_len / 3; - const uint16_t max_dist_to_look_at = - std::min(indel_len * 64, (uint32_t)8096); - - // std::cerr << "min_indel_len_to_find " << - // min_indel_len_to_find << std::endl; std::cerr << - // "max_dist_to_look_at " << max_dist_to_look_at << std::endl; - - auto q = iterator; - - uint8_t num_indels_to_find = MAX_NUM_INDELS_TO_LOOK_AT; - uint32_t curr_size_close_indel = 0; - int32_t dist_close_indels = 0; - - //std::cerr << "indel_len " << indel_len << std::endl; - //std::cerr << "min_indel_len_to_find " << min_indel_len_to_find << std::endl; - //std::cerr << "max_dist_to_look_at " << max_dist_to_look_at << std::endl; - - while (q != trace.end() && - dist_close_indels < max_dist_to_look_at) { - curr_size_close_indel = 0; - while (q != trace.end() && (*q == 'I' || *q == 'D')) { - ++curr_size_close_indel; - - ++dist_close_indels; - ++q; - } - // std::cerr << "\t\tcurr_size_close_indel " << - // curr_size_close_indel << std::endl; - if (curr_size_close_indel >= min_indel_len_to_find) { - // std::cerr << "\t\tnum_indels_to_find " << - // (uint16_t)num_indels_to_find << std::endl; - if (--num_indels_to_find == 0) { - break; - } - } - - while (q != trace.end() && - (dist_close_indels < max_dist_to_look_at) && - *q != 'I' && *q != 'D') { - ++dist_close_indels; - ++q; - } - } - - //std::cerr << "dist_close_indels " << dist_close_indels << std::endl; - //std::cerr << "num_indels_found " << MAX_NUM_INDELS_TO_LOOK_AT - num_indels_to_find << std::endl; - - return num_indels_to_find < MAX_NUM_INDELS_TO_LOOK_AT - ? dist_close_indels - : -1; - }; - auto patching = [&query, &query_name, &query_length, &query_start, &query_offset, &target, &target_name, &target_length_mut, &target_start, &target_offset, @@ -383,7 +326,7 @@ void write_merged_alignment( &target_pointer_shift, &wflign_max_len_major, &wflign_max_len_minor, - &distance_close_big_enough_indels, &min_wf_length, + &min_wf_length, &max_dist_threshold, &wf_aligner, &convex_penalties, &chain_gap, &max_patching_score @@ -692,22 +635,11 @@ void write_merged_alignment( (query_delta < wflign_max_len_minor || target_delta < wflign_max_len_minor))) { - // this should only happen if we're at >99% identity - int32_t distance_close_indels = -1; - /* - int32_t distance_close_indels = (query_delta > 3 || target_delta > 3) ? - distance_close_big_enough_indels(std::max(query_delta, target_delta), q, unpatched) : - -1; - */ - - // std::cerr << "distance_close_indels " - // << distance_close_indels << std::endl; // Trigger the patching if there is a dropout // (consecutive Is and Ds) or if there is a close and // big enough indel forward if (size_region_to_repatch > 0 || - (query_delta > 0 && target_delta > 0) || - (distance_close_indels > 0)) { + (query_delta > 0 && target_delta > 0)) { #ifdef WFLIGN_DEBUG // std::cerr << "query_delta " << query_delta << // "\n"; std::cerr << "target_delta " << target_delta @@ -754,8 +686,6 @@ void write_merged_alignment( patched.pop_back(); --size_region_to_repatch; } - - //distance_close_indels = distance_close_big_enough_indels(std::max(query_delta, target_delta), q, unpatched); } else { // nibble backward if we're below the correct // length @@ -808,34 +738,6 @@ void write_merged_alignment( default: break; } - - --distance_close_indels; - } - - // Nibble until the close, big enough indel is - // reached Important when the patching can't be - // computed correctly without including the next - // indel - while (q != unpatched.end() && - distance_close_indels > 0) { - const auto &c = *q++; - switch (c) { - case 'M': - case 'X': - ++query_delta; - ++target_delta; - break; - case 'I': - ++query_delta; - break; - case 'D': - ++target_delta; - break; - default: - break; - } - - --distance_close_indels; } // check forward if there are other Is/Ds to merge diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index e665aef9..cdc7bede 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -108,7 +108,7 @@ void parse_args(int argc, "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", {"wfa-params"}); args::ValueFlag wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2", - "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,26,1]", + "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]", {"wfa-patching-params"}); //wflign parameters args::ValueFlag wflign_score_params(alignment_opts, "mismatch,gap1,ext1", From 9ecd74bace2349081e042f1cb6689fe0da6f81e3 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 21 Mar 2024 12:12:50 -0500 Subject: [PATCH 6/9] improve head/tail patching --- .../WFA2-lib/wavefront/wavefront_aligner.c | 2 +- src/common/wflign/src/wflign_patch.cpp | 49 +++++++++++++++++-- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/common/wflign/deps/WFA2-lib/wavefront/wavefront_aligner.c b/src/common/wflign/deps/WFA2-lib/wavefront/wavefront_aligner.c index f32ed268..20557369 100644 --- a/src/common/wflign/deps/WFA2-lib/wavefront/wavefront_aligner.c +++ b/src/common/wflign/deps/WFA2-lib/wavefront/wavefront_aligner.c @@ -516,7 +516,7 @@ void wavefront_aligner_set_alignment_free_ends( const int text_begin_free, const int text_end_free) { wf_aligner->alignment_form.span = alignment_endsfree; - wf_aligner->alignment_form.extension = false; + wf_aligner->alignment_form.extension = true; wf_aligner->alignment_form.pattern_begin_free = pattern_begin_free; wf_aligner->alignment_form.pattern_end_free = pattern_end_free; wf_aligner->alignment_form.text_begin_free = text_begin_free; diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 25b3df65..6766de60 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -78,7 +78,7 @@ bool do_wfa_segment_alignment( aln.j = j; aln.i = i; - aln.ok = (status == 0); + aln.ok = (status == WF_STATUS_ALG_COMPLETED); // fill the alignment info if we aligned if (aln.ok) { @@ -504,7 +504,8 @@ void write_merged_alignment( const int status = wf_aligner_heads->alignEndsFree( target_rev.c_str(),target_rev.size(),0,0, query_rev.c_str(),query_rev.size(),0,query_rev.size()); - if (status == 0) { // WF_ALIGN_SUCCESSFUL + //std::cerr << "Head patching status " << status << "\n"; + if (status == WF_STATUS_ALG_COMPLETED || status == WF_STATUS_ALG_PARTIAL) { //hack_cigar(wf_aligner_heads->cigar, query_rev.c_str(), target_rev.c_str(), query_rev.size(), target_rev.size(), 0, 0); #ifdef VALIDATE_WFA_WFLIGN @@ -521,7 +522,6 @@ void write_merged_alignment( } #endif - //std::cerr << "Head patching\n"; got_alignment = true; target_pos = target_pos_x; @@ -534,6 +534,28 @@ void write_merged_alignment( char* cigar_ops; int cigar_length; wf_aligner_heads->getAlignment(&cigar_ops,&cigar_length); + + // Put the missing part of the CIGAR string first and then its aligned part + uint64_t missing_query_len = query_rev.size(); + uint64_t missing_target_len = target_rev.size(); + for(int xxx = cigar_length - 1; xxx >= 0; --xxx) { + // The CIGAR string can be incomplete + switch (cigar_ops[xxx]) { + case 'M': case 'X': + --missing_query_len; --missing_target_len; break; + case 'I': --missing_query_len; break; + case 'D': --missing_target_len; break; + default: break; + } + } + // Put the missing part of the CIGAR string + for (uint64_t i = 0; i < missing_query_len; ++i) { + patched.push_back('I'); + } + for (uint64_t i = 0; i < missing_target_len; ++i) { + patched.push_back('D'); + } + for(int xxx = cigar_length - 1; xxx >= 0; --xxx) { //std::cerr << cigar_ops[xxx]; patched.push_back(cigar_ops[xxx]); @@ -982,7 +1004,6 @@ void write_merged_alignment( } #endif - //std::cerr << "Tail patching\n"; got_alignment = true; { @@ -1003,11 +1024,29 @@ void write_merged_alignment( char* cigar_ops; int cigar_length; wf_aligner_tails->getAlignment(&cigar_ops,&cigar_length); + uint64_t missing_query_len = query_delta; + uint64_t missing_target_len = target_delta; for(int xxx = 0; xxx < cigar_length; ++xxx) { //std::cerr << cigar_ops[xxx]; patched.push_back(cigar_ops[xxx]); + + // The CIGAR string can be incomplete + switch (cigar_ops[xxx]) { + case 'M': case 'X': + --missing_query_len; --missing_target_len; break; + case 'I': --missing_query_len; break; + case 'D': --missing_target_len; break; + default: break; + } } //std::cerr << "\n"; + // Put the missing part of the CIGAR string + for (uint64_t i = 0; i < missing_query_len; ++i) { + patched.push_back('I'); + } + for (uint64_t i = 0; i < missing_target_len; ++i) { + patched.push_back('D'); + } } delete wf_aligner_tails; } @@ -1222,7 +1261,7 @@ void write_merged_alignment( sort_indels(tracev); } - // std::cerr << "SECOND PATCH ROUND + //std::cerr << "SECOND PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; patching(pre_tracev, tracev); } From c0ca4b52da00b7e0350e4fd62ccd03e3068ef5c9 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 21 Mar 2024 14:24:41 -0500 Subject: [PATCH 7/9] head/tail patching more aggressive in the 1st round; trigger patching more --- src/common/wflign/src/wflign_patch.cpp | 60 +++++++++++--------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 6766de60..107af0af 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -234,8 +234,6 @@ void do_wfa_patch_alignment( } } -#define MIN_WFA_HEAD_TAIL_PATCH_LENGTH 256 - void write_merged_alignment( std::ostream &out, const std::vector &trace, @@ -279,10 +277,6 @@ void write_merged_alignment( uint64_t target_length_mut = target_length; - // patching parameters - // we will nibble patching back to this length - const uint64_t min_wfa_patch_length = 8; - // we need to get the start position in the query and target // then run through the whole alignment building up the cigar // finally emitting it @@ -336,7 +330,7 @@ void write_merged_alignment( &out_patching_tsv #endif ](std::vector &unpatched, - std::vector &patched) { + std::vector &patched, const uint16_t &min_wfa_head_tail_patch_length, const uint16_t &min_wfa_patch_length) { auto q = unpatched.begin(); uint64_t query_pos = query_start; @@ -365,7 +359,7 @@ void write_merged_alignment( // nibble forward if we're below the correct length // this gives a bit of context for the alignment while (q != unpatched.end() && - (query_delta < MIN_WFA_HEAD_TAIL_PATCH_LENGTH || target_delta < MIN_WFA_HEAD_TAIL_PATCH_LENGTH)) { + (query_delta < min_wfa_head_tail_patch_length || target_delta < min_wfa_head_tail_patch_length)) { const auto &c = *q++; switch (c) { case 'M': case 'X': @@ -461,16 +455,15 @@ void write_merged_alignment( target_delta + target_delta_to_shift; if (target_delta_x > 0) { -// std::cerr << "B HEAD patching in " -// << query_name << " " -// << query_offset << "@ " << query_pos << -// " - " << query_delta -// << " --- " -// << target_name << " " << target_offset -// << " @ " << -// target_pos << " - " -// << target_delta_x -// << std::endl; + // std::cerr << "B HEAD patching in " + // << query_name << " " + // << query_offset + query_pos << + // " - " << query_delta + // << " --- " + // << target_name << " " + // << target_offset + target_pos << " - " + // << target_delta_x + // << std::endl; std::string query_rev(query + query_pos, query_delta); std::reverse(query_rev.begin(), query_rev.end()); @@ -661,7 +654,8 @@ void write_merged_alignment( // (consecutive Is and Ds) or if there is a close and // big enough indel forward if (size_region_to_repatch > 0 || - (query_delta > 0 && target_delta > 0)) { + (query_delta > 0 && target_delta > 0) || + (query_delta > 2 || target_delta > 2)) { #ifdef WFLIGN_DEBUG // std::cerr << "query_delta " << query_delta << // "\n"; std::cerr << "target_delta " << target_delta @@ -922,7 +916,7 @@ void write_merged_alignment( // nibble backward if we're below the correct length // this gives a bit of context for the alignment while (!patched.empty() && - (query_delta < MIN_WFA_HEAD_TAIL_PATCH_LENGTH || target_delta < MIN_WFA_HEAD_TAIL_PATCH_LENGTH)) { + (query_delta < min_wfa_head_tail_patch_length || target_delta < min_wfa_head_tail_patch_length)) { const auto& c = patched.back(); switch (c) { case 'M': case 'X': @@ -961,16 +955,14 @@ void write_merged_alignment( target_pos + target_delta)); if (target_delta_x > 0) { -// std::cerr << "B TAIL patching in " -// << query_name << " " << -// query_offset << " @ " << -// query_pos << " - " << -// query_delta << " " -// << target_name << " " << -// target_offset << " @ " -// << target_pos - target_pointer_shift << " - " -// << target_delta_x -// << std::endl; + // std::cerr << "B TAIL patching in " + // << query_name << " " << + // query_offset + query_pos << " - " << + // query_delta << " " + // << target_name << " " << + // target_offset + target_pos - target_pointer_shift << " - " + // << target_delta_x + // << std::endl; wfa::WFAlignerGapAffine2Pieces* wf_aligner_tails = new wfa::WFAlignerGapAffine2Pieces( @@ -986,8 +978,8 @@ void write_merged_alignment( const int status = wf_aligner_tails->alignEndsFree( target - target_pointer_shift + target_pos, target_delta_x,0,0, query + query_pos, query_delta,0,query_delta); - - if (status == 0) { // WF_ALIGN_SUCCESSFUL + //std::cerr << "Tail patching status " << status << "\n"; + if (status == WF_STATUS_ALG_COMPLETED || status == WF_STATUS_ALG_PARTIAL) { //hack_cigar(wf_aligner_tails->cigar, query + query_pos, target - target_pointer_shift + target_pos, query_delta, target_delta_x, 0, 0); #ifdef VALIDATE_WFA_WFLIGN @@ -1233,7 +1225,7 @@ void write_merged_alignment( //std::cerr << "FIRST PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(erodev, pre_tracev); + patching(erodev, pre_tracev, 512, 8); // In the 1st round, we nibble more aggressively #ifdef VALIDATE_WFA_WFLIGN if (!validate_trace(pre_tracev, query, @@ -1263,7 +1255,7 @@ void write_merged_alignment( //std::cerr << "SECOND PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(pre_tracev, tracev); + patching(pre_tracev, tracev, 256, 8); // In the 2nd round, we nibble less aggressively } // normalize the indels From cf7668a2e383de063ac688418f5e3ad9440f51c6 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 21 Mar 2024 15:20:05 -0500 Subject: [PATCH 8/9] detect close indels, not too agressively --- src/common/wflign/src/wflign_patch.cpp | 94 +++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 107af0af..9fed9acf 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -313,6 +313,62 @@ void write_merged_alignment( // gaps in query and ref and adding our results to the final trace as we // go +#define MAX_NUM_INDELS_TO_LOOK_AT 2 + auto distance_close_big_enough_indels = + [](const uint32_t indel_len, auto iterator, + const std::vector &trace, + const uint16_t&max_dist_to_look_at) { + const uint32_t min_indel_len_to_find = indel_len / 3; + + // std::cerr << "min_indel_len_to_find " << + // min_indel_len_to_find << std::endl; std::cerr << + // "max_dist_to_look_at " << max_dist_to_look_at << std::endl; + + auto q = iterator; + + uint8_t num_indels_to_find = MAX_NUM_INDELS_TO_LOOK_AT; + uint32_t curr_size_close_indel = 0; + int32_t dist_close_indels = 0; + + // std::cerr << "indel_len " << indel_len << std::endl; + // std::cerr << "min_indel_len_to_find " << min_indel_len_to_find << std::endl; + // std::cerr << "max_dist_to_look_at " << max_dist_to_look_at << std::endl; + + while (q != trace.end() && + dist_close_indels < max_dist_to_look_at) { + curr_size_close_indel = 0; + while (q != trace.end() && (*q == 'I' || *q == 'D')) { + ++curr_size_close_indel; + + ++dist_close_indels; + ++q; + } + // std::cerr << "\t\tcurr_size_close_indel " << + // curr_size_close_indel << std::endl; + if (curr_size_close_indel >= min_indel_len_to_find) { + // std::cerr << "\t\tnum_indels_to_find " << + // (uint16_t)num_indels_to_find << std::endl; + if (--num_indels_to_find == 0) { + break; + } + } + + while (q != trace.end() && + (dist_close_indels < max_dist_to_look_at) && + *q != 'I' && *q != 'D') { + ++dist_close_indels; + ++q; + } + } + + // std::cerr << "dist_close_indels " << dist_close_indels << std::endl; + // std::cerr << "num_indels_found " << MAX_NUM_INDELS_TO_LOOK_AT - num_indels_to_find << std::endl; + + return num_indels_to_find < MAX_NUM_INDELS_TO_LOOK_AT + ? std::min(dist_close_indels, 512) + : -1; + }; + auto patching = [&query, &query_name, &query_length, &query_start, &query_offset, &target, &target_name, &target_length_mut, &target_start, &target_offset, @@ -320,7 +376,7 @@ void write_merged_alignment( &target_pointer_shift, &wflign_max_len_major, &wflign_max_len_minor, - &min_wf_length, + &distance_close_big_enough_indels, &min_wf_length, &max_dist_threshold, &wf_aligner, &convex_penalties, &chain_gap, &max_patching_score @@ -650,12 +706,18 @@ void write_merged_alignment( (query_delta < wflign_max_len_minor || target_delta < wflign_max_len_minor))) { + int32_t distance_close_indels = + (query_delta > 10 || target_delta > 10) ? + distance_close_big_enough_indels(std::max(query_delta, target_delta), q, unpatched, 192) : + -1; + // Trigger the patching if there is a dropout // (consecutive Is and Ds) or if there is a close and // big enough indel forward if (size_region_to_repatch > 0 || (query_delta > 0 && target_delta > 0) || - (query_delta > 2 || target_delta > 2)) { + (query_delta > 2 || target_delta > 2) || + distance_close_indels > 0) { #ifdef WFLIGN_DEBUG // std::cerr << "query_delta " << query_delta << // "\n"; std::cerr << "target_delta " << target_delta @@ -754,6 +816,34 @@ void write_merged_alignment( default: break; } + + --distance_close_indels; + } + + // Nibble until the close, big enough indel is + // reached Important when the patching can't be + // computed correctly without including the next + // indel + while (q != unpatched.end() && + distance_close_indels > 0) { + const auto &c = *q++; + switch (c) { + case 'M': + case 'X': + ++query_delta; + ++target_delta; + break; + case 'I': + ++query_delta; + break; + case 'D': + ++target_delta; + break; + default: + break; + } + + --distance_close_indels; } // check forward if there are other Is/Ds to merge From 59755a779ff56c4f399d4902222b22c22a334656 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 21 Mar 2024 15:58:19 -0500 Subject: [PATCH 9/9] 1st patching agressive, 2nd round soft --- src/common/wflign/src/wflign_patch.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 9fed9acf..159df509 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -365,7 +365,7 @@ void write_merged_alignment( // std::cerr << "num_indels_found " << MAX_NUM_INDELS_TO_LOOK_AT - num_indels_to_find << std::endl; return num_indels_to_find < MAX_NUM_INDELS_TO_LOOK_AT - ? std::min(dist_close_indels, 512) + ? dist_close_indels : -1; }; @@ -386,7 +386,10 @@ void write_merged_alignment( &out_patching_tsv #endif ](std::vector &unpatched, - std::vector &patched, const uint16_t &min_wfa_head_tail_patch_length, const uint16_t &min_wfa_patch_length) { + std::vector &patched, + const uint16_t &min_wfa_head_tail_patch_length, + const uint16_t &min_wfa_patch_length, + const uint16_t &max_dist_to_look_at) { auto q = unpatched.begin(); uint64_t query_pos = query_start; @@ -708,7 +711,7 @@ void write_merged_alignment( int32_t distance_close_indels = (query_delta > 10 || target_delta > 10) ? - distance_close_big_enough_indels(std::max(query_delta, target_delta), q, unpatched, 192) : + distance_close_big_enough_indels(std::max(query_delta, target_delta), q, unpatched, max_dist_to_look_at) : -1; // Trigger the patching if there is a dropout @@ -1315,7 +1318,7 @@ void write_merged_alignment( //std::cerr << "FIRST PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(erodev, pre_tracev, 512, 8); // In the 1st round, we nibble more aggressively + patching(erodev, pre_tracev, 4096, 32, 512); // In the 1st round, we patch more aggressively #ifdef VALIDATE_WFA_WFLIGN if (!validate_trace(pre_tracev, query, @@ -1345,7 +1348,7 @@ void write_merged_alignment( //std::cerr << "SECOND PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(pre_tracev, tracev, 256, 8); // In the 2nd round, we nibble less aggressively + patching(pre_tracev, tracev, 256, 8, 128); // In the 2nd round, we patch less aggressively } // normalize the indels