From e0fe4f89427021f38321562fb390de5462584627 Mon Sep 17 00:00:00 2001 From: Tobias Rausch Date: Tue, 24 Sep 2024 10:41:31 +0200 Subject: [PATCH] alt align --- src/gaf.h | 17 +++++++++++++++-- src/junction.h | 39 +++++++++++++++++++++++++-------------- src/pangenome.h | 18 ++++++++---------- 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/src/gaf.h b/src/gaf.h index 80d4aef..2b14a80 100644 --- a/src/gaf.h +++ b/src/gaf.h @@ -92,9 +92,16 @@ namespace torali } inline bool - parseAlignRecord(std::istream& instream, Graph const& g, AlignRecord& ar, std::string& qname) { + parseAlignRecord(std::istream& instream, Graph const& g, AlignRecord& ar, std::string& qname, std::set& validSR) { std::string gline; - if(std::getline(instream, gline)) { + if(std::getline(instream, gline)) { + if (!validSR.empty()) { + std::size_t found = gline.find('\t'); + if (found != std::string::npos) { + if (validSR.find(hash_lr(gline.substr(0, found))) == validSR.end()) return false; + } + } + typedef boost::tokenizer< boost::char_separator > Tokenizer; boost::char_separator sep("\t"); Tokenizer tokens(gline, sep); @@ -141,6 +148,12 @@ namespace torali } else return false; } + inline bool + parseAlignRecord(std::istream& instream, Graph const& g, AlignRecord& ar, std::string& qname) { + std::set validSR; + return parseAlignRecord(instream, g, ar, qname, validSR); + } + inline bool parseAlignRecord(std::istream& instream, Graph const& g, AlignRecord& ar) { std::string qname; diff --git a/src/junction.h b/src/junction.h index 2425a2e..a1e48b2 100644 --- a/src/junction.h +++ b/src/junction.h @@ -302,7 +302,7 @@ namespace torali template inline void - findJunctions(TConfig const& c, TValidRegion const& validRegions, TReadBp& readBp) { + findJunctions(TConfig const& c, TValidRegion const& validRegions, TReadBp& readBp, std::set& validSR) { typedef typename TValidRegion::value_type TChrIntervals; // Open file handles @@ -338,6 +338,7 @@ namespace torali if ((rec->core.qual < c.minMapQual) || (rec->core.tid<0)) continue; std::size_t seed = hash_lr(rec); + if ((!validSR.empty()) && (validSR.find(seed) == validSR.end())) continue; //std::cerr << bam_get_qname(rec) << '\t' << seed << std::endl; uint32_t rp = rec->core.pos; // reference pointer uint32_t sp = 0; // sequence pointer @@ -459,15 +460,21 @@ namespace torali template inline void - _findSRBreakpoints(TConfig const& c, TValidRegions const& validRegions, TSvtSRBamRecord& srBR) { + _findSRBreakpoints(TConfig const& c, TValidRegions const& validRegions, TSvtSRBamRecord& srBR, std::set& validSR) { // Breakpoints typedef std::vector TJunctionVector; typedef std::map TReadBp; TReadBp readBp; - findJunctions(c, validRegions, readBp); + findJunctions(c, validRegions, readBp, validSR); fetchSVs(c, readBp, srBR); } + template + inline void + _findSRBreakpoints(TConfig const& c, TValidRegions const& validRegions, TSvtSRBamRecord& srBR) { + std::set validSR; + _findSRBreakpoints(c, validRegions, srBR, validSR); + } template inline void @@ -480,7 +487,13 @@ namespace torali // Alternate alignments if (c.hasAltFile) { - std::set validSR; // Set of split-reads in ALL alternate alignments + + // Insert all split-reads + std::set validSR; + for(uint32_t svt = 0; svt < srBR.size(); ++svt) { + for(uint32_t i = 0; i < srBR[svt].size(); ++i) validSR.insert(srBR[svt][i].id); + } + // Parse alternate alignments std::vector align; std::vector genome; @@ -501,10 +514,10 @@ namespace torali _parseExcludeIntervals(altConfig, hdr, vR); bam_hdr_destroy(hdr); sam_close(samfile); - _findSRBreakpoints(altConfig, vR, altSR); + _findSRBreakpoints(altConfig, vR, altSR, validSR); // Debug - //std::cerr << "AltAlign: " << align[i] << std::endl; + //std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "AltLinearAlign: " << align[i] << std::endl; //outputSRBamRecords(altConfig, altSR, true); } else { Graph g; @@ -515,28 +528,26 @@ namespace torali altConfig.maxReadSep = 2000; // Alternate alignment in GAF format - _findGraphSRBreakpoints(altConfig, g, altSR); + _findGraphSRBreakpoints(altConfig, g, altSR, validSR); // Debug - //std::cerr << "AltAlign: " << align[i] << std::endl; + //std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "AltGraphAlign: " << align[i] << std::endl; //outputGraphSRBamRecords(altConfig, g, altSR); } // Update valid SR set std::set newValidSR; for(uint32_t svt = 0; svt < altSR.size(); ++svt) { + //std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "SVT Size: " << svt << ',' << altSR[svt].size() << std::endl; if (altSR[svt].empty()) continue; for(uint32_t i = 0; i < altSR[svt].size(); ++i) { - if (validSR.empty()) newValidSR.insert(altSR[svt][i].id); - else { - if (validSR.find(altSR[svt][i].id) != validSR.end()) newValidSR.insert(altSR[svt][i].id); - } + if (validSR.find(altSR[svt][i].id) != validSR.end()) newValidSR.insert(altSR[svt][i].id); } } validSR = newValidSR; // Debug - //std::cerr << validSR.size() << std::endl; + //std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "SR size: " << validSR.size() << std::endl; } // Filter primary SR records @@ -560,7 +571,7 @@ namespace torali // Debug - //std::cerr << "MainAlign: " << std::endl; + //std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "MainAlign: " << std::endl; //outputSRBamRecords(c, srBR, true); // Cluster BAM records diff --git a/src/pangenome.h b/src/pangenome.h index 6c5383e..6b95f7d 100644 --- a/src/pangenome.h +++ b/src/pangenome.h @@ -45,7 +45,7 @@ namespace torali { template inline bool - findGraphJunctions(TConfig& c, Graph const& g, TReadBp& readBp) { + findGraphJunctions(TConfig& c, Graph const& g, TReadBp& readBp, std::set& validSR) { boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cerr << '[' << boost::posix_time::to_simple_string(now) << "] " << "Split-read scanning" << std::endl; @@ -67,11 +67,10 @@ namespace torali { // Parse GAF std::istream instream(&dataIn); - bool parseAR = true; - while (parseAR) { + while (!instream.eof()) { AlignRecord ar; std::string qname; - if (parseAlignRecord(instream, g, ar, qname)) { + if (parseAlignRecord(instream, g, ar, qname, validSR)) { if (ar.mapq < c.minMapQual) continue; // Iterate path alignments @@ -205,7 +204,7 @@ namespace torali { } //std::cerr << ar.seed << ',' << ar.qlen << ',' << ar.qstart << ',' << ar.qend << ',' << ar.strand << ',' << ar.plen << ',' << ar.pstart << ',' << ar.pend << ',' << ar.matches << ',' << ar.alignlen << ',' << ar.mapq << std::endl; - } else parseAR = false; + } } // Sort junctions @@ -222,12 +221,12 @@ namespace torali { template inline void - _findGraphSRBreakpoints(TConfig const& c, Graph const& g, TSvtSRBamRecord& srBR) { + _findGraphSRBreakpoints(TConfig const& c, Graph const& g, TSvtSRBamRecord& srBR, std::set& validSR) { // Breakpoints typedef std::vector TJunctionVector; typedef std::map TReadBp; TReadBp readBp; - findGraphJunctions(c, g, readBp); + findGraphJunctions(c, g, readBp, validSR); fetchSVs(c, readBp, srBR); } @@ -255,8 +254,7 @@ namespace torali { // Parse GAF std::istream instream(&dataIn); - bool parseAR = true; - while (parseAR) { + while (!instream.eof()) { AlignRecord ar; std::string qname; if (parseAlignRecord(instream, g, ar, qname)) { @@ -266,7 +264,7 @@ namespace torali { std::cerr << "Warning: Hash collision! " << ar.seed << ',' << hm[ar.seed] << ',' << qname << std::endl; } } - } else parseAR = false; + } } // Close file