From 151538358ed6554ecd4ce8653de91bad624c97ec Mon Sep 17 00:00:00 2001 From: olliecheng Date: Tue, 19 Mar 2024 12:26:59 +1100 Subject: [PATCH 1/3] Add detection of chimeric reads in SearchResult --- flexiplex.c++ | 150 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 91 insertions(+), 59 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 8ed6d27..10fadc7 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -129,6 +129,8 @@ struct SearchResult { string rev_line; vector vec_bc_for; vector vec_bc_rev; + int count; + bool chimeric; }; // Code for fast edit distance calculation for short sequences modified from @@ -458,75 +460,103 @@ void print_line(string id, string read, string quals, ostream & out_stream){ } //print fastq or fasta lines.. -void print_read(string read_id,string read, string qual, +void print_read(string read_id, string read, string qual, vector & vec_bc, string prefix, bool split, unordered_set & found_barcodes, bool trim_barcodes){ - //loop over the barcodes found... usually will just be one - for(int b=0; b0 && temp_read_length 0 && temp_read_length < read_length) + read_length = temp_read_length; + } + string qual_new = ""; // don't trim the quality scores if it's a fasta file + + if (qual != "") { + qual_new = qual.substr(read_start, read_length); + } + string read_new = read.substr(read_start, read_length); + + if (b == 0 && !trim_barcodes) { // override if read shouldn't be cut + new_read_id = read_id; + read_new = read; + qual_new = qual; + b = vec_size; // force loop to exit after this iteration + } + + if (split) { // to a file if spliting by barcode + string outname = prefix + "_" + barcode + "."; + if (qual == "") + outname += "fasta"; + else + outname += "fastq"; + ofstream outstream; + if (found_barcodes.insert(barcode).second) + outstream.open(outname); // override file if this is the first read + // for the barcode + else + outstream.open(outname, ofstream::app); // remove file if this is the + // first read for the barcode + print_line(new_read_id, read_new, qual_new, outstream); + outstream.close(); + } else { + print_line(new_read_id, read_new, qual_new, std::cout); + } } - } } // separated out from main so that this can be run with threads void search_read(vector & reads, unordered_set & known_barcodes, - int flank_edit_distance, int edit_distance, - const std::vector> &search_pattern) { + int flank_edit_distance, int edit_distance, + const std::vector> &search_pattern) { - for(int r=0; r 1; } } @@ -842,9 +872,9 @@ int main(int argc, char **argv) { for (int b = 0; b < sr_v[t][r].vec_bc_rev.size(); b++) barcode_counts[sr_v[t][r].vec_bc_rev.at(b).barcode]++; - if ((sr_v[t][r].vec_bc_for.size() + sr_v[t][r].vec_bc_rev.size()) > 0) + if (sr_v[t][r].count > 0) bc_count++; - if ((sr_v[t][r].vec_bc_for.size() + sr_v[t][r].vec_bc_rev.size()) > 1) { + if (sr_v[t][r].chimeric) { multi_bc_count++; } @@ -859,10 +889,12 @@ int main(int argc, char **argv) { sr_v[t][r].qual_scores, sr_v[t][r].vec_bc_for, out_filename_prefix, split_file_by_barcode, found_barcodes, remove_barcodes); + + // for a chimeric read, we first need to reverse the quality scores reverse(sr_v[t][r].qual_scores.begin(), sr_v[t][r].qual_scores.end()); - if (remove_barcodes || sr_v[t][r].vec_bc_for.size() == - 0) // case we just want to print read once - // if multiple bc found. + + // case we just want to print read once if multiple bc found. + if (remove_barcodes || sr_v[t][r].vec_bc_for.size() == 0) print_read(sr_v[t][r].read_id + "_-", sr_v[t][r].rev_line, sr_v[t][r].qual_scores, sr_v[t][r].vec_bc_rev, out_filename_prefix, split_file_by_barcode, From 2a1d913061b0d724c1fd5c6876c45880ec729c29 Mon Sep 17 00:00:00 2001 From: olliecheng Date: Tue, 19 Mar 2024 13:27:01 +1100 Subject: [PATCH 2/3] Add chimeric read reporting flag using -c true This change allows for chimeric reads to be identified in the output read, by appending the suffix "_C" to reads which are identified within a chimeric read. To enable this, use the "-c true" flag. --- flexiplex.c++ | 69 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 10fadc7..dceb797 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -62,10 +62,15 @@ void print_usage(){ cerr << " including flanking sequenence and split read if multiple\n"; cerr << " barcodes found (default: true).\n"; cerr << " -s true/false Sort reads into separate files by barcode (default: false)\n"; - cerr << " -n prefix Prefix for output filenames.\n"; - cerr << " -e N Maximum edit distance to barcode (default 2).\n"; - cerr << " -f N Maximum edit distance to primer+polyT (default 8).\n"; - cerr << " -p N Number of threads (default: 1).\n\n"; + cerr << " -c true/false Add a _C suffix to the read identifier of any chimeric reads\n"; + cerr << " (default: false). For instance if,\n"; + cerr << " @BC_UMI#READID_+1of2\n"; + cerr << " is chimeric, it will become:\n"; + cerr << " @BC_UMI#READID_+1of2_C\n"; + cerr << " -n prefix Prefix for output filenames.\n"; + cerr << " -e N Maximum edit distance to barcode (default 2).\n"; + cerr << " -f N Maximum edit distance to primer+polyT (default 8).\n"; + cerr << " -p N Number of threads (default: 1).\n\n"; cerr << " Specifying adaptor / barcode structure : \n"; cerr << " -x sequence Append flanking sequence to search for\n"; @@ -463,7 +468,8 @@ void print_line(string id, string read, string quals, ostream & out_stream){ void print_read(string read_id, string read, string qual, vector & vec_bc, string prefix, bool split, unordered_set & found_barcodes, - bool trim_barcodes){ + bool trim_barcodes, + bool chimeric){ auto vec_size = vec_bc.size(); @@ -473,6 +479,10 @@ void print_read(string read_id, string read, string qual, // format the new read id. Using FLAMES format. stringstream ss; ss << (b + 1) << "of" << vec_size; + if (chimeric) { + ss << "_" << "C"; + } + string barcode = vec_bc.at(b).barcode; string new_read_id = barcode + "_" + vec_bc.at(b).umi + "#" + read_id + ss.str(); @@ -579,6 +589,7 @@ int main(int argc, char **argv) { bool split_file_by_barcode = false; //(s) bool remove_barcodes = true; //(r) + bool print_chimeric = false; //(c) std::vector> search_pattern; @@ -598,7 +609,7 @@ int main(int argc, char **argv) { vector myArgs(argv, argv + argc); while ((c = getopt(myArgs.size(), myArgs.data(), - "d:k:i:b:u:x:e:f:n:s:hp:")) != EOF) { + "d:k:i:b:u:x:e:f:n:s:hp:c:")) != EOF) { switch (c) { case 'd': { // d=predefined list of settings for various search/barcode // schemes @@ -714,6 +725,12 @@ int main(int argc, char **argv) { params += 2; break; } + case 'c': { + print_chimeric = get_bool_opt_arg(optarg); + + params += 2; + break; + } case 'p': { n_threads = atoi(optarg); cerr << "Setting number of threads to " << n_threads << endl; @@ -885,20 +902,35 @@ int main(int argc, char **argv) { print_stats(sr_v[t][r].read_id, sr_v[t][r].vec_bc_for, out_stat_file); print_stats(sr_v[t][r].read_id, sr_v[t][r].vec_bc_rev, out_stat_file); - print_read(sr_v[t][r].read_id + "_+", sr_v[t][r].line, - sr_v[t][r].qual_scores, sr_v[t][r].vec_bc_for, - out_filename_prefix, split_file_by_barcode, found_barcodes, - remove_barcodes); + print_read( + sr_v[t][r].read_id + "_+", + sr_v[t][r].line, + sr_v[t][r].qual_scores, + sr_v[t][r].vec_bc_for, + out_filename_prefix, + split_file_by_barcode, + found_barcodes, + remove_barcodes, + print_chimeric && sr_v[t][r].chimeric // include chimeric information if requested + ); - // for a chimeric read, we first need to reverse the quality scores - reverse(sr_v[t][r].qual_scores.begin(), sr_v[t][r].qual_scores.end()); - // case we just want to print read once if multiple bc found. - if (remove_barcodes || sr_v[t][r].vec_bc_for.size() == 0) - print_read(sr_v[t][r].read_id + "_-", sr_v[t][r].rev_line, - sr_v[t][r].qual_scores, sr_v[t][r].vec_bc_rev, - out_filename_prefix, split_file_by_barcode, - found_barcodes, remove_barcodes); + if (remove_barcodes || sr_v[t][r].vec_bc_for.size() == 0) { + // for a chimeric read, we first need to reverse the quality scores + reverse(sr_v[t][r].qual_scores.begin(), sr_v[t][r].qual_scores.end()); + + print_read( + sr_v[t][r].read_id + "_-", + sr_v[t][r].rev_line, + sr_v[t][r].qual_scores, + sr_v[t][r].vec_bc_rev, + out_filename_prefix, + split_file_by_barcode, + found_barcodes, + remove_barcodes, + print_chimeric && sr_v[t][r].chimeric // include chimeric information if requested + ); + } } } } @@ -952,3 +984,4 @@ int main(int argc, char **argv) { for (int i = hist.size() - 1; i >= 0; i--) cout << i + 1 << "\t" << hist[i] << "\n"; } + From a11676e88b5690707f08433f1db5a59b6428541c Mon Sep 17 00:00:00 2001 From: olliecheng Date: Tue, 26 Mar 2024 10:44:11 +1100 Subject: [PATCH 3/3] Only trigger chimeric read when there are reads in both the forward and reverse strands If there are multiple chimeric reads which are all in the same direction, then Flexiplex will automatically split the reads and so there is no need for chimeric read reporting. --- flexiplex.c++ | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index dceb797..613a8bf 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -564,9 +564,9 @@ void search_read(vector & reads, unordered_set & known_bar reads[r].count = forward_reads.size() + reverse_reads.size(); - // a chimeric read occurs when there are more than 1 detected barcodes in - // a single read - reads[r].chimeric = reads[r].count > 1; + // a chimeric read occurs when there are barcodes detected in both the forward + // and reverse strands. + reads[r].chimeric = forward_reads.size() && reverse_reads.size(); } }