@@ -4,14 +4,50 @@ Class for generating BAM file statistics. Records are accessed using multi-threa
4
4
*/
5
5
6
6
#include < iostream>
7
+ #include < fstream>
8
+ #include < string>
7
9
#include < thread>
8
10
#include < iostream>
9
11
#include < cmath>
12
+ #include < unordered_set>
10
13
11
14
#include " bam_module.h"
12
15
16
+ // Run the BAM module
17
+ int BAM_Module::run (Input_Para &input_params, Output_BAM &final_output)
18
+ {
19
+ int exit_code = 0 ;
20
+
21
+ // Determine if RRMS read ID filtering is required
22
+ if (input_params.rrms_csv != " " ){
23
+ std::cout << " RRMS read ID filtering enabled" << std::endl;
24
+ std::cout << " RRMS CSV file: " << input_params.rrms_csv << std::endl;
25
+
26
+ // Determine if RRMS stats should be generated for accepted or rejected
27
+ // reads
28
+ if (input_params.rrms_filter ){
29
+ std::cout << " RRMS stats will be generated for accepted reads" << std::endl;
30
+ } else {
31
+ std::cout << " RRMS stats will be generated for rejected reads" << std::endl;
32
+ }
33
+
34
+ // Read the RRMS CSV file and store the read IDs
35
+ std::cout << " Reading RRMS CSV file..." << std::endl;
36
+ std::unordered_set<std::string> rrms_read_ids = readRRMSFile (input_params.rrms_csv , input_params.rrms_filter );
37
+ std::cout << " Number of read IDs = " << rrms_read_ids.size () << std::endl;
38
+
39
+ // Store the read IDs in the input parameters
40
+ input_params.rrms_read_ids = rrms_read_ids;
41
+ }
13
42
14
- int BAM_Module::calculateStatistics (Input_Para& input_params, Output_BAM& final_output){
43
+ // Calculate statistics
44
+ exit_code = calculateStatistics (input_params, final_output);
45
+
46
+ return exit_code;
47
+ }
48
+
49
+ int BAM_Module::calculateStatistics (Input_Para &input_params, Output_BAM &final_output)
50
+ {
15
51
int exit_code = 0 ;
16
52
auto relapse_start_time = std::chrono::high_resolution_clock::now ();
17
53
@@ -108,7 +144,13 @@ int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_
108
144
109
145
// Save the summary statistics to a file
110
146
std::cout << " Saving summary statistics to file..." << std::endl;
111
- std::string summary_filepath = input_params.output_folder + " /bam_summary.txt" ;
147
+
148
+ // If in RRMS mode, append RRMS accepted/rejected to the output prefix
149
+ std::string output_prefix = " bam" ;
150
+ if (input_params.rrms_csv != " " ){
151
+ output_prefix += input_params.rrms_filter ? " _rrms_accepted" : " _rrms_rejected" ;
152
+ }
153
+ std::string summary_filepath = input_params.output_folder + " /" + output_prefix + " _summary.txt" ;
112
154
final_output.save_summary (summary_filepath, input_params, final_output);
113
155
std::cout << " Saved file: " << summary_filepath << std::endl;
114
156
@@ -120,13 +162,89 @@ int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_
120
162
121
163
// Read one batch of records and merge the batch results into the shared output.
//
// reader:        source of records; access is guarded by bam_mutex, which is
//                handed to readNextRecords().
// batch_size:    number of records requested for this batch.
// input_params:  supplies rrms_read_ids, passed through to the reader.
// final_output:  shared accumulator, updated while holding output_mutex.
// cout_mutex:    not used by this function; kept for interface compatibility.
void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& final_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex)
{
    // Read the next N records into a batch-local output
    Output_BAM record_output;
    reader.readNextRecords(batch_size, record_output, bam_mutex, input_params.rrms_read_ids);

    // Update the final output. The scoped guard releases output_mutex even if
    // add() throws, so other worker threads cannot be left blocked forever.
    {
        std::lock_guard<std::mutex> output_guard(output_mutex);
        final_output.add(record_output);
    }
}
174
+
175
+ std::unordered_set<std::string> BAM_Module::readRRMSFile (std::string rrms_csv_file, bool accepted_reads)
176
+ {
177
+ // Create an unordered set to store the read IDs for fast lookup
178
+ std::unordered_set<std::string> rrms_read_ids;
179
+
180
+ // Open the file
181
+ std::ifstream rrms_file (rrms_csv_file);
182
+
183
+ // Read the header and find the 'read_id' and 'decision' columns
184
+ std::string header;
185
+ std::vector<std::string> header_fields;
186
+ std::getline (rrms_file, header);
187
+ std::stringstream ss (header);
188
+ std::string field;
189
+ // std::cout << "RRMS CSV header:" << std::endl;
190
+ while (std::getline (ss, field, ' ,' )){
191
+ header_fields.push_back (field);
192
+ // std::cout << field << std::endl;
193
+ }
194
+
195
+ // Find the 'read_id' and 'decision' columns
196
+ int read_id_index = -1 ;
197
+ int decision_index = -1 ;
198
+ for (size_t i=0 ; i<header_fields.size (); i++){
199
+ if (header_fields[i] == " read_id" ){
200
+ read_id_index = i;
201
+ } else if (header_fields[i] == " decision" ){
202
+ decision_index = i;
203
+ }
204
+ }
205
+
206
+ // Exit if the read_id or decision columns are not found
207
+ if (read_id_index == -1 ){
208
+ std::cerr << " Error: 'read_id' column not found in RRMS CSV file" << std::endl;
209
+ exit (1 );
210
+ }
211
+
212
+ if (decision_index == -1 ){
213
+ std::cerr << " Error: 'decision' column not found in RRMS CSV file" << std::endl;
214
+ exit (1 );
215
+ }
216
+
217
+ // Read all rows in the file and store the read IDs if the decision is
218
+ // 'stop_receiving' for accepted, or 'unblock' for rejected reads.
219
+ std::string pattern = accepted_reads ? " stop_receiving" : " unblock" ;
220
+ std::string line;
221
+ while (std::getline (rrms_file, line)){
222
+ std::vector<std::string> fields;
223
+ std::string field;
224
+ std::stringstream ss (line);
225
+ while (std::getline (ss, field, ' ,' )){
226
+ fields.push_back (field);
227
+ }
228
+
229
+ // Get the read ID and decision
230
+ std::string read_id = fields[read_id_index];
231
+ std::string decision = fields[decision_index];
232
+
233
+ // Store the read ID if the decision matches the pattern
234
+ if (decision == pattern){
235
+ rrms_read_ids.insert (read_id);
236
+ }
237
+ }
238
+
239
+
240
+ // Close the file
241
+ rrms_file.close ();
242
+
243
+ // // Print the first 10 read IDs
244
+ // std::cout << "First 10 read IDs:" << std::endl;
245
+ // for (int i=0; i<10; i++){
246
+ // std::cout << rrms_read_ids[i] << std::endl;
247
+ // }
248
+
249
+ return rrms_read_ids;
250
+ }
0 commit comments