Skip to content

Commit c197d94

Browse files
authored
Merge pull request #47 from WGLab/46-add-rrms-support
Add RRMS support
2 parents 10833e6 + 907f9ec commit c197d94

14 files changed

+400
-306
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,4 @@ SampleData
2727
*.log
2828

2929
# Testing scripts
30-
tests/SCRIPTS.txt
30+
scripts/

README.md

Lines changed: 21 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,10 @@
77
LongReadSum supports FASTA, FASTQ, BAM, FAST5, and sequencing_summary.txt file formats for quick generation of QC data in HTML and text format.
88

99
## Software requirements
10-
Please refer to `environment.yml` for detail. For your quick reference, LongReadSum needs
11-
```
12-
- python=3.9
13-
- hdf5
14-
- htslib
15-
- swig
16-
- matplotlib
17-
```
10+
Please refer to the conda
11+
[environment.yml](https://github.com/WGLab/LongReadSum/blob/main/environment.yml)
12+
file for all required packages.
13+
1814
# Installation using Anaconda (Linux 64-bit)
1915
First, install [Anaconda](https://www.anaconda.com/).
2016
LongReadSum can be installed using the following command:
@@ -71,34 +67,20 @@ export HDF5_PLUGIN_PATH=/full/path/to/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/h
7167

7268

7369
## Running
74-
Activate the conda environment:
75-
76-
`conda activate lrst_py39`
77-
78-
To test that you are using the correct Python interpreter, run:
79-
80-
`which python`
81-
82-
This should point to the environment's Python interpreter path:
83-
84-
`~/miniconda3/envs/lrst_py39/bin/python`
85-
86-
If the path is incorrect, export its location to `PATH`:
87-
88-
`export PATH=~/miniconda3/envs/lrst_py39/bin:$PATH`
89-
90-
Then you can run LongReadSum using the following command:
91-
92-
`python /path/to/LongReadSum [arguments]`
70+
Activate the conda environment and then run with arguments:
71+
```
72+
conda activate longreadsum
73+
python longreadsum [arguments]
74+
```
9375

9476
# General Usage
9577

9678
Specifying input files:
9779

9880
```
99-
usage: LongReadSum [-h] {fa,fq,f5,seqtxt,bam} ...
81+
usage: longreadsum [-h] {fa,fq,f5,f5s,seqtxt,bam,rrms} ...
10082
101-
QC tools for long-read sequencing data
83+
Fast and comprehensive QC for long read sequencing data.
10284
10385
positional arguments:
10486
{fa,fq,f5,f5s,seqtxt,bam,rrms}
@@ -108,16 +90,23 @@ positional arguments:
10890
f5s FAST5 file input with signal statistics output
10991
seqtxt sequencing_summary.txt input
11092
bam BAM file input
93+
rrms RRMS BAM file input
11194
11295
optional arguments:
11396
-h, --help show this help message and exit
11497
11598
Example with single inputs:
116-
python LongReadSum bam -i path/to/input.bam -o /output_directory/
99+
longreadsum bam -i input.bam -o output_directory -t 12
117100
118101
Example with multiple inputs:
119-
python LongReadSum bam -I "path/to/input1.bam, path/to/input2.bam" -o /output_directory/
120-
python LongReadSum bam -P "path/to/*.bam" -o /output_directory/
102+
longreadsum bam -I "input1.bam, input2.bam" -o output_directory
103+
longreadsum bam -P "*.bam" -o output_directory
104+
105+
RRMS example:
106+
longreadsum rrms --csv rrms_results.csv --input input.bam --output output_directory --threads 12
107+
108+
FAST5 signal mode example:
109+
longreadsum f5s --input input.fast5 --output output_directory
121110
```
122111

123112
# Revision history

__main__.py

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,8 @@
1-
"""
2-
__main__.py:
3-
Call the command-line interface.
4-
"""
5-
6-
import os
1+
# __main__.py: Call the command-line interface.
72

83
from src import cli
94

105

11-
from os.path import dirname, abspath
12-
13-
# Get the parent directory
14-
parent_dir = dirname(dirname(abspath(__file__)))
15-
16-
# # Set the HDF5 plugin path
17-
# os.environ['HDF5_PLUGIN_PATH'] = os.path.join(parent_dir, "lib")
18-
print("HDF5_PLUGIN_PATH is " + os.environ.get('HDF5_PLUGIN_PATH', ''))
19-
20-
216
def main():
227
cli.main()
238

include/bam_module.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,13 @@ class BAM_Module{
1616
std::map<std::string, bool> secondary_alignment;
1717
std::map<std::string, bool> supplementary_alignment;
1818

19+
int run(Input_Para& input_params, Output_BAM& final_output);
1920
int calculateStatistics(Input_Para& input_params, Output_BAM& final_output);
2021
static void batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& ref_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex);
22+
23+
// RRMS
24+
// Read the RRMS CSV file and store the read IDs (accepted or rejected)
25+
std::unordered_set<std::string> readRRMSFile(std::string rrms_csv_file, bool accepted_reads);
2126
};
2227

2328
#endif

include/hts_reader.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class HTSReader {
3535
int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics* basic_qc, uint64_t *base_quality_distribution);
3636

3737
// Read the next batch of records from the BAM file
38-
int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex);
38+
int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids);
3939

4040
// Return if the reader has finished reading the BAM file
4141
bool hasNextRecord();

include/input_parameters.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Define the Python bindings from our C++ modules
88

99
#include <vector>
1010
#include <string>
11+
#include <unordered_set> // For RRMS read ID filtering
1112
#define MAX_INPUT_FILES 2048
1213

1314

@@ -28,6 +29,9 @@ class Input_Para{
2829
std::string output_folder; // Output folder
2930
std::string input_files[MAX_INPUT_FILES]; // Input files
3031
std::string read_ids; // Read IDs comma-separated (FAST5 signal module)
32+
std::string rrms_csv; // CSV file with accepted/rejected read IDs (RRMS module)
33+
bool rrms_filter; // Generate RRMS stats for accepted (true) or rejected (false) reads
34+
std::unordered_set<std::string> rrms_read_ids; // List of read IDs from RRMS CSV file (accepted or rejected)
3135

3236
// Functions
3337
std::string add_input_file(const std::string& _ip_file);

include/output_data.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,8 @@ Define the output structures for each module.
1414
#include "input_parameters.h"
1515

1616
#define MAX_READ_LENGTH 10485760
17-
#define MAX_MAP_QUALITY 256
18-
#define MAX_BASE_QUALITY 256
19-
#define MAX_READ_QUALITY 256
17+
#define MAX_BASE_QUALITY 100
18+
#define MAX_READ_QUALITY 100
2019
#define MAX_SIGNAL_VALUE 5000
2120
#define PERCENTAGE_ARRAY_SIZE 101
2221
#define ZeroDefault 0

src/bam_module.cpp

Lines changed: 123 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,50 @@ Class for generating BAM file statistics. Records are accessed using multi-threa
44
*/
55

66
#include <iostream>
7+
#include <fstream>
8+
#include <string>
79
#include <thread>
810
#include <iostream>
911
#include <cmath>
12+
#include <unordered_set>
1013

1114
#include "bam_module.h"
1215

16+
// Run the BAM module
17+
int BAM_Module::run(Input_Para &input_params, Output_BAM &final_output)
18+
{
19+
int exit_code = 0;
20+
21+
// Determine if RRMS read ID filtering is required
22+
if (input_params.rrms_csv != ""){
23+
std::cout << "RRMS read ID filtering enabled" << std::endl;
24+
std::cout << "RRMS CSV file: " << input_params.rrms_csv << std::endl;
25+
26+
// Determine if RRMS stats should be generated for accepted or rejected
27+
// reads
28+
if (input_params.rrms_filter){
29+
std::cout << "RRMS stats will be generated for accepted reads" << std::endl;
30+
} else {
31+
std::cout << "RRMS stats will be generated for rejected reads" << std::endl;
32+
}
33+
34+
// Read the RRMS CSV file and store the read IDs
35+
std::cout << "Reading RRMS CSV file..." << std::endl;
36+
std::unordered_set<std::string> rrms_read_ids = readRRMSFile(input_params.rrms_csv, input_params.rrms_filter);
37+
std::cout << "Number of read IDs = " << rrms_read_ids.size() << std::endl;
38+
39+
// Store the read IDs in the input parameters
40+
input_params.rrms_read_ids = rrms_read_ids;
41+
}
1342

14-
int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_output){
43+
// Calculate statistics
44+
exit_code = calculateStatistics(input_params, final_output);
45+
46+
return exit_code;
47+
}
48+
49+
int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_output)
50+
{
1551
int exit_code = 0;
1652
auto relapse_start_time = std::chrono::high_resolution_clock::now();
1753

@@ -108,7 +144,13 @@ int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_
108144

109145
// Save the summary statistics to a file
110146
std::cout << "Saving summary statistics to file..." << std::endl;
111-
std::string summary_filepath = input_params.output_folder + "/bam_summary.txt";
147+
148+
// If in RRMS mode, append RRMS accepted/rejected to the output prefix
149+
std::string output_prefix = "bam";
150+
if (input_params.rrms_csv != ""){
151+
output_prefix += input_params.rrms_filter ? "_rrms_accepted" : "_rrms_rejected";
152+
}
153+
std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.txt";
112154
final_output.save_summary(summary_filepath, input_params, final_output);
113155
std::cout << "Saved file: " << summary_filepath << std::endl;
114156

@@ -120,13 +162,89 @@ int BAM_Module::calculateStatistics(Input_Para& input_params, Output_BAM& final_
120162

121163
// Process one batch of BAM records and merge the result into the shared output.
//
// Parameters:
//   reader       - BAM reader used to fetch the next batch of records.
//   batch_size   - Number of records requested for this batch.
//   input_params - Module inputs; rrms_read_ids is forwarded to the reader.
//   final_output - Shared output that the batch results are added to.
//   bam_mutex    - Passed to the reader's readNextRecords() call.
//   output_mutex - Guards the update of final_output.
//   cout_mutex   - Not used in this function.
void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, Input_Para& input_params, Output_BAM& final_output, std::mutex& bam_mutex, std::mutex& output_mutex, std::mutex& cout_mutex)
{
    // Read the next N records into a batch-local output
    Output_BAM record_output;
    reader.readNextRecords(batch_size, record_output, bam_mutex, input_params.rrms_read_ids);

    // Update the final output. The lock_guard releases output_mutex when it
    // goes out of scope, including when add() exits via an exception.
    std::lock_guard<std::mutex> output_lock(output_mutex);
    final_output.add(record_output);
}
174+
175+
// Split a single CSV line into its comma-separated fields.
// A trailing carriage return (from CRLF line endings) is removed first so that
// it does not end up attached to the last field.
static std::vector<std::string> splitRRMSLine(std::string line)
{
    if (!line.empty() && line.back() == '\r'){
        line.pop_back();
    }

    std::vector<std::string> fields;
    std::stringstream ss(line);
    std::string field;
    while (std::getline(ss, field, ',')){
        fields.push_back(field);
    }

    return fields;
}

// Read the RRMS CSV file and return the set of read IDs with the requested
// decision.
//
// The first line of the file is treated as the header and must contain the
// 'read_id' and 'decision' columns. Rows whose decision is 'stop_receiving'
// are collected when accepted_reads is true; rows whose decision is 'unblock'
// are collected when it is false.
//
// Parameters:
//   rrms_csv_file  - Path to the RRMS CSV file.
//   accepted_reads - true to collect accepted reads, false for rejected reads.
//
// Returns an unordered set of the matching read IDs.
// Prints an error and exits with status 1 if the file cannot be opened or a
// required column is missing from the header.
std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_file, bool accepted_reads)
{
    // Create an unordered set to store the read IDs for fast lookup
    std::unordered_set<std::string> rrms_read_ids;

    // Open the file and fail early with a specific message if that is not
    // possible (otherwise the failure would surface as a missing column)
    std::ifstream rrms_file(rrms_csv_file);
    if (!rrms_file.is_open()){
        std::cerr << "Error: Unable to open RRMS CSV file: " << rrms_csv_file << std::endl;
        exit(1);
    }

    // Read the header and find the 'read_id' and 'decision' columns
    std::string header;
    std::getline(rrms_file, header);
    const std::vector<std::string> header_fields = splitRRMSLine(header);

    int read_id_index = -1;
    int decision_index = -1;
    for (size_t i = 0; i < header_fields.size(); i++){
        if (header_fields[i] == "read_id"){
            read_id_index = static_cast<int>(i);
        } else if (header_fields[i] == "decision"){
            decision_index = static_cast<int>(i);
        }
    }

    // Exit if the read_id or decision columns are not found
    if (read_id_index == -1){
        std::cerr << "Error: 'read_id' column not found in RRMS CSV file" << std::endl;
        exit(1);
    }

    if (decision_index == -1){
        std::cerr << "Error: 'decision' column not found in RRMS CSV file" << std::endl;
        exit(1);
    }

    // A row needs at least this many fields for both columns to be present
    const int last_index = (read_id_index > decision_index) ? read_id_index : decision_index;
    const size_t min_fields = static_cast<size_t>(last_index) + 1;

    // Read all rows in the file and store the read IDs if the decision is
    // 'stop_receiving' for accepted, or 'unblock' for rejected reads.
    const std::string pattern = accepted_reads ? "stop_receiving" : "unblock";
    std::string line;
    while (std::getline(rrms_file, line)){
        const std::vector<std::string> fields = splitRRMSLine(line);

        // Skip blank or truncated rows instead of indexing out of bounds
        if (fields.size() < min_fields){
            continue;
        }

        // Store the read ID if the decision matches the pattern
        if (fields[decision_index] == pattern){
            rrms_read_ids.insert(fields[read_id_index]);
        }
    }

    // The file is closed automatically when rrms_file goes out of scope
    return rrms_read_ids;
}

0 commit comments

Comments
 (0)