Skip to content

Commit 22b884f

Browse files
authored
feat: simulate reads from real bam files (#181)
* initial commit simulate reads * Fix softclipping offset * Add MC aux field * Fix coordinates * Handle unmapped reads * Refactoring * Initialize variable * Fixes for rust-htslib 0.38 * parameter to enforce pairs * Introduce altered bases hashmap * Retain variant consistency
1 parent 93e7edb commit 22b884f

File tree

4 files changed

+227
-0
lines changed

4 files changed

+227
-0
lines changed

src/bam/anonymize_reads.rs

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
use anyhow::Result;
2+
use bio::io::fasta;
3+
use rand::prelude::{SliceRandom, ThreadRng};
4+
use rand::seq::IteratorRandom;
5+
use rust_htslib::bam;
6+
use rust_htslib::bam::Read;
7+
use std::collections::HashMap;
8+
use std::ops::Range;
9+
use std::path::Path;
10+
use uuid::Uuid;
11+
12+
pub fn anonymize_reads<P: AsRef<Path>>(
13+
bam: P,
14+
input_ref: P,
15+
output_bam: P,
16+
output_ref: P,
17+
chr: String,
18+
interval: Range<u64>,
19+
keep_only_pairs: bool,
20+
) -> Result<()> {
21+
let start = interval.start;
22+
let end = interval.end;
23+
let mut fasta_reader = fasta::IndexedReader::from_file(&input_ref)?;
24+
fasta_reader.fetch(&chr, start, end)?;
25+
let mut reference = Vec::new();
26+
fasta_reader.read(&mut reference)?;
27+
let mut rng = rand::thread_rng();
28+
let alphabet = [b'A', b'C', b'G', b'T'];
29+
30+
//Build artificial reference
31+
let mut artificial_reference = Vec::new();
32+
add_random_bases(end - start, &mut artificial_reference, &mut rng, &alphabet)?;
33+
let mut altered_bases = init_altered_bases(&reference, &artificial_reference)?;
34+
let mut fa_writer = fasta::Writer::to_file(output_ref)?;
35+
let ref_id = Uuid::new_v4().to_hyphenated().to_string();
36+
fa_writer.write(&ref_id, None, &artificial_reference)?;
37+
38+
let mut bam_reader = bam::IndexedReader::from_path(bam)?;
39+
bam_reader.fetch((chr.as_bytes(), start, end + 1))?;
40+
41+
let mut header = bam::Header::new();
42+
header.push_record(
43+
bam::header::HeaderRecord::new(b"SQ")
44+
.push_tag(b"SN", &ref_id)
45+
.push_tag(b"LN", &(end - start)),
46+
);
47+
let mut bam_writer = bam::Writer::from_path(output_bam, &header, bam::Format::Bam)?;
48+
let mate_in_range = |record: &bam::Record| -> bool {
49+
(record.mtid() == record.tid())
50+
&& (record.mpos() >= (start as i64))
51+
&& (record.mpos() < (end as i64))
52+
};
53+
for result in bam_reader.records() {
54+
let mut record = result?;
55+
if (record.pos() >= start as i64)
56+
&& (record.cigar().end_pos() < end as i64)
57+
&& (!keep_only_pairs || mate_in_range(&record))
58+
{
59+
record.cache_cigar();
60+
//Check if mate record end within region
61+
let artificial_seq = if record.is_unmapped() {
62+
let mut seq = Vec::new();
63+
add_random_bases(record.seq_len() as u64, &mut seq, &mut rng, &alphabet)?;
64+
seq
65+
} else {
66+
build_sequence(
67+
&mut altered_bases,
68+
&record,
69+
start as usize,
70+
&mut rng,
71+
&alphabet,
72+
)?
73+
};
74+
let artificial_record = build_record(&record, &artificial_seq, start as i64)?;
75+
bam_writer.write(&artificial_record)?;
76+
}
77+
}
78+
Ok(())
79+
}
80+
81+
fn init_altered_bases(
82+
original_ref: &[u8],
83+
artificial_reference: &[u8],
84+
) -> Result<HashMap<usize, HashMap<u8, u8>>> {
85+
let mut altered_bases = HashMap::new();
86+
for (i, (artifical_base, original_base)) in artificial_reference
87+
.iter()
88+
.zip(original_ref.iter())
89+
.enumerate()
90+
{
91+
altered_bases
92+
.entry(i)
93+
.or_insert_with(HashMap::new)
94+
.insert(*original_base, *artifical_base);
95+
}
96+
Ok(altered_bases)
97+
}
98+
99+
fn build_record(record: &bam::Record, artificial_seq: &[u8], offset: i64) -> Result<bam::Record> {
100+
let mut artificial_record = bam::record::Record::new();
101+
if let Ok(mate_cigar) = record.aux(b"MC") {
102+
artificial_record.push_aux(b"MC", mate_cigar)?;
103+
}
104+
artificial_record.set(
105+
record.qname(),
106+
Some(&record.cigar()),
107+
artificial_seq,
108+
record.qual(),
109+
);
110+
artificial_record.set_pos(record.pos() - offset);
111+
artificial_record.set_tid(0);
112+
artificial_record.set_mtid(0);
113+
artificial_record.set_mpos(record.mpos() - offset);
114+
artificial_record.set_flags(record.flags());
115+
artificial_record.set_insert_size(record.insert_size());
116+
artificial_record.set_mapq(record.mapq());
117+
Ok(artificial_record)
118+
}
119+
120+
fn build_sequence(
121+
altered_bases: &mut HashMap<usize, HashMap<u8, u8>>,
122+
record: &bam::Record,
123+
offset: usize,
124+
rng: &mut ThreadRng,
125+
alphabet: &[u8],
126+
) -> Result<Vec<u8>> {
127+
let mut artificial_seq = Vec::new();
128+
let record_seq = record.seq().as_bytes();
129+
let mut record_pos = 0;
130+
let mut ref_pos = record.pos() as usize - offset;
131+
//Create random seq for leading softclips
132+
for cigar in record.cigar_cached().unwrap().iter() {
133+
match cigar.char() {
134+
'S' => {
135+
add_random_bases(cigar.len() as u64, &mut artificial_seq, rng, alphabet)?;
136+
record_pos += cigar.len() as usize;
137+
}
138+
'M' | 'X' | '=' => {
139+
(0..cigar.len()).for_each(|_| {
140+
let base_mappings = altered_bases.get(&ref_pos).unwrap().clone();
141+
let altered_base = *altered_bases
142+
.get_mut(&ref_pos)
143+
.unwrap()
144+
.entry(*record_seq.get(record_pos).unwrap())
145+
.or_insert_with(|| {
146+
*alphabet
147+
.iter()
148+
.filter(|&x| !base_mappings.values().any(|y| x == y))
149+
.choose(rng)
150+
.unwrap()
151+
});
152+
artificial_seq.push(altered_base);
153+
ref_pos += 1;
154+
record_pos += 1;
155+
});
156+
// Add reference bases except for mismatches
157+
}
158+
'I' => {
159+
add_random_bases(cigar.len() as u64, &mut artificial_seq, rng, alphabet)?;
160+
record_pos += cigar.len() as usize;
161+
}
162+
'D' | 'N' => {
163+
ref_pos += cigar.len() as usize;
164+
}
165+
_ => {}
166+
}
167+
}
168+
169+
Ok(artificial_seq)
170+
}
171+
172+
fn add_random_bases(
173+
length: u64,
174+
seq: &mut Vec<u8>,
175+
rng: &mut ThreadRng,
176+
alphabet: &[u8],
177+
) -> Result<()> {
178+
(0..length).for_each(|_| seq.push(*alphabet.choose(rng).unwrap()));
179+
Ok(())
180+
}

src/bam/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
//! Tools that work on BAM files
2+
pub mod anonymize_reads;
23
pub mod collapse_reads_to_fragments;
34
pub mod depth;
45
pub mod plot;

src/cli.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,34 @@ pub(crate) enum Command {
335335
cmd: CollapseReadsToFragmentsSubcommand,
336336
},
337337

338+
/// Tool to build artifical reads from real BAM files with identical properties.
339+
#[structopt(author = "Felix Mölder <felix.moelder@uni-due.de>")]
340+
BamAnonymize {
341+
#[structopt(parse(from_os_str), help = "Input BAM file")]
342+
bam: PathBuf,
343+
#[structopt(parse(from_os_str), help = "Input reference as fasta file")]
344+
input_ref: PathBuf,
345+
#[structopt(parse(from_os_str), help = "Output BAM file with artificial reads")]
346+
output_bam: PathBuf,
347+
#[structopt(
348+
parse(from_os_str),
349+
help = "Output fasta file with artificial reference"
350+
)]
351+
output_ref: PathBuf,
352+
#[structopt(help = "chromosome name")]
353+
chr: String,
354+
#[structopt(help = "1-based start position")]
355+
start: u64,
356+
#[structopt(help = "1-based exclusive end position")]
357+
end: u64,
358+
#[structopt(
359+
long,
360+
short = "p",
361+
help = "Only simulates reads whos mates are both in defined range."
362+
)]
363+
keep_only_pairs: bool,
364+
},
365+
338366
/// Tool to compute stats on sequence file (from STDIN), output is in YAML with fields:
339367
/// - min: length of shortest sequence
340368
/// - max: length of longest sequence

src/main.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,24 @@ fn main() -> Result<()> {
261261
verbose_read_names,
262262
)?,
263263
},
264+
BamAnonymize {
265+
bam,
266+
input_ref,
267+
output_bam,
268+
output_ref,
269+
chr,
270+
start,
271+
end,
272+
keep_only_pairs,
273+
} => bam::anonymize_reads::anonymize_reads(
274+
bam,
275+
input_ref,
276+
output_bam,
277+
output_ref,
278+
chr,
279+
start - 1..end - 1,
280+
keep_only_pairs,
281+
)?,
264282
SequenceStats { fastq } => sequences_stats::stats(fastq)?,
265283
}
266284
Ok(())

0 commit comments

Comments
 (0)