|
| 1 | +use anyhow::Result; |
| 2 | +use bio::io::fasta; |
| 3 | +use rand::prelude::{SliceRandom, ThreadRng}; |
| 4 | +use rand::seq::IteratorRandom; |
| 5 | +use rust_htslib::bam; |
| 6 | +use rust_htslib::bam::Read; |
| 7 | +use std::collections::HashMap; |
| 8 | +use std::ops::Range; |
| 9 | +use std::path::Path; |
| 10 | +use uuid::Uuid; |
| 11 | + |
| 12 | +pub fn anonymize_reads<P: AsRef<Path>>( |
| 13 | + bam: P, |
| 14 | + input_ref: P, |
| 15 | + output_bam: P, |
| 16 | + output_ref: P, |
| 17 | + chr: String, |
| 18 | + interval: Range<u64>, |
| 19 | + keep_only_pairs: bool, |
| 20 | +) -> Result<()> { |
| 21 | + let start = interval.start; |
| 22 | + let end = interval.end; |
| 23 | + let mut fasta_reader = fasta::IndexedReader::from_file(&input_ref)?; |
| 24 | + fasta_reader.fetch(&chr, start, end)?; |
| 25 | + let mut reference = Vec::new(); |
| 26 | + fasta_reader.read(&mut reference)?; |
| 27 | + let mut rng = rand::thread_rng(); |
| 28 | + let alphabet = [b'A', b'C', b'G', b'T']; |
| 29 | + |
| 30 | + //Build artificial reference |
| 31 | + let mut artificial_reference = Vec::new(); |
| 32 | + add_random_bases(end - start, &mut artificial_reference, &mut rng, &alphabet)?; |
| 33 | + let mut altered_bases = init_altered_bases(&reference, &artificial_reference)?; |
| 34 | + let mut fa_writer = fasta::Writer::to_file(output_ref)?; |
| 35 | + let ref_id = Uuid::new_v4().to_hyphenated().to_string(); |
| 36 | + fa_writer.write(&ref_id, None, &artificial_reference)?; |
| 37 | + |
| 38 | + let mut bam_reader = bam::IndexedReader::from_path(bam)?; |
| 39 | + bam_reader.fetch((chr.as_bytes(), start, end + 1))?; |
| 40 | + |
| 41 | + let mut header = bam::Header::new(); |
| 42 | + header.push_record( |
| 43 | + bam::header::HeaderRecord::new(b"SQ") |
| 44 | + .push_tag(b"SN", &ref_id) |
| 45 | + .push_tag(b"LN", &(end - start)), |
| 46 | + ); |
| 47 | + let mut bam_writer = bam::Writer::from_path(output_bam, &header, bam::Format::Bam)?; |
| 48 | + let mate_in_range = |record: &bam::Record| -> bool { |
| 49 | + (record.mtid() == record.tid()) |
| 50 | + && (record.mpos() >= (start as i64)) |
| 51 | + && (record.mpos() < (end as i64)) |
| 52 | + }; |
| 53 | + for result in bam_reader.records() { |
| 54 | + let mut record = result?; |
| 55 | + if (record.pos() >= start as i64) |
| 56 | + && (record.cigar().end_pos() < end as i64) |
| 57 | + && (!keep_only_pairs || mate_in_range(&record)) |
| 58 | + { |
| 59 | + record.cache_cigar(); |
| 60 | + //Check if mate record end within region |
| 61 | + let artificial_seq = if record.is_unmapped() { |
| 62 | + let mut seq = Vec::new(); |
| 63 | + add_random_bases(record.seq_len() as u64, &mut seq, &mut rng, &alphabet)?; |
| 64 | + seq |
| 65 | + } else { |
| 66 | + build_sequence( |
| 67 | + &mut altered_bases, |
| 68 | + &record, |
| 69 | + start as usize, |
| 70 | + &mut rng, |
| 71 | + &alphabet, |
| 72 | + )? |
| 73 | + }; |
| 74 | + let artificial_record = build_record(&record, &artificial_seq, start as i64)?; |
| 75 | + bam_writer.write(&artificial_record)?; |
| 76 | + } |
| 77 | + } |
| 78 | + Ok(()) |
| 79 | +} |
| 80 | + |
| 81 | +fn init_altered_bases( |
| 82 | + original_ref: &[u8], |
| 83 | + artificial_reference: &[u8], |
| 84 | +) -> Result<HashMap<usize, HashMap<u8, u8>>> { |
| 85 | + let mut altered_bases = HashMap::new(); |
| 86 | + for (i, (artifical_base, original_base)) in artificial_reference |
| 87 | + .iter() |
| 88 | + .zip(original_ref.iter()) |
| 89 | + .enumerate() |
| 90 | + { |
| 91 | + altered_bases |
| 92 | + .entry(i) |
| 93 | + .or_insert_with(HashMap::new) |
| 94 | + .insert(*original_base, *artifical_base); |
| 95 | + } |
| 96 | + Ok(altered_bases) |
| 97 | +} |
| 98 | + |
| 99 | +fn build_record(record: &bam::Record, artificial_seq: &[u8], offset: i64) -> Result<bam::Record> { |
| 100 | + let mut artificial_record = bam::record::Record::new(); |
| 101 | + if let Ok(mate_cigar) = record.aux(b"MC") { |
| 102 | + artificial_record.push_aux(b"MC", mate_cigar)?; |
| 103 | + } |
| 104 | + artificial_record.set( |
| 105 | + record.qname(), |
| 106 | + Some(&record.cigar()), |
| 107 | + artificial_seq, |
| 108 | + record.qual(), |
| 109 | + ); |
| 110 | + artificial_record.set_pos(record.pos() - offset); |
| 111 | + artificial_record.set_tid(0); |
| 112 | + artificial_record.set_mtid(0); |
| 113 | + artificial_record.set_mpos(record.mpos() - offset); |
| 114 | + artificial_record.set_flags(record.flags()); |
| 115 | + artificial_record.set_insert_size(record.insert_size()); |
| 116 | + artificial_record.set_mapq(record.mapq()); |
| 117 | + Ok(artificial_record) |
| 118 | +} |
| 119 | + |
| 120 | +fn build_sequence( |
| 121 | + altered_bases: &mut HashMap<usize, HashMap<u8, u8>>, |
| 122 | + record: &bam::Record, |
| 123 | + offset: usize, |
| 124 | + rng: &mut ThreadRng, |
| 125 | + alphabet: &[u8], |
| 126 | +) -> Result<Vec<u8>> { |
| 127 | + let mut artificial_seq = Vec::new(); |
| 128 | + let record_seq = record.seq().as_bytes(); |
| 129 | + let mut record_pos = 0; |
| 130 | + let mut ref_pos = record.pos() as usize - offset; |
| 131 | + //Create random seq for leading softclips |
| 132 | + for cigar in record.cigar_cached().unwrap().iter() { |
| 133 | + match cigar.char() { |
| 134 | + 'S' => { |
| 135 | + add_random_bases(cigar.len() as u64, &mut artificial_seq, rng, alphabet)?; |
| 136 | + record_pos += cigar.len() as usize; |
| 137 | + } |
| 138 | + 'M' | 'X' | '=' => { |
| 139 | + (0..cigar.len()).for_each(|_| { |
| 140 | + let base_mappings = altered_bases.get(&ref_pos).unwrap().clone(); |
| 141 | + let altered_base = *altered_bases |
| 142 | + .get_mut(&ref_pos) |
| 143 | + .unwrap() |
| 144 | + .entry(*record_seq.get(record_pos).unwrap()) |
| 145 | + .or_insert_with(|| { |
| 146 | + *alphabet |
| 147 | + .iter() |
| 148 | + .filter(|&x| !base_mappings.values().any(|y| x == y)) |
| 149 | + .choose(rng) |
| 150 | + .unwrap() |
| 151 | + }); |
| 152 | + artificial_seq.push(altered_base); |
| 153 | + ref_pos += 1; |
| 154 | + record_pos += 1; |
| 155 | + }); |
| 156 | + // Add reference bases except for mismatches |
| 157 | + } |
| 158 | + 'I' => { |
| 159 | + add_random_bases(cigar.len() as u64, &mut artificial_seq, rng, alphabet)?; |
| 160 | + record_pos += cigar.len() as usize; |
| 161 | + } |
| 162 | + 'D' | 'N' => { |
| 163 | + ref_pos += cigar.len() as usize; |
| 164 | + } |
| 165 | + _ => {} |
| 166 | + } |
| 167 | + } |
| 168 | + |
| 169 | + Ok(artificial_seq) |
| 170 | +} |
| 171 | + |
| 172 | +fn add_random_bases( |
| 173 | + length: u64, |
| 174 | + seq: &mut Vec<u8>, |
| 175 | + rng: &mut ThreadRng, |
| 176 | + alphabet: &[u8], |
| 177 | +) -> Result<()> { |
| 178 | + (0..length).for_each(|_| seq.push(*alphabet.choose(rng).unwrap())); |
| 179 | + Ok(()) |
| 180 | +} |
0 commit comments