Skip to content

Commit

Permalink
metrics: Split describe metrics into modules
Browse files Browse the repository at this point in the history
  • Loading branch information
zaeleus committed Nov 20, 2024
1 parent 980321c commit dc7db10
Show file tree
Hide file tree
Showing 8 changed files with 276 additions and 141 deletions.
148 changes: 7 additions & 141 deletions src/commands/describe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use tracing::info;
use crate::{
cli::DescribeArgs,
fastq::{self, Record},
metrics,
};

pub fn describe(args: DescribeArgs) -> io::Result<()> {
Expand All @@ -13,154 +14,19 @@ pub fn describe(args: DescribeArgs) -> io::Result<()> {
let mut reader = fastq::open(args.src)?;
let mut record = Record::default();

let mut metrics = Metrics::default();
let mut metrics = metrics::default();

while reader.read_record(&mut record)? != 0 {
visit(&mut metrics, &record)?;
}

print_metrics(&metrics);

info!("done");

Ok(())
}

#[derive(Clone, Copy, Default)]
struct ErrorProbability {
sum: f64,
count: u64,
}

struct Metrics {
record_count: u64,
min_sequence_length: usize,
max_sequence_length: usize,
error_probability_sums_per_position: Vec<ErrorProbability>,
}

impl Default for Metrics {
fn default() -> Self {
Self {
record_count: 0,
min_sequence_length: usize::MAX,
max_sequence_length: usize::MIN,
error_probability_sums_per_position: Vec::new(),
for metric in &mut metrics {
metric.visit(&record)?;
}
}
}

fn visit(metrics: &mut Metrics, record: &Record) -> io::Result<()> {
metrics.record_count += 1;

let read_length = record.sequence().len();

metrics.min_sequence_length = metrics.min_sequence_length.min(read_length);
metrics.max_sequence_length = metrics.max_sequence_length.max(read_length);

if read_length > metrics.error_probability_sums_per_position.len() {
metrics
.error_probability_sums_per_position
.resize(read_length, ErrorProbability::default());
for metric in &metrics {
metric.println();
}

for (quality_score, error_probability) in record
.quality_scores()
.iter()
.zip(&mut metrics.error_probability_sums_per_position)
{
let q = decode_score(*quality_score)?;
let p = phred_score_to_error_probability(q);
error_probability.sum += p;
error_probability.count += 1;
}
info!("done");

Ok(())
}

fn decode_score(c: u8) -> io::Result<u8> {
const OFFSET: u8 = b'!';

c.checked_sub(OFFSET)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "invalid quality score"))
}

// https://en.wikipedia.org/wiki/Phred_quality_score#Definition
const BASE: f64 = 10.0;
const FACTOR: f64 = 10.0;

fn phred_score_to_error_probability(n: u8) -> f64 {
BASE.powf(-f64::from(n) / FACTOR)
}

fn error_probability_to_phred_score(p: f64) -> f64 {
-FACTOR * p.log10()
}

fn print_metrics(metrics: &Metrics) {
let record_count = metrics.record_count;

println!("record_count\t{record_count}");

let min_sequence_length = if record_count == 0 {
0
} else {
metrics.min_sequence_length
};

println!("min_sequence_length\t{min_sequence_length}");

let max_sequence_length = if record_count == 0 {
0
} else {
metrics.max_sequence_length
};

println!("max_sequence_length\t{max_sequence_length}");

let avg_quality_score_per_position: Vec<_> = metrics
.error_probability_sums_per_position
.iter()
.map(|error_probability| {
let n = error_probability.count as f64;
let avg_error_probability = error_probability.sum / n;
error_probability_to_phred_score(avg_error_probability)
})
.collect();

println!("avg_quality_score_per_position\t{avg_quality_score_per_position:?}");
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_decode_score() -> io::Result<()> {
assert_eq!(decode_score(b'!')?, 0);
assert_eq!(decode_score(b'~')?, 93);
assert!(matches!(
decode_score(0x00),
Err(e) if e.kind() == io::ErrorKind::InvalidData
));
Ok(())
}

#[test]
fn test_phred_score_to_error_probability() {
assert_eq!(phred_score_to_error_probability(0), 1.0);
assert_eq!(phred_score_to_error_probability(10), 0.1);
assert_eq!(phred_score_to_error_probability(20), 0.01);
assert_eq!(phred_score_to_error_probability(30), 0.001);
assert_eq!(phred_score_to_error_probability(40), 0.0001);
}

#[test]
fn test_error_probability_to_phred_score() {
assert_eq!(error_probability_to_phred_score(1.0), 0.0);
assert_eq!(error_probability_to_phred_score(0.1), 10.0);
assert_eq!(error_probability_to_phred_score(0.01), 20.0);
assert_eq!(error_probability_to_phred_score(0.001), 30.0);
assert_eq!(error_probability_to_phred_score(0.0001), 40.0);
}
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ pub mod commands;
pub mod distributions;
pub mod fastq;
pub mod generator;
mod metrics;
pub mod pair_writer;
pub mod validators;

Expand Down
21 changes: 21 additions & 0 deletions src/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
mod avg_quality_score_per_position;
mod max_sequence_length;
mod metric;
mod min_sequence_length;
mod record_count;

pub use self::metric::Metric;
use self::{
avg_quality_score_per_position::AvgQualityScorePerPosition,
max_sequence_length::MaxSequenceLength, min_sequence_length::MinSequenceLength,
record_count::RecordCount,
};

pub fn default() -> Vec<Box<dyn Metric>> {
vec![
Box::new(RecordCount::default()),
Box::new(MinSequenceLength::default()),
Box::new(MaxSequenceLength::default()),
Box::new(AvgQualityScorePerPosition::default()),
]
}
110 changes: 110 additions & 0 deletions src/metrics/avg_quality_score_per_position.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
use std::io;

use super::Metric;
use crate::fastq::Record;

const NAME: &str = "avg_quality_score_per_position";

#[derive(Clone, Copy, Default)]
struct ErrorProbability {
sum: f64,
count: u64,
}

#[derive(Default)]
pub struct AvgQualityScorePerPosition {
error_probability_sums_per_position: Vec<ErrorProbability>,
}

impl Metric for AvgQualityScorePerPosition {
fn visit(&mut self, record: &Record) -> io::Result<()> {
let read_length = record.sequence().len();

if read_length > self.error_probability_sums_per_position.len() {
self.error_probability_sums_per_position
.resize(read_length, ErrorProbability::default());
}

for (quality_score, error_probability) in record
.quality_scores()
.iter()
.zip(&mut self.error_probability_sums_per_position)
{
let q = decode_score(*quality_score)?;
let p = phred_score_to_error_probability(q);
error_probability.sum += p;
error_probability.count += 1;
}

Ok(())
}

fn println(&self) {
let avg_quality_score_per_position: Vec<_> = self
.error_probability_sums_per_position
.iter()
.map(|error_probability| {
let n = error_probability.count as f64;
let avg_error_probability = error_probability.sum / n;
error_probability_to_phred_score(avg_error_probability)
})
.collect();

println!("{NAME}\t{avg_quality_score_per_position:?}");
}
}

fn decode_score(c: u8) -> io::Result<u8> {
const OFFSET: u8 = b'!';

c.checked_sub(OFFSET)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "invalid quality score"))
}

// https://en.wikipedia.org/wiki/Phred_quality_score#Definition
const BASE: f64 = 10.0;
const FACTOR: f64 = 10.0;

fn phred_score_to_error_probability(n: u8) -> f64 {
BASE.powf(-f64::from(n) / FACTOR)
}

fn error_probability_to_phred_score(p: f64) -> f64 {
-FACTOR * p.log10()
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_decode_score() -> io::Result<()> {
assert_eq!(decode_score(b'!')?, 0);
assert_eq!(decode_score(b'~')?, 93);

assert!(matches!(
decode_score(0x00),
Err(e) if e.kind() == io::ErrorKind::InvalidData
));

Ok(())
}

#[test]
fn test_phred_score_to_error_probability() {
assert_eq!(phred_score_to_error_probability(0), 1.0);
assert_eq!(phred_score_to_error_probability(10), 0.1);
assert_eq!(phred_score_to_error_probability(20), 0.01);
assert_eq!(phred_score_to_error_probability(30), 0.001);
assert_eq!(phred_score_to_error_probability(40), 0.0001);
}

#[test]
fn test_error_probability_to_phred_score() {
assert_eq!(error_probability_to_phred_score(1.0), 0.0);
assert_eq!(error_probability_to_phred_score(0.1), 10.0);
assert_eq!(error_probability_to_phred_score(0.01), 20.0);
assert_eq!(error_probability_to_phred_score(0.001), 30.0);
assert_eq!(error_probability_to_phred_score(0.0001), 40.0);
}
}
38 changes: 38 additions & 0 deletions src/metrics/max_sequence_length.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
use std::io;

use super::Metric;
use crate::fastq::Record;

const NAME: &str = "max_sequence_length";

#[derive(Default)]
pub struct MaxSequenceLength(usize);

impl Metric for MaxSequenceLength {
fn visit(&mut self, record: &Record) -> io::Result<()> {
let read_length = record.sequence().len();
self.0 = self.0.max(read_length);
Ok(())
}

fn println(&self) {
println!("{NAME}\t{}", self.0);
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_visit() -> io::Result<()> {
let mut metric = MaxSequenceLength::default();
assert_eq!(metric.0, 0);

let record = Record::new("", "ACGT", "", "");
metric.visit(&record)?;
assert_eq!(metric.0, 4);

Ok(())
}
}
8 changes: 8 additions & 0 deletions src/metrics/metric.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
use std::io;

use crate::fastq::Record;

pub trait Metric {
fn visit(&mut self, record: &Record) -> io::Result<()>;
fn println(&self);
}
Loading

0 comments on commit dc7db10

Please sign in to comment.