Skip to content

Commit

Permalink
Merge branch 'release/v0.3.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
ACEnglish committed May 26, 2023
2 parents 926e731 + 9d5c8b4 commit 0f59592
Show file tree
Hide file tree
Showing 9 changed files with 187 additions and 90 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "regioners"
version = "0.2.3"
version = "0.3.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
72 changes: 51 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ a single intersection.
#### Excluding genomic regions with `--mask`
The genome may have regions where intervals should not be placed (e.g. reference gaps). Input intervals overlapping masked regions are removed and randomization will not place intervals there.

#### Local z-score `--window` and `--step`
`regioners` will calculate a local z-score for the two intervals' overlap
([details](https://www.bioconductor.org/packages/release/bioc/vignettes/regioneR/inst/doc/regioneR.html#local-z-score)).
The `--window` is how many basepairs upstream and downstream the intervals will be shifted to perform the local z-score and the
`--step` is the step size of the windows. For example, with a 1,000bp `--window` and `--step` of 100bp, the output will
have 20 local z-scores.

#### IO parameters
* `--genome` : A two column file with `chrom\tsize`. This becomes the space over which we can shuffle regions. If there are any regions
in the bed files on chromosomes not inside the `--genome` file, those regions will not be loaded.
Expand All @@ -94,41 +101,64 @@ For comparison, a regioneR test of *100* permutations on above data in an Rstudi
The output is a json with structure:
- A_cnt : number of entries in `-A` (note may be swapped from original paramter)
- B_cnt : number of entries in `-B` (note may be swapped from original paramter)
- alt : alternate hypothesis used for p-value - 'l'ess or 'g'reater
- count : overlap counter used
- no_merge : input beds overlaps were not merged before processing if true
- n : number of permutations performed
- obs : observed number of intersections
- per_chrom : randomization performed per-chromosome
- perm_mu : permutations' mean
- perm_sd : permutations' standard deviation
- perms : list of permutations' number of intersections
- pval : permutation test's p-value
- random : randomizer used
- swapped : were `-A` and `-B` swapped
- zscore : permutation test's zscore
- test : dictionary of test results
- localZ : dictionary of local z-score results

Test Key/Values
- alt : alternate hypothesis used for p-value - 'l'ess or 'g'reater
- mean : average number of overlaps of the permutations
- num_perms : number of permutations performed
- observed : observed number of intersections
- pval : permutation test's p-value
- perms : list of permutations' number of intersections
- std_dev : permutations' standard deviation
- z_score : permutation test's z-score

LocalZ Key/Values
- shifts : list of z-scores for each shift
- step : step size used
- window : window size used

## Plotting

Using python with seaborn:
```python
data = json.load(open("regioners_output.json"))
p = sb.histplot(data=data, x="perms",
color='gray', edgecolor='gray', kde=False, stat='density')
p = sb.kdeplot(data=data, x="perms",
color='black', ax=p)

x = data['obs']
import json
import seaborn as sb
import matplotlib.pyplot as plt

# Load results and get the test information for plotting
results = json.load(open("regioners_output.json"))
test = results['test']
# Draw the permutations' distribution
p = sb.histplot(data=test, x="perms",
color='gray', edgecolor='gray', kde=False, stat='density')
p = sb.kdeplot(data=test, x="perms",
color='black', ax=p)
# Draw a line at the observed intersections
obs = test['observed']
plt.axvline(obs, color='blue')
# Draw a box for annotation
props = dict(boxstyle='round', facecolor='wheat', alpha=0.9)
plt.axvline(x, color='blue')
plt.text(x, y, 'observed intersections',rotation=90, bbox=props, ma='center')
y = 0.007
plt.text(obs, y, 'observed intersections',rotation=90, bbox=props, ma='center')
p.set(xlabel="Intersection Count", ylabel="Permutation Density")
```
plt.show()

<img src="https://raw.githubusercontent.com/ACEnglish/regioners/main/figs/example_plot.png" alt="Girl in a jacket" style="width:250px;">
# Plot the local z-scores
local_z = data["localZ"]
p = sb.lineplot(x=range(-local_z["window"], local_z["window"], local_z["step"]), y=local_z['shifts'])
p.set(title="Local z-score values", xlabel="Shift", ylabel="z-score")
```

<img src="https://raw.githubusercontent.com/ACEnglish/regioners/main/figs/example_plot.png" alt="PermTest" style="width:250px;">
<img src="https://raw.githubusercontent.com/ACEnglish/regioners/main/figs/example_zscore.png" alt="LocalZ" style="width:250px;">

## ToDos:
## Future Features?:

- gzip file reading
- local z-score
Binary file added figs/example_zscore.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 8 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ pub struct ArgParser {
/// do not swap A and B
#[arg(long = "no-swap", default_value_t = false)]
pub no_swap: bool,

/// local Z-score window size
#[arg(long, default_value_t = 1000)]
pub window: i64,

/// local Z-score window step
#[arg(long, default_value_t = 50)]
pub step: u64,
}

impl ArgParser {
Expand Down
2 changes: 1 addition & 1 deletion src/gapbreaks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ pub struct GapBreaks {
impl GapBreaks {
pub fn new(total_gap_size: u64) -> Self {
Self {
total_gap_size: total_gap_size,
total_gap_size,
rand: StdRand::seed(ClockSeed::default().next_u64()),
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ pub fn read_mask(file: &Path) -> MaskShift {
ret
}

pub fn read_genome(file: &std::path::PathBuf, mask: &Option<MaskShift>) -> GenomeShift {
pub fn read_genome(file: &Path, mask: &Option<MaskShift>) -> GenomeShift {
/*
Read a two column genome into a GenomeShifter
*/
Expand Down
160 changes: 97 additions & 63 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ use std::thread::JoinHandle;
use clap::Parser;
#[cfg(feature = "progbars")]
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use rust_lapper::Lapper;
use serde::Serialize;
use serde_json::json;

mod cli;
Expand All @@ -21,41 +23,93 @@ mod randomizers;

use crate::cli::ArgParser;
use crate::io::{read_bed, read_genome, read_mask};
use crate::randomizers::Randomizer;

/// Calculate mean and standard deviation from a vector of numbers
fn mean_std(v: &[u64]) -> (f64, f64) {
/* Calculate mean and standard deviation */
let n = v.len() as f64;
let mean = v.iter().sum::<u64>() as f64 / n;
let variance = v.iter().map(|x| (*x as f64 - mean).powi(2)).sum::<f64>() / n;
let std_dev = variance.sqrt();
use crate::randomizers::{shift_intervals, Randomizer};

/// Creates and holds permutation test results
#[derive(Serialize)]
struct PermTest {
observed: u64,
num_perms: f64,
mean: f64,
std_dev: f64,
p_val: f64,
z_score: f64,
alt: char,
perms: Vec<u64>,
}

(mean, std_dev)
impl PermTest {
pub fn new(observed: u64, perms: Vec<u64>) -> Self {
let n = perms.len() as f64;
let mean = perms.iter().sum::<u64>() as f64 / n;
let variance = perms
.iter()
.map(|x| (*x as f64 - mean).powi(2))
.sum::<f64>()
/ n;
let std_dev = variance.sqrt();
let (alt, p_count): (char, f64) = if (observed as f64) < mean {
(
'l',
perms.iter().map(|i| (*i < observed) as u8 as f64).sum(),
)
} else {
(
'g',
perms.iter().map(|i| (*i > observed) as u8 as f64).sum(),
)
};
let p_val = (p_count + 1.0) / (n + 1.0);
let z_score = if (observed == 0) & (mean == 0.0) {
warn!("z_score cannot be computed");
0.0
} else {
((observed as f64) - mean) / std_dev
};
PermTest {
observed,
p_val,
num_perms: n,
z_score,
mean,
std_dev,
alt,
perms,
}
}
}

/// Alternate hypothesis enum
#[derive(Clone, Copy)]
#[repr(u8)]
enum Alternate {
/// Observed intersections is less than permutations
Less = b'l',
/// Observed intersections is greater than permutations
Greater = b'g',
/// Creates and holds local z-score results
#[derive(Serialize)]
struct LocalZscore {
shifts: Vec<f64>,
window: i64,
step: u64,
}

/// Count the number of permutations greater/less than the observed count given
/// the alternate hypothesis
fn count_permutations(observed_count: u64, perms: &[u64], alt: Alternate) -> f64 {
match alt {
Alternate::Less => perms
.iter()
.map(|i| (observed_count >= *i) as u8 as f64)
.sum(),
Alternate::Greater => perms
.iter()
.map(|i| (observed_count <= *i) as u8 as f64)
.sum(),
impl LocalZscore {
pub fn new(
a_intv: &Lapper<u64, u64>,
b_intv: &Lapper<u64, u64>,
args: &ArgParser,
test: &PermTest,
) -> Self {
let shifts: Vec<f64> = (-args.window..args.window)
.step_by(args.step as usize)
.map(|i| {
let observed = args.count.ovl(&shift_intervals(&a_intv, i), &b_intv);
if (observed == 0) & (test.mean == 0.0) {
0.0
} else {
((observed as f64) - test.mean) / test.std_dev
}
})
.collect();
LocalZscore {
shifts,
window: args.window,
step: args.step,
}
}
}

Expand All @@ -71,7 +125,7 @@ fn main() -> std::io::Result<()> {
std::process::exit(1);
}

let mask = args.mask.map(|p| read_mask(&p));
let mask = args.mask.as_ref().map(|p| read_mask(&p));
let mut genome = read_genome(&args.genome, &mask);
let mut a_intv = read_bed(&args.bed_a, &genome, &mask);
let mut b_intv = read_bed(&args.bed_b, &genome, &mask);
Expand All @@ -94,7 +148,6 @@ fn main() -> std::io::Result<()> {
if args.random == Randomizer::Novl {
genome.make_gap_budget(&a_intv, &args.per_chrom)
}

// Won't need to change again. Can pass pointers to threads
let genome = Arc::new(genome);
let a_intv = Arc::new(a_intv);
Expand All @@ -107,10 +160,9 @@ fn main() -> std::io::Result<()> {
let initial_overlap_count: u64 = args.count.ovl(&a_intv, &b_intv);
info!("observed : {}", initial_overlap_count);

let chunk_size: u32 = ((args.num_times as f32) / (args.threads as f32)).ceil() as u32;

#[cfg(feature = "progbars")]
let (progs, pb) = {
let chunk_size: u32 = ((args.num_times as f32) / (args.threads as f32)).ceil() as u32;
let progs = MultiProgress::new();
let sty = ProgressStyle::with_template(
"[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
Expand All @@ -135,10 +187,9 @@ fn main() -> std::io::Result<()> {
#[cfg(feature = "progbars")]
let m_p = pb[i as usize].clone();

let start_iter = (i as u32) * chunk_size;
let stop_iter = std::cmp::min(start_iter + chunk_size, args.num_times);
std::thread::spawn(move || {
(start_iter..stop_iter)
((i as u32)..args.num_times)
.step_by(args.threads as usize)
.map(|_| {
#[cfg(feature = "progbars")]
m_p.inc(1);
Expand All @@ -159,41 +210,24 @@ fn main() -> std::io::Result<()> {
/*if let Ok(report) = guard.report().build() { println!("report: {:?}", &report); };*/

// Calculate
let (mu, sd) = mean_std(&perm_counts);
let alt = if (initial_overlap_count as f64) < mu {
Alternate::Less
} else {
Alternate::Greater
};
let p_val = (count_permutations(initial_overlap_count, &perm_counts, alt) + 1.0)
/ ((args.num_times as f64) + 1.0);
let z_score = if (initial_overlap_count == 0) & (mu == 0.0) {
warn!("z_score cannot be computed");
0.0
} else {
((initial_overlap_count as f64) - mu) / sd
};
let test = PermTest::new(initial_overlap_count, perm_counts);
let local_zscores = LocalZscore::new(&a_intv, &b_intv, &args, &test);

// Output
info!("perm mu: {}", mu);
info!("perm sd: {}", sd);
info!("alt hypo : {}", alt as u8 as char);
info!("p-val : {}", p_val);
let data = json!({"pval": p_val,
"zscore": z_score,
"obs":initial_overlap_count,
"perm_mu": mu,
"perm_sd": sd,
"alt": alt as u8 as char,
"n": args.num_times,
info!("perm mu: {}", test.mean);
info!("perm sd: {}", test.std_dev);
info!("alt hypo : {}", test.alt);
info!("p-val : {}", test.p_val);
let data = json!({"test": test,
"swapped": swapped,
"no_merge": args.no_merge,
"random": args.random,
"count": args.count,
"A_cnt" : a_count,
"B_cnt" : b_count,
"per_chrom": args.per_chrom,
"perms": perm_counts});
"localZ": local_zscores,
});
let mut file = File::create(args.output)?;
file.write_all(serde_json::to_string(&data).unwrap().as_bytes())
}
Loading

0 comments on commit 0f59592

Please sign in to comment.