Skip to content

Commit

Permalink
filter based on reference length and correct bubble popping
Browse files Browse the repository at this point in the history
  • Loading branch information
ekg committed Oct 7, 2021
1 parent 5cd1914 commit 26a1f0c
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 18 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@ This exposed snarl tree information can be used to filter the VCF to obtain a se
`vcfbub` lets us do two common operations on these VCFs:

1. We can filter sites by maximum level in the snarl tree. For instance, `--max-level 0` would keep only sites with `LV=0`. In practice, vg's snarl finder ensures that these are sites rooted on the main linear axis of the pangenome graph. Those at higher levels occur within larger variants.
2. We can filter sites by maximum allele size. In this case, `--max-length 10000` would keep only sites where the largest allele is less than 10kb. Setting `--max-length` additionally ensures that the output contains the bubbles nested inside of any popped bubble, even if they are at greater than `--max-level`.
2. We can filter sites by maximum allele size, either for the reference allele or any allele. In this case, `--max-ref-length 10000` would keep only sites where the reference allele is less than 10kb long. Setting `--max-ref-length` or `--max-allele-length` additionally ensures that the output contains the bubbles nested inside of any popped bubble, even if they are at greater than `--max-level`.

`vcfbub` accomplishes a simple task: we keep sites that are the children of those which we "pop" due to their size.
These occur around complex large SVs, such as multi-Mbp inversions and segmental duplications.
We often need to remove these, as they provide little information for many downstream applications, such as haplotype panels or other imputation references.

## usage

This removes all non-top-level variant sites (`-l 0`) unless they are inside of variants > 10kb (`-b 10000`):
This removes all non-top-level variant sites (`-l 0`) unless they are inside of variants with reference length > 10kb (`-r 10000`):

```
vcfbub -l 0 -b 10000 var.vcf >filt.vcf
vcfbub -l 0 -r 10000 var.vcf >filt.vcf
```
65 changes: 50 additions & 15 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ fn get_max_allele_length(vcf_record: &VCFRecord) -> usize {
alt_lengths.iter().max().unwrap().clone()
}

fn get_ref_allele_length(vcf_record: &VCFRecord) -> usize {
vcf_record.reference.len()
}

fn main() {
let matches = App::new("vcfbub")
.version("0.1.0")
Expand Down Expand Up @@ -107,13 +111,28 @@ fn main() {
.takes_value(true),
)
.arg(
Arg::with_name("max-length")
.short("b")
.long("max-length")
Arg::with_name("max-allele-length")
.short("a")
.long("max-allele-length")
.value_name("LENGTH")
.help("Filter sites whose longest allele is greater than LENGTH.")
.help("Filter sites whose max allele length is greater than LENGTH.")
.takes_value(true),
)
.arg(
Arg::with_name("max-ref-length")
.short("r")
.long("max-ref-length")
.value_name("LENGTH")
.help("Filter sites whose reference allele is longer than LENGTH.")
.takes_value(true),
)
.arg(
Arg::with_name("debug")
.short("d")
.long("debug")
.help("Print debugging information about which sites are removed.")
.takes_value(false),
)
.get_matches();

let infile = matches.value_of("input").unwrap();
Expand All @@ -123,11 +142,18 @@ fn main() {
None => i32::MAX,
};

let max_length = match matches.value_of("max-length") {
let max_allele_length = match matches.value_of("max-allele-length") {
Some(l) => l.parse::<usize>().unwrap(),
None => usize::MAX,
};

let max_ref_length = match matches.value_of("max-ref-length") {
Some(l) => l.parse::<usize>().unwrap(),
None => usize::MAX,
};

let print_debug = matches.is_present("debug");

let vcf_header = get_header(infile);

// make a pass to check the levels
Expand All @@ -137,18 +163,24 @@ fn main() {
let mut popped_bubbles: HashSet<String> = HashSet::new();
for_each_line_in_vcf(infile, |vcf_record, _| {
let level = get_level(vcf_record);
let max_allele_length = get_max_allele_length(vcf_record);
let max_length = get_max_allele_length(vcf_record);
let ref_length = get_ref_allele_length(vcf_record);
let mut keep = true;
if level > max_level {
keep_record.push(false);
} else {
keep_record.push(true);
keep = false;
}
if max_allele_length > max_length {
if max_length > max_allele_length || ref_length > max_ref_length {
if print_debug {
eprintln!(
"popped {} {}",
get_snarl_id(vcf_record),
vcf_record.position
);
}
popped_bubbles.insert(get_snarl_id(vcf_record));
keep = false;
}
//let snarl_id = get_snarl_id(vcf_record);
//let parent_snarl = get_parent_snarl(vcf_record);
//parent_bubble.insert(snarl_id, parent_snarl);
keep_record.push(keep);
});

// setup writer
Expand All @@ -158,8 +190,11 @@ fn main() {
for_each_line_in_vcf(infile, |vcf_record, idx| {
let snarl_id = get_snarl_id(vcf_record);
let parent_snarl = get_parent_snarl(vcf_record);
if keep_record[idx] && !popped_bubbles.contains(&snarl_id)
|| !keep_record[idx] && popped_bubbles.contains(&parent_snarl)
if keep_record[idx] // we keep records when they're marked to keep
// or if they're marked don't-keep but their parent was popped and they weren't popped
|| (!keep_record[idx]
&& popped_bubbles.contains(&parent_snarl)
&& !popped_bubbles.contains(&snarl_id))
{
writer.write_record(&vcf_record).unwrap()
}
Expand Down

0 comments on commit 26a1f0c

Please sign in to comment.