Skip to content

Commit

Permalink
Merge branch 'develop' into feature/rust
Browse files Browse the repository at this point in the history
  • Loading branch information
ACEnglish committed Feb 3, 2024
2 parents 7d890e1 + a4f27ce commit 7c0a018
Show file tree
Hide file tree
Showing 17 changed files with 264 additions and 114 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ RUN apt-get -qq update \
rm -rf /var/lib/apt/lists/*

RUN mkdir -p /opt/truvari-source/truvari/
COPY setup.py README.md /opt/truvari-source
COPY setup.py README.md pyproject.toml /opt/truvari-source/
COPY truvari/ /opt/truvari-source/truvari/
WORKDIR /opt/truvari-source

Expand Down
18 changes: 15 additions & 3 deletions docs/api/truvari.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ entry_size_similarity
^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: entry_size_similarity

entry_shared_ref_context
^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: entry_shared_ref_context

entry_to_hash
^^^^^^^^^^^^^
.. autofunction:: entry_to_hash
Expand All @@ -62,9 +66,9 @@ entry_variant_type
^^^^^^^^^^^^^^^^^^
.. autofunction:: entry_variant_type

entry_shared_ref_context
^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: entry_shared_ref_context
entry_within
^^^^^^^^^^^^
..autofunction:: entry_within

Extra Methods
-------------
Expand Down Expand Up @@ -166,6 +170,14 @@ cmd_exe
^^^^^^^
.. autofunction:: cmd_exe

consolidate_phab_vcfs
^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: consolidate_phab_vcfs

coords_within
^^^^^^^^^^^^^
.. autofunction:: coords_within

count_entries
^^^^^^^^^^^^^
.. autofunction:: count_entries
Expand Down
39 changes: 39 additions & 0 deletions repo_utils/run_unittest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
One off unittests
"""
import os
import sys
import pysam
from collections import defaultdict

# Use the current truvari, not any installed libraries
sys.path.insert(0, os.getcwd())

import truvari

# Assume we're running in truvari root directory

"""
Boundary issues
"""
vcf_fn = "repo_utils/test_files/variants/boundary.vcf.gz"
bed_fn = "repo_utils/test_files/beds/boundary.bed"
region_start = 10
region_end = 20

vcf = pysam.VariantFile(vcf_fn)
for entry in vcf:
state = entry.info['include'] == 'in'
assert state == truvari.entry_within(entry, region_start, region_end), f"Bad Boundary {str(entry)}"

v = pysam.VariantFile(vcf_fn)
regions = truvari.RegionVCFIterator(v, includebed=bed_fn)

truv_ans = defaultdict(lambda: False)
for entry in regions.iterate(v):
truv_ans[truvari.entry_to_key(entry)] = True

v = pysam.VariantFile(vcf_fn)
for entry in v:
state = entry.info['include'] == 'in'
assert state == truv_ans[truvari.entry_to_key(entry)], f"Bad Boundary {str(entry)}"
7 changes: 7 additions & 0 deletions repo_utils/sub_tests/unittest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# ------------------------------------------------------------
# unittest
# ------------------------------------------------------------
run test_unittest coverage run --concurrency=multiprocessing -p repo_utils/run_unittest.py
if [ $test_unittest ]; then
assert_exit_code 0
fi
1 change: 1 addition & 0 deletions repo_utils/test_files/beds/boundary.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
reference 10 20
Binary file added repo_utils/test_files/variants/boundary.vcf.gz
Binary file not shown.
Binary file added repo_utils/test_files/variants/boundary.vcf.gz.tbi
Binary file not shown.
1 change: 1 addition & 0 deletions repo_utils/truvari_ssshtests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ source $TESTSRC/sub_tests/phab.sh
source $TESTSRC/sub_tests/refine.sh
source $TESTSRC/sub_tests/segment.sh
source $TESTSRC/sub_tests/stratify.sh
source $TESTSRC/sub_tests/unittest.sh
source $TESTSRC/sub_tests/vcf2df.sh
source $TESTSRC/sub_tests/version.sh

Expand Down
11 changes: 11 additions & 0 deletions trust/src/comparisons.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ use noodles_vcf::{
};
use std::str::FromStr;

pub fn coords_within(qstart: usize, qend: usize, rstart: usize, rend: usize, end_within: bool) -> bool {
let ending = if end_within { qend <= rend } else { qend < rend};
(qstart >= rstart) & ending
}

pub fn entry_boundaries(entry: &vcf::Record, ins_inflate: bool) -> (usize, usize) {
let mut start = usize::from(entry.position()) - 1;
let mut end = usize::from(entry.end().expect("No Variant End"));
Expand Down Expand Up @@ -137,6 +142,12 @@ pub fn sizesim(size_a: usize, size_b: usize) -> (f32, isize) {
(pct, diff)
}

pub fn entry_within(entry: &vcf::Record, rstart: usize, rend: usize) -> bool {
let (qstart, qend) = entry_boundaries(&entry, false);
let end_within = entry_variant_type(&entry) != Svtype::Ins;
coords_within(qstart, qend, rstart, rend, end_within)
}

pub fn overlap_percent(astart: usize, aend: usize, bstart: usize, bend: usize) -> f32 {
if (astart >= bstart) & (aend <= bend) {
return 1.0;
Expand Down
4 changes: 4 additions & 0 deletions truvari/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
:meth:`entry_size_similarity`
:meth:`entry_to_hash`
:meth:`entry_to_key`
:meth:`entry_within`
:meth:`entry_variant_type`
Extra methods:
Expand Down Expand Up @@ -51,6 +52,7 @@
:meth:`chunker`
:meth:`cmd_exe`
:meth:`consolidate_phab_vcfs`
:meth:`coords_within`
:meth:`count_entries`
:meth:`file_zipper`
:meth:`help_unknown_cmd`
Expand Down Expand Up @@ -101,6 +103,7 @@
)

from truvari.comparisons import (
coords_within,
create_pos_haplotype,
entry_boundaries,
entry_distance,
Expand All @@ -116,6 +119,7 @@
entry_to_hash,
entry_to_key,
entry_variant_type,
entry_within,
overlap_percent,
overlaps,
reciprocal_overlap,
Expand Down
53 changes: 42 additions & 11 deletions truvari/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from functools import cmp_to_key, partial

import pysam
import numpy as np
from intervaltree import IntervalTree

import truvari
Expand All @@ -30,6 +31,7 @@ class CollapsedCalls():
match_id: str
matches: list = field(default_factory=list)
gt_consolidate_count: int = 0
genotype_mask: str = "" # bad

def combine(self, other):
"""
Expand Down Expand Up @@ -66,6 +68,32 @@ def annotate_entry(self, header, med_info):
if med_info:
self.entry.info["CollapseStart"], self.entry.info["CollapseEnd"], self.entry.info["CollapseSize"] = self.calc_median_sizepos()

@staticmethod
def make_genotype_mask(entry, gtmode):
"""
Populate the genotype mask
"""
if gtmode == 'off':
return None
to_mask = (lambda x: 1 in x) if gtmode == 'all' else (
lambda x: x.count(1) == 1)
return np.array([to_mask(_.allele_indices) for _ in entry.samples.values()], dtype=bool)

def gt_conflict(self, other, which_gt):
"""
Return true if entry's genotypes conflict with any of the current collapse
which_gt all prevents variants present in the same sample from being collapsed
which_gt het only prevents two het variants from being collapsed.
"""
if which_gt == 'off':
return False

o_mask = self.make_genotype_mask(other, which_gt)
if (self.genotype_mask & o_mask).any():
return True
self.genotype_mask |= o_mask
return False


def chain_collapse(cur_collapse, all_collapse, matcher):
"""
Expand Down Expand Up @@ -98,37 +126,36 @@ def collapse_chunk(chunk, matcher):
call_id += 1
m_collap = CollapsedCalls(remaining_calls.pop(0),
f'{chunk_id}.{call_id}')
unmatched = []
# Sort based on size difference of current call
remaining_calls.sort(key=partial(relative_size_sorter, m_collap.entry))
for candidate in remaining_calls:
# quicker genotype comparison - needs to be refactored
m_collap.genotype_mask = m_collap.make_genotype_mask(
m_collap.entry, matcher.gt)

# Sort based on size difference to current call
for candidate in sorted(remaining_calls, key=partial(relative_size_sorter, m_collap.entry)):
mat = matcher.build_match(m_collap.entry,
candidate,
m_collap.match_id,
skip_gt=True,
short_circuit=True)
if matcher.hap and not hap_resolve(m_collap.entry, candidate):
mat.state = False
if mat.state and gt_conflict(m_collap, candidate, matcher.gt):
if mat.state and m_collap.gt_conflict(candidate, matcher.gt):
mat.state = False
if mat.state:
m_collap.matches.append(mat)
else:
unmatched.append(candidate)

# Does this collap need to go into a previous collap?
if not matcher.chain or not chain_collapse(m_collap, ret, matcher):
ret.append(m_collap)

remaining_calls = unmatched
# If hap, only allow the best match
if matcher.hap and m_collap.matches:
mats = sorted(m_collap.matches, reverse=True)
m_collap.matches = [mats.pop(0)]
remaining_calls.extend(mat.comp for mat in mats)

# Sort based on the desired sorting to choose the next one
remaining_calls.sort(key=matcher.sorter)
# Remove everything that was used
to_rm = [_.comp for _ in m_collap.matches]
remaining_calls = [_ for _ in remaining_calls if _ not in to_rm]

if matcher.no_consolidate:
for val in ret:
Expand All @@ -146,12 +173,14 @@ def collapse_chunk(chunk, matcher):
ret.sort(key=cmp_to_key(lambda x, y: x.entry.pos - y.entry.pos))
return ret


def relative_size_sorter(base, comp):
"""
Sort calls based on the absolute size difference of base and comp
"""
return abs(truvari.entry_size(base) - truvari.entry_size(comp))


def collapse_into_entry(entry, others, hap_mode=False):
"""
Consolidate information for genotypes where sample is unset
Expand Down Expand Up @@ -214,6 +243,7 @@ def gt_conflict(cur_collapse, entry, which_gt):
Return true if entry's genotypes conflict with any of the current collapse
which_gt all prevents variants present in the same sample from being collapsed
which_gt het only prevents two het variants from being collapsed.
Might be deprecated, now?
"""
if which_gt == 'off':
return False
Expand Down Expand Up @@ -245,6 +275,7 @@ def checker(base, comp):

return False


def get_ac(gt):
"""
Helper method to get allele count. assumes only 1s as ALT
Expand Down
32 changes: 32 additions & 0 deletions truvari/comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,31 @@
import truvari


def coords_within(qstart, qend, rstart, rend, end_within):
"""
Returns if a span is within the provided [start, end). All coordinates assumed 0-based
:param `qstart`: query start position
:type `qstart`: integer
:param `qend`: query end position
:type `qend`: integer
:param `start`: start of span
:type `start`: integer
:param `end`: end of span
:type `end`: integer
:param `end_within`: if true, qend <= rend, else qend < rend
:type `end_within`: bool
:return: If the coordinates are within the span
:rtype: bool
"""
if end_within:
ending = qend <= rend
else:
ending = qend < rend
return qstart >= rstart and ending


def create_pos_haplotype(a1, a2, ref, min_len=0):
"""
Create haplotypes of two allele's regions that are assumed to be overlapping
Expand Down Expand Up @@ -459,6 +484,13 @@ def entry_variant_type(entry):
return truvari.get_svtype(mat.groupdict()["SVTYPE"])
return truvari.get_svtype("UNK")

def entry_within(entry, rstart, rend):
"""
Extract entry boundaries and type to call `coords_within`
"""
qstart, qend = truvari.entry_boundaries(entry)
end_within = truvari.entry_variant_type(entry) != truvari.SV.INS
return coords_within(qstart, qend, rstart, rend, end_within)

def overlap_percent(astart, aend, bstart, bend):
"""
Expand Down
1 change: 1 addition & 0 deletions truvari/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def __init__(self, args=None):

self.reference = None
if self.params.reference is not None:
#logging.warning("`--reference` is being deprecated and is only
self.reference = pysam.FastaFile(self.params.reference)

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion truvari/phab.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def make_consensus(data, ref_fn):
correction = [-start, -start]
for entry in vcf.fetch(chrom, start, end):
# Variant must be within boundaries
if entry.start < start or entry.stop > end:
if not truvari.entry_within(entry, start, end):
continue
if entry.samples[sample]['GT'][0] == 1:
correction[0] = incorporate(haps[0], entry, correction[0])
Expand Down
Loading

0 comments on commit 7c0a018

Please sign in to comment.