diff --git a/Dockerfile b/Dockerfile index e83bb6f2..233cc924 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ RUN apt-get -qq update \ rm -rf /var/lib/apt/lists/* RUN mkdir -p /opt/truvari-source/truvari/ -COPY setup.py README.md /opt/truvari-source +COPY setup.py README.md pyproject.toml /opt/truvari-source/ COPY truvari/ /opt/truvari-source/truvari/ WORKDIR /opt/truvari-source diff --git a/docs/api/truvari.rst b/docs/api/truvari.rst index ded7f385..d0716c72 100644 --- a/docs/api/truvari.rst +++ b/docs/api/truvari.rst @@ -50,6 +50,10 @@ entry_size_similarity ^^^^^^^^^^^^^^^^^^^^^ .. autofunction:: entry_size_similarity +entry_shared_ref_context +^^^^^^^^^^^^^^^^^^^^^^^^ +.. autofunction:: entry_shared_ref_context + entry_to_hash ^^^^^^^^^^^^^ .. autofunction:: entry_to_hash @@ -62,9 +66,9 @@ entry_variant_type ^^^^^^^^^^^^^^^^^^ .. autofunction:: entry_variant_type -entry_shared_ref_context -^^^^^^^^^^^^^^^^^^^^^^^^ -.. autofunction:: entry_shared_ref_context +entry_within +^^^^^^^^^^^^ +..autofunction:: entry_within Extra Methods ------------- @@ -166,6 +170,14 @@ cmd_exe ^^^^^^^ .. autofunction:: cmd_exe +consolidate_phab_vcfs +^^^^^^^^^^^^^^^^^^^^^ +.. autofunction:: consolidate_phab_vcfs + +coords_within +^^^^^^^^^^^^^ +.. autofunction:: coords_within + count_entries ^^^^^^^^^^^^^ .. autofunction:: count_entries diff --git a/repo_utils/run_unittest.py b/repo_utils/run_unittest.py new file mode 100644 index 00000000..9c3e66d0 --- /dev/null +++ b/repo_utils/run_unittest.py @@ -0,0 +1,39 @@ +""" +One off unittests +""" +import os +import sys +import pysam +from collections import defaultdict + +# Use the current truvari, not any installed libraries +sys.path.insert(0, os.getcwd()) + +import truvari + +# Assume we're running in truvari root directory + +""" +Boundary issues +""" +vcf_fn = "repo_utils/test_files/variants/boundary.vcf.gz" +bed_fn = "repo_utils/test_files/beds/boundary.bed" +region_start = 10 +region_end = 20 + +vcf = pysam.VariantFile(vcf_fn) +for entry in vcf: + state = entry.info['include'] == 'in' + assert state == truvari.entry_within(entry, region_start, region_end), f"Bad Boundary {str(entry)}" + +v = pysam.VariantFile(vcf_fn) +regions = truvari.RegionVCFIterator(v, includebed=bed_fn) + +truv_ans = defaultdict(lambda: False) +for entry in regions.iterate(v): + truv_ans[truvari.entry_to_key(entry)] = True + +v = pysam.VariantFile(vcf_fn) +for entry in v: + state = entry.info['include'] == 'in' + assert state == truv_ans[truvari.entry_to_key(entry)], f"Bad Boundary {str(entry)}" diff --git a/repo_utils/sub_tests/unittest.sh b/repo_utils/sub_tests/unittest.sh new file mode 100644 index 00000000..85d11064 --- /dev/null +++ b/repo_utils/sub_tests/unittest.sh @@ -0,0 +1,7 @@ +# ------------------------------------------------------------ +# unittest +# ------------------------------------------------------------ +run test_unittest coverage run --concurrency=multiprocessing -p repo_utils/run_unittest.py +if [ $test_unittest ]; then + assert_exit_code 0 +fi diff --git a/repo_utils/test_files/beds/boundary.bed b/repo_utils/test_files/beds/boundary.bed new file mode 100644 index 00000000..ecf93366 --- /dev/null +++ b/repo_utils/test_files/beds/boundary.bed @@ -0,0 +1 @@ +reference 10 20 diff --git a/repo_utils/test_files/variants/boundary.vcf.gz b/repo_utils/test_files/variants/boundary.vcf.gz new file mode 100644 index 00000000..18eca1be Binary files /dev/null and b/repo_utils/test_files/variants/boundary.vcf.gz differ diff --git a/repo_utils/test_files/variants/boundary.vcf.gz.tbi b/repo_utils/test_files/variants/boundary.vcf.gz.tbi new file mode 100644 index 00000000..e4a1af46 Binary files /dev/null and b/repo_utils/test_files/variants/boundary.vcf.gz.tbi differ diff --git a/repo_utils/truvari_ssshtests.sh b/repo_utils/truvari_ssshtests.sh index 6f7c17d0..3e5cffa5 100644 --- a/repo_utils/truvari_ssshtests.sh +++ b/repo_utils/truvari_ssshtests.sh @@ -23,6 +23,7 @@ source $TESTSRC/sub_tests/phab.sh source $TESTSRC/sub_tests/refine.sh source $TESTSRC/sub_tests/segment.sh source $TESTSRC/sub_tests/stratify.sh +source $TESTSRC/sub_tests/unittest.sh source $TESTSRC/sub_tests/vcf2df.sh source $TESTSRC/sub_tests/version.sh diff --git a/trust/src/comparisons.rs b/trust/src/comparisons.rs index 725c673b..2b2228b4 100644 --- a/trust/src/comparisons.rs +++ b/trust/src/comparisons.rs @@ -5,6 +5,11 @@ use noodles_vcf::{ }; use std::str::FromStr; +pub fn coords_within(qstart: usize, qend: usize, rstart: usize, rend: usize, end_within: bool) -> bool { + let ending = if end_within { qend <= rend } else { qend < rend}; + (qstart >= rstart) & ending +} + pub fn entry_boundaries(entry: &vcf::Record, ins_inflate: bool) -> (usize, usize) { let mut start = usize::from(entry.position()) - 1; let mut end = usize::from(entry.end().expect("No Variant End")); @@ -137,6 +142,12 @@ pub fn sizesim(size_a: usize, size_b: usize) -> (f32, isize) { (pct, diff) } +pub fn entry_within(entry: &vcf::Record, rstart: usize, rend: usize) -> bool { + let (qstart, qend) = entry_boundaries(&entry, false); + let end_within = entry_variant_type(&entry) != Svtype::Ins; + coords_within(qstart, qend, rstart, rend, end_within) +} + pub fn overlap_percent(astart: usize, aend: usize, bstart: usize, bend: usize) -> f32 { if (astart >= bstart) & (aend <= bend) { return 1.0; diff --git a/truvari/__init__.py b/truvari/__init__.py index 76697620..4993746b 100644 --- a/truvari/__init__.py +++ b/truvari/__init__.py @@ -19,6 +19,7 @@ :meth:`entry_size_similarity` :meth:`entry_to_hash` :meth:`entry_to_key` +:meth:`entry_within` :meth:`entry_variant_type` Extra methods: @@ -51,6 +52,7 @@ :meth:`chunker` :meth:`cmd_exe` :meth:`consolidate_phab_vcfs` +:meth:`coords_within` :meth:`count_entries` :meth:`file_zipper` :meth:`help_unknown_cmd` @@ -101,6 +103,7 @@ ) from truvari.comparisons import ( + coords_within, create_pos_haplotype, entry_boundaries, entry_distance, @@ -116,6 +119,7 @@ entry_to_hash, entry_to_key, entry_variant_type, + entry_within, overlap_percent, overlaps, reciprocal_overlap, diff --git a/truvari/collapse.py b/truvari/collapse.py index 816a7c0f..de0989ab 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -15,6 +15,7 @@ from functools import cmp_to_key, partial import pysam +import numpy as np from intervaltree import IntervalTree import truvari @@ -30,6 +31,7 @@ class CollapsedCalls(): match_id: str matches: list = field(default_factory=list) gt_consolidate_count: int = 0 + genotype_mask: str = "" # bad def combine(self, other): """ @@ -66,6 +68,32 @@ def annotate_entry(self, header, med_info): if med_info: self.entry.info["CollapseStart"], self.entry.info["CollapseEnd"], self.entry.info["CollapseSize"] = self.calc_median_sizepos() + @staticmethod + def make_genotype_mask(entry, gtmode): + """ + Populate the genotype mask + """ + if gtmode == 'off': + return None + to_mask = (lambda x: 1 in x) if gtmode == 'all' else ( + lambda x: x.count(1) == 1) + return np.array([to_mask(_.allele_indices) for _ in entry.samples.values()], dtype=bool) + + def gt_conflict(self, other, which_gt): + """ + Return true if entry's genotypes conflict with any of the current collapse + which_gt all prevents variants present in the same sample from being collapsed + which_gt het only prevents two het variants from being collapsed. + """ + if which_gt == 'off': + return False + + o_mask = self.make_genotype_mask(other, which_gt) + if (self.genotype_mask & o_mask).any(): + return True + self.genotype_mask |= o_mask + return False + def chain_collapse(cur_collapse, all_collapse, matcher): """ @@ -98,10 +126,12 @@ def collapse_chunk(chunk, matcher): call_id += 1 m_collap = CollapsedCalls(remaining_calls.pop(0), f'{chunk_id}.{call_id}') - unmatched = [] - # Sort based on size difference of current call - remaining_calls.sort(key=partial(relative_size_sorter, m_collap.entry)) - for candidate in remaining_calls: + # quicker genotype comparison - needs to be refactored + m_collap.genotype_mask = m_collap.make_genotype_mask( + m_collap.entry, matcher.gt) + + # Sort based on size difference to current call + for candidate in sorted(remaining_calls, key=partial(relative_size_sorter, m_collap.entry)): mat = matcher.build_match(m_collap.entry, candidate, m_collap.match_id, @@ -109,26 +139,23 @@ def collapse_chunk(chunk, matcher): short_circuit=True) if matcher.hap and not hap_resolve(m_collap.entry, candidate): mat.state = False - if mat.state and gt_conflict(m_collap, candidate, matcher.gt): + if mat.state and m_collap.gt_conflict(candidate, matcher.gt): mat.state = False if mat.state: m_collap.matches.append(mat) - else: - unmatched.append(candidate) # Does this collap need to go into a previous collap? if not matcher.chain or not chain_collapse(m_collap, ret, matcher): ret.append(m_collap) - remaining_calls = unmatched # If hap, only allow the best match if matcher.hap and m_collap.matches: mats = sorted(m_collap.matches, reverse=True) m_collap.matches = [mats.pop(0)] - remaining_calls.extend(mat.comp for mat in mats) - # Sort based on the desired sorting to choose the next one - remaining_calls.sort(key=matcher.sorter) + # Remove everything that was used + to_rm = [_.comp for _ in m_collap.matches] + remaining_calls = [_ for _ in remaining_calls if _ not in to_rm] if matcher.no_consolidate: for val in ret: @@ -146,12 +173,14 @@ def collapse_chunk(chunk, matcher): ret.sort(key=cmp_to_key(lambda x, y: x.entry.pos - y.entry.pos)) return ret + def relative_size_sorter(base, comp): """ Sort calls based on the absolute size difference of base and comp """ return abs(truvari.entry_size(base) - truvari.entry_size(comp)) + def collapse_into_entry(entry, others, hap_mode=False): """ Consolidate information for genotypes where sample is unset @@ -214,6 +243,7 @@ def gt_conflict(cur_collapse, entry, which_gt): Return true if entry's genotypes conflict with any of the current collapse which_gt all prevents variants present in the same sample from being collapsed which_gt het only prevents two het variants from being collapsed. + Might be deprecated, now? """ if which_gt == 'off': return False @@ -245,6 +275,7 @@ def checker(base, comp): return False + def get_ac(gt): """ Helper method to get allele count. assumes only 1s as ALT diff --git a/truvari/comparisons.py b/truvari/comparisons.py index 332bbfd9..8dea1a47 100644 --- a/truvari/comparisons.py +++ b/truvari/comparisons.py @@ -10,6 +10,31 @@ import truvari +def coords_within(qstart, qend, rstart, rend, end_within): + """ + Returns if a span is within the provided [start, end). All coordinates assumed 0-based + + :param `qstart`: query start position + :type `qstart`: integer + :param `qend`: query end position + :type `qend`: integer + :param `start`: start of span + :type `start`: integer + :param `end`: end of span + :type `end`: integer + :param `end_within`: if true, qend <= rend, else qend < rend + :type `end_within`: bool + + :return: If the coordinates are within the span + :rtype: bool + """ + if end_within: + ending = qend <= rend + else: + ending = qend < rend + return qstart >= rstart and ending + + def create_pos_haplotype(a1, a2, ref, min_len=0): """ Create haplotypes of two allele's regions that are assumed to be overlapping @@ -459,6 +484,13 @@ def entry_variant_type(entry): return truvari.get_svtype(mat.groupdict()["SVTYPE"]) return truvari.get_svtype("UNK") +def entry_within(entry, rstart, rend): + """ + Extract entry boundaries and type to call `coords_within` + """ + qstart, qend = truvari.entry_boundaries(entry) + end_within = truvari.entry_variant_type(entry) != truvari.SV.INS + return coords_within(qstart, qend, rstart, rend, end_within) def overlap_percent(astart, aend, bstart, bend): """ diff --git a/truvari/matching.py b/truvari/matching.py index 28b323bf..a8789403 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -90,6 +90,7 @@ def __init__(self, args=None): self.reference = None if self.params.reference is not None: + #logging.warning("`--reference` is being deprecated and is only self.reference = pysam.FastaFile(self.params.reference) @staticmethod diff --git a/truvari/phab.py b/truvari/phab.py index c7bb42ae..52d733fc 100644 --- a/truvari/phab.py +++ b/truvari/phab.py @@ -119,7 +119,7 @@ def make_consensus(data, ref_fn): correction = [-start, -start] for entry in vcf.fetch(chrom, start, end): # Variant must be within boundaries - if entry.start < start or entry.stop > end: + if not truvari.entry_within(entry, start, end): continue if entry.samples[sample]['GT'][0] == 1: correction[0] = incorporate(haps[0], entry, correction[0]) diff --git a/truvari/refine.py b/truvari/refine.py index 2076de7a..19316943 100644 --- a/truvari/refine.py +++ b/truvari/refine.py @@ -143,7 +143,8 @@ def refined_stratify(outdir, to_eval_coords, regions, threads=1): """ update regions in-place with the output variant counts """ - counts = truvari.benchdir_count_entries(outdir, to_eval_coords, True, threads) + counts = truvari.benchdir_count_entries( + outdir, to_eval_coords, True, threads) counts.index = regions[regions['refined']].index counts.columns = ["out_tpbase", "out_tp", "out_fn", "out_fp"] regions = regions.join(counts) @@ -168,38 +169,51 @@ def make_variant_report(regions): return summary +def region_filter(vcf, tree, inside=True): + """ + Given a VariantRecord iter and defaultdict(IntervalTree), yield variants which are inside/outside the tree regions + """ + for entry in vcf: + start, end = truvari.entry_boundaries(entry) + m_ovl = list(tree[entry.chrom].overlap(start, end)) + if len(m_ovl) == 1: + end_within = truvari.entry_variant_type(entry) != truvari.SV.INS + is_within = truvari.coords_within(start, end, m_ovl[0].begin, m_ovl[0].end - 1, end_within) + if is_within == inside: + yield entry + elif not inside: + yield entry + def recount_variant_report(orig_dir, phab_dir, regions): """ Count original variants not in refined regions and consolidate with the refined counts. """ - def falls_in_count(fn, no_count): - """ count number of variants that don't start in no_count """ - vcf = pysam.VariantFile(fn) - count = 0 - for entry in vcf: - if entry.chrom in no_count.index: - chrom = no_count.loc[entry.chrom] - if ((chrom['start'] <= entry.start) & (entry.start <= chrom['end'])).any(): - continue - count += 1 - return count + tree = defaultdict(IntervalTree) + n_regions = regions[regions["refined"]].copy() + n_regions['start'] -= PHAB_BUFFER + n_regions['end'] += PHAB_BUFFER + for _, row in n_regions.iterrows(): + tree[row['chrom']].addi(row['start'], row['end']) summary = truvari.StatsBox() with open(os.path.join(phab_dir, "summary.json")) as fh: summary.update(json.load(fh)) - # if the variant starts in a refined region, skip it - no_count = regions[regions["refined"]].copy().set_index('chrom') - no_count['start'] -= PHAB_BUFFER - no_count['end'] += PHAB_BUFFER - tpb = falls_in_count(os.path.join(orig_dir, 'tp-base.vcf.gz'), no_count) + vcf = pysam.VariantFile(os.path.join(orig_dir, 'tp-base.vcf.gz')) + tpb = len(list(region_filter(vcf, tree, False))) summary["TP-base"] += tpb - tpc = falls_in_count(os.path.join(orig_dir, 'tp-comp.vcf.gz'), no_count) + + vcf = pysam.VariantFile(os.path.join(orig_dir, 'tp-comp.vcf.gz')) + tpc = len(list(region_filter(vcf, tree, False))) summary["TP-comp"] += tpc - fp = falls_in_count(os.path.join(orig_dir, 'fp.vcf.gz'), no_count) + + vcf = pysam.VariantFile(os.path.join(orig_dir, 'fp.vcf.gz')) + fp = len(list(region_filter(vcf, tree, False))) summary["FP"] += fp - fn = falls_in_count(os.path.join(orig_dir, 'fn.vcf.gz'), no_count) + + vcf = pysam.VariantFile(os.path.join(orig_dir, 'fn.vcf.gz')) + fn = len(list(region_filter(vcf, tree, False))) summary["FN"] += fn summary["comp cnt"] += tpc + fp @@ -213,75 +227,75 @@ def make_region_report(data): Given a refine counts DataFrame, calculate the performance of PPV, TNR, etc. Also adds 'state' column to regions inplace """ - false_pos = data['out_fp'] != 0 - false_neg = data['out_fn'] != 0 - any_false = false_pos | false_neg + false_pos=data['out_fp'] != 0 + false_neg=data['out_fn'] != 0 + any_false=false_pos | false_neg - true_positives = (data['out_tp'] != 0) & ( + true_positives=(data['out_tp'] != 0) & ( data['out_tpbase'] != 0) & ~any_false - true_negatives = ( + true_negatives=( data[['out_tpbase', 'out_tp', 'out_fn', 'out_fp']] == 0).all(axis=1) - state_map = defaultdict(lambda: 'UNK') + state_map=defaultdict(lambda: 'UNK') state_map.update({(True, False, False, False): 'TP', (False, True, False, False): 'TN', (False, False, True, False): 'FP', (False, False, False, True): 'FN', (False, False, True, True): 'FN,FP'}) - data['state'] = [state_map[_] + data['state']=[state_map[_] for _ in zip(true_positives, true_negatives, false_pos, false_neg)] - tot = (true_positives + false_pos + false_neg + true_negatives).astype(int) - und = data.iloc[tot[tot != 1].index] - - result = {} - - base = (data['out_tpbase'] != 0) | (data['out_fn'] != 0) - baseP = int(base.sum()) - baseN = int((~base).sum()) - comp = (data['out_tp'] != 0) | (data['out_fp'] != 0) - compP = int(comp.sum()) - compN = int((~comp).sum()) - - result["TP"] = int(true_positives.sum()) - result["TN"] = int(true_negatives.sum()) - result["FP"] = int(false_pos.sum()) - result["FN"] = int(false_neg.sum()) - result["base P"] = baseP - result["base N"] = baseN - result["comp P"] = compP - result["comp N"] = compN + tot=(true_positives + false_pos + false_neg + true_negatives).astype(int) + und=data.iloc[tot[tot != 1].index] + + result={} + + base=(data['out_tpbase'] != 0) | (data['out_fn'] != 0) + baseP=int(base.sum()) + baseN=int((~base).sum()) + comp=(data['out_tp'] != 0) | (data['out_fp'] != 0) + compP=int(comp.sum()) + compN=int((~comp).sum()) + + result["TP"]=int(true_positives.sum()) + result["TN"]=int(true_negatives.sum()) + result["FP"]=int(false_pos.sum()) + result["FN"]=int(false_neg.sum()) + result["base P"]=baseP + result["base N"]=baseN + result["comp P"]=compP + result["comp N"]=compN # precision - result["PPV"] = result["TP"] / \ + result["PPV"]=result["TP"] / \ result["comp P"] if result["comp P"] != 0 else None # recall - result["TPR"] = result["TP"] / \ + result["TPR"]=result["TP"] / \ result["base P"] if result["base P"] != 0 else None # specificity - result["TNR"] = result["TN"] / \ + result["TNR"]=result["TN"] / \ result["base N"] if result["base N"] != 0 else None # negative predictive value - result["NPV"] = result["TN"] / \ + result["NPV"]=result["TN"] / \ result["comp N"] if result["comp N"] != 0 else None # accuracy if result["base P"] + result["base N"] != 0: - result["ACC"] = (result["TP"] + result["TN"]) / \ + result["ACC"]=(result["TP"] + result["TN"]) / \ (result["base P"] + result["base N"]) else: - result["ACC"] = None + result["ACC"]=None if result["TPR"] is not None and result["TNR"] is not None: - result["BA"] = (result["TPR"] + result["TNR"]) / 2 + result["BA"]=(result["TPR"] + result["TNR"]) / 2 else: - result["BA"] = None + result["BA"]=None if result["PPV"] and result["TPR"]: - result["F1"] = 2 * ((result["PPV"] * result["TPR"]) / + result["F1"]=2 * ((result["PPV"] * result["TPR"]) / (result["PPV"] + result["TPR"])) else: - result["F1"] = None - result["UND"] = len(und) + result["F1"]=None + result["UND"]=len(und) return result @@ -290,7 +304,7 @@ def parse_args(args): """ Pull the command line parameters """ - parser = argparse.ArgumentParser(prog="refine", description=__doc__, + parser=argparse.ArgumentParser(prog="refine", description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("benchdir", metavar="DIR", help="Truvari bench directory") @@ -313,7 +327,7 @@ def parse_args(args): help="Parameters for mafft, wrap in a single quote (%(default)s)") parser.add_argument("--debug", action="store_true", help="Verbose logging") - args = parser.parse_args(args) + args=parser.parse_args(args) return args @@ -323,45 +337,45 @@ def check_params(args): Returns as a Namespace """ - check_fail = False - param_path = os.path.join(args.benchdir, "params.json") + check_fail=False + param_path=os.path.join(args.benchdir, "params.json") if not os.path.exists(param_path): - check_fail = True + check_fail=True logging.error( "Bench directory %s doesn't have params.json", param_path) - params = None + params=None with open(param_path, 'r') as fh: - params = json.load(fh) + params=json.load(fh) - p_file = os.path.join(args.benchdir, "phab.output.vcf.gz") + p_file=os.path.join(args.benchdir, "phab.output.vcf.gz") if os.path.exists(p_file): - check_fail = True + check_fail=True logging.error("Phab output file %s already exists", p_file) if params["reference"] is None and args.reference is None: - check_fail = True + check_fail=True logging.error( "Reference not in params.json or given as a parameter to refine") elif args.reference is None: - args.reference = params["reference"] + args.reference=params["reference"] if not os.path.exists(args.reference): - check_fail = True + check_fail=True logging.error("Reference %s does not exist", args.reference) # Setup prefix - params["cSample"] = "p:" + params["cSample"] + params["cSample"]="p:" + params["cSample"] - bhdir = os.path.join(args.benchdir, 'phab_bench') + bhdir=os.path.join(args.benchdir, 'phab_bench') if os.path.exists(bhdir): - check_fail = True + check_fail=True logging.error("Directory %s exists. Cannot run refine", bhdir) # Should check that bench dir has compressed/indexed vcfs for fetching for i in ["tp-base.vcf.gz", "tp-comp.vcf.gz", "fn.vcf.gz", "fp.vcf.gz"]: if not os.path.exists(os.path.join(args.benchdir, i)): - check_fail = True + check_fail=True logging.error("Benchdir doesn't have compressed/indexed %s", i) if check_fail: @@ -375,12 +389,12 @@ def refine_main(cmdargs): """ Main """ - args = parse_args(cmdargs) + args=parse_args(cmdargs) if phab_check_requirements(args.align): logging.error("Couldn't run Truvari. Please fix parameters\n") sys.exit(100) - params = check_params(args) + params=check_params(args) truvari.setup_logging(args.debug, truvari.LogFileStderr(os.path.join( args.benchdir, "refine.log.txt")), @@ -388,26 +402,26 @@ def refine_main(cmdargs): logging.info("Params:\n%s", json.dumps(vars(args), indent=4)) # Stratify. - regions = initial_stratify(args.benchdir, + regions=initial_stratify(args.benchdir, resolve_regions(params, args), args.threads) # Figure out which to reevaluate if args.use_original_vcfs: base_vcf, comp_vcf = params.base, params.comp - regions["refined"] = original_stratify(base_vcf, comp_vcf, regions) + regions["refined"]=original_stratify(base_vcf, comp_vcf, regions) else: - base_vcf, comp_vcf = consolidate_bench_vcfs(args.benchdir) - regions["refined"] = (regions["in_fn"] > 0) & (regions["in_fp"] > 0) + base_vcf, comp_vcf=consolidate_bench_vcfs(args.benchdir) + regions["refined"]=(regions["in_fn"] > 0) & (regions["in_fp"] > 0) logging.info("%d regions to be refined", regions["refined"].sum()) - reeval_bed = truvari.make_temp_filename(suffix=".bed") + reeval_bed=truvari.make_temp_filename(suffix=".bed") regions[regions["refined"]].to_csv(reeval_bed, sep='\t', header=False, index=False) # Send the vcfs to phab - phab_vcf = os.path.join(args.benchdir, "phab.output.vcf.gz") - to_eval_coords = (regions[regions["refined"]][["chrom", "start", "end"]] + phab_vcf=os.path.join(args.benchdir, "phab.output.vcf.gz") + to_eval_coords=(regions[regions["refined"]][["chrom", "start", "end"]] .to_numpy() .tolist()) truvari.phab(to_eval_coords, base_vcf, args.reference, phab_vcf, buffer=PHAB_BUFFER, @@ -416,21 +430,21 @@ def refine_main(cmdargs): # Now run bench on the phab harmonized variants logging.info("Running bench") - matcher = truvari.Matcher(params) - matcher.params.no_ref = 'a' - outdir = os.path.join(args.benchdir, "phab_bench") - m_bench = truvari.Bench(matcher, phab_vcf, phab_vcf, outdir, reeval_bed) + matcher=truvari.Matcher(params) + matcher.params.no_ref='a' + outdir=os.path.join(args.benchdir, "phab_bench") + m_bench=truvari.Bench(matcher, phab_vcf, phab_vcf, outdir, reeval_bed) m_bench.run() - regions = refined_stratify(outdir, to_eval_coords, regions, args.threads) + regions=refined_stratify(outdir, to_eval_coords, regions, args.threads) - summary = (recount_variant_report(args.benchdir, outdir, regions) + summary=(recount_variant_report(args.benchdir, outdir, regions) if args.recount else make_variant_report(regions)) summary.clean_out() summary.write_json(os.path.join( args.benchdir, 'refine.variant_summary.json')) - report = make_region_report(regions) + report=make_region_report(regions) regions['end'] -= 1 # Undo IntervalTree's correction regions.to_csv(os.path.join( args.benchdir, 'refine.regions.txt'), sep='\t', index=False) diff --git a/truvari/region_vcf_iter.py b/truvari/region_vcf_iter.py index c300970a..cfd29260 100644 --- a/truvari/region_vcf_iter.py +++ b/truvari/region_vcf_iter.py @@ -94,14 +94,13 @@ def include(self, entry): # Filter these early so we don't have to keep checking overlaps if self.max_span is not None and aend - astart > self.max_span: return False - if astart == aend - 1: - return self.tree[entry.chrom].overlaps(astart) + m_ovl = self.tree[entry.chrom].overlap(astart, aend) if len(m_ovl) != 1: return False m_ovl = list(m_ovl)[0] - # Edge case - the variant spans the entire include region - return astart >= m_ovl.begin and aend <= m_ovl.end + end_within = truvari.entry_variant_type(entry) != truvari.SV.INS + return truvari.coords_within(astart, aend, m_ovl.begin, m_ovl.end - 1, end_within) def extend(self, pad): """ diff --git a/truvari/stratify.py b/truvari/stratify.py index b259cfaa..032bfefb 100644 --- a/truvari/stratify.py +++ b/truvari/stratify.py @@ -50,10 +50,8 @@ def count_entries(vcf, chroms, regions, within): chrom, coords = row start, end = coords for entry in vcf.fetch(chrom, start, end): - if within: - ent_start, ent_end = truvari.entry_boundaries(entry) - if not (start <= ent_start and ent_end <= end): - continue + if within and not truvari.entry_within(entry, start, end): + continue counts[idx] += 1 return counts