Skip to content

Commit

Permalink
region fixing
Browse files Browse the repository at this point in the history
Anytime we're using IntervalTree, we have to add 1 to the end.
Anytime we're moving out of IntervalTree, we must remove the one.
  • Loading branch information
ACEnglish committed Feb 5, 2024
1 parent 4ddaa91 commit 6bc38fe
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 10 deletions.
2 changes: 1 addition & 1 deletion repo_utils/run_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
truv_ans[truvari.entry_to_key(entry)] = True

vcf.reset()
for entry in v:
for entry in vcf:
state = entry.info['include'] == 'in'
assert state == truv_ans[truvari.entry_to_key(entry)], f"Bad Boundary {str(entry)}"

Expand Down
2 changes: 1 addition & 1 deletion truvari/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ def check_refine_candidate(self, result):
if match.comp is not None:
chrom = match.comp.chrom
pos.extend(truvari.entry_boundaries(match.comp))
if has_unmatched and pos: # I don't think I need to confirm pos, but unsure
if has_unmatched and pos:
self.refine_candidates.append(f"{chrom}\t{min(*pos)}\t{max(*pos)}")


Expand Down
8 changes: 3 additions & 5 deletions truvari/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def resolve_regions(params, args):
reeval_trees, count = truvari.build_anno_tree(args.regions, idxfmt="")
logging.info("%d --regions loaded", count)

return [[chrom, intv.begin, intv.end]
return [[chrom, intv.begin, intv.end - 1]
for chrom, all_intv in reeval_trees.items()
for intv in sorted(all_intv)]

Expand Down Expand Up @@ -176,15 +176,14 @@ def recount_variant_report(orig_dir, phab_dir, regions):
"""
tree = defaultdict(IntervalTree)
n_regions = regions[regions["refined"]].copy()
n_regions['start'] -= PHAB_BUFFER
n_regions['end'] += PHAB_BUFFER
for _, row in n_regions.iterrows():
tree[row['chrom']].addi(row['start'], row['end'])
tree[row['chrom']].addi(row['start'], row['end'] + 1)

summary = truvari.StatsBox()
with open(os.path.join(phab_dir, "summary.json")) as fh:
summary.update(json.load(fh))

# Adding the original counts to the updated phab counts
vcf = pysam.VariantFile(os.path.join(orig_dir, 'tp-base.vcf.gz'))
tpb = len(list(truvari.region_filter(vcf, tree, False)))
summary["TP-base"] += tpb
Expand Down Expand Up @@ -430,7 +429,6 @@ def refine_main(cmdargs):
args.benchdir, 'refine.variant_summary.json'))

report = make_region_report(regions)
regions['end'] -= 1 # Undo IntervalTree's correction
regions.to_csv(os.path.join(
args.benchdir, 'refine.regions.txt'), sep='\t', index=False)
with open(os.path.join(args.benchdir, "refine.region_summary.json"), 'w') as fout:
Expand Down
13 changes: 10 additions & 3 deletions truvari/region_vcf_iter.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ def build_anno_tree(filename, chrom_col=0, start_col=1, end_col=2, one_based=Fal
:type `end_col`: integer, optional
:param `one_based`: True if coordinates are one-based
:type `one_based`: bool, optional
:param `is_pyintv`: add 1 to end position to correct pyintervaltree.overlap behavior
:type `is_pyintv`: bool, optional
:param `comment`: ignore lines if they start with this string
:type `comment`: string, optional
:param `idxfmt`: Index of column in file with chromosome
Expand Down Expand Up @@ -174,6 +176,9 @@ def region_filter(vcf, tree, inside=True):
cur_intv = cur_tree.popleft()
except IndexError:
# region-less chromosome
if not inside:
for entry in vcf.fetch(chrom):
yield entry
continue

cur_iter = vcf.fetch(chrom)
Expand All @@ -186,10 +191,12 @@ def region_filter(vcf, tree, inside=True):

while True:
# if start is after this interval, we need the next interval
if cur_start > cur_intv.end - 1:
if cur_start > cur_intv.end:
try:
cur_intv = cur_tree.popleft()
except IndexError:
if not inside:
yield cur_entry # pass this back before flush after the while
break
# well before, we need the next entry
elif cur_end < cur_intv.begin:
Expand All @@ -207,11 +214,11 @@ def region_filter(vcf, tree, inside=True):
yield cur_entry
try:
cur_entry = next(cur_iter)
cur_stop, cur_end = truvari.entry_boundaries(cur_entry)
cur_start, cur_end = truvari.entry_boundaries(cur_entry)
except StopIteration:
break

# if we finished the intervals first, need to flush the rest of the outside entries
if not inside:
if not inside:
for entry in cur_iter:
yield entry

0 comments on commit 6bc38fe

Please sign in to comment.