Skip to content

Commit

Permalink
Merge pull request #161 from pdimens/secondary_deconvolve
Browse files Browse the repository at this point in the history
bugfix - divide-by-zero error
  • Loading branch information
pdimens authored Nov 8, 2024
2 parents ec5b2b2 + 0bdd82a commit ac9a13c
Show file tree
Hide file tree
Showing 28 changed files with 434 additions and 121 deletions.
3 changes: 3 additions & 0 deletions .github/filters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ bwa: &bwa
- 'harpy/bin/count_bx.py'
- 'harpy/bin/assign_mi.py'
- 'harpy/bin/make_windows.py'
- 'harpy/bin/depth_windows.py'
- 'test/fastq/**'
ema: &ema
- *common
Expand All @@ -68,6 +69,7 @@ ema: &ema
- 'harpy/bin/bx_stats.py'
- 'harpy/bin/count_bx.py'
- 'harpy/bin/make_windows.py'
- 'harpy/bin/depth_windows.py'
- 'test/fastq/**'
strobealign: &strobealign
- *common
Expand All @@ -80,6 +82,7 @@ strobealign: &strobealign
- 'harpy/bin/bx_stats.py'
- 'harpy/bin/count_bx.py'
- 'harpy/bin/make_windows.py'
- 'harpy/bin/depth_windows.py'
- 'test/fastq/**'
mpileup: &mpileup
- *common
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ jobs:
shell: micromamba-shell {0}
if: always()
run: |
cp test/bam/sample1.bam test/bam/pineapple.bam && rename_bam -d test/bam/pineapple.bam pineapple1
cp test/bam/sample1.bam test/bam/pineapple.bam && rename_bam.py -d pineapple1 test/bam/pineapple.bam
harpy phase --quiet --vcf-samples -o phasevcf --vcf test/vcf/test.bcf test/bam
leviathan:
Expand Down
11 changes: 6 additions & 5 deletions harpy/_cli_types_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def convert(self, value, param, ctx):
for i in parts:
if int(i) % 2 == 0 or int(i) > 128:
raise ValueError
return [int(i) for i in parts]
# make it a set as a failsafe against duplicates
return list(set(int(i) for i in parts))
except ValueError:
self.fail(f"{value} is not 'auto' or odd integers <128 separated by a comma.", param, ctx)

Expand All @@ -50,7 +51,8 @@ def convert(self, value, param, ctx):
with open(value, "r") as cont_in:
return [i.strip() for i in cont_in.readlines()]
else:
return [i.strip() for i in value.split(',')]
# make it a set as a failsafe against duplicates
return list(set(i.strip() for i in value.split(',')))

class InputFile(click.ParamType):
"""A class for a click type that verifies that a file exists and that it has an expected extension"""
Expand All @@ -64,7 +66,7 @@ def convert(self, value, param, ctx):
"vcf": ["vcf", "bcf", "vcf.gz"],
"gff": [".gff",".gff3"]
}
if self.filetype not in filedict.keys():
if self.filetype not in filedict:
self.fail(f"Extension validation for {self.filetype} is not yet implemented. This error should only appear during development; if you are a user and seeing this, please post an issue on GitHub: https://github.com/pdimens/harpy/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml")
if not os.path.exists(value):
self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx)
Expand Down Expand Up @@ -113,5 +115,4 @@ def convert(self, value, param, ctx):
elif not os.access(f"{value}/config.yaml", os.R_OK):
self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx)
return value

### program-specific extra-params types

2 changes: 1 addition & 1 deletion harpy/_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def check_RG(bamfile):
samplename = Path(bamfile).stem
samview = subprocess.run(f"samtools samples {bamfile}".split(), stdout = subprocess.PIPE).stdout.decode('utf-8').split()
if samplename != samview[0]:
return os.path.basename(i), samview[0]
return os.path.basename(bamfile), samview[0]

with harpy_progressbar(quiet) as progress:
task_progress = progress.add_task("[green]Checking RG tags...", total=len(bamlist))
Expand Down
9 changes: 8 additions & 1 deletion harpy/bin/10xtoHaplotag.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#! /usr/bin/env python
"""Convert 10X style barcodes into Haplotag style ones"""
import gzip
import os
import sys
import gzip
import argparse
from itertools import zip_longest, product

Expand All @@ -20,6 +21,12 @@
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()
err = []
for i in [args.forward, args.reverse, args.barcodes]:
if not os.path.exists(i):
err.append(i)
if err:
parser.error("Some input files were not found on the system:\n" + ", ".join(err))

def process_record(fw_entry, rv_entry):
"""convert the 10X to haplotag"""
Expand Down
15 changes: 6 additions & 9 deletions harpy/bin/assign_mi.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
sys.exit(1)

args = parser.parse_args()
if not os.path.exists(args.input):
parser.error(f"{args.input} was not found")

def write_validbx(bam, alnrecord, mol_id):
'''
Expand Down Expand Up @@ -62,10 +64,9 @@ def write_invalidbx(bam, alnrecord):
at the end and writes it to the output
bam file. Keeps existing MI tag if present.
'''
# will keeping an existing MI tag if present
# may create incorrect molecule associations by chance
# will not keep an existing MI tag if present
# also remove DI because it's not necessary
tags = [j for j in alnrecord.get_tags() if j[0] not in ['DI', 'BX']]
tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
_bx = alnrecord.get_tag("BX")
# if hyphen is present, it's been deconvolved and shouldn't have been
# and rm the hyphen part
Expand All @@ -84,11 +85,7 @@ def write_missingbx(bam, alnrecord):
at the end and writes it to the output
bam file. Removes existing MI tag, if exists.
'''
# get all the tags except MI b/c it's being replaced (if exists)
# this won't write a new MI, but keeping an existing one
# may create incorrect molecule associations by chance
# also remove DI because it's not necessary
# removes BX... just in case. It's not supposed to be there to begin with
# removes MI and DI tags, writes new BX tag
tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
tags.append(("BX", "A00C00B00D00"))
alnrecord.set_tags(tags)
Expand All @@ -105,7 +102,7 @@ def write_missingbx(bam, alnrecord):
MI = 0

if not os.path.exists(bam_input):
print(f"Error: {bam_input} not found", file = sys.stderr)
sys.stderr.write(f"Error: {bam_input} not found\n")
sys.exit(1)

if bam_input.lower().endswith(".bam"):
Expand Down
4 changes: 4 additions & 0 deletions harpy/bin/bx_stats.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#! /usr/bin/env python
"""calculate linked-read metrics from barcode information"""
import os
import re
import sys
import gzip
Expand Down Expand Up @@ -29,6 +30,9 @@
sys.exit(1)

args = parser.parse_args()
if not os.path.exists(args.input):
parser.error(f"{args.input} was not found")

alnfile = pysam.AlignmentFile(args.input)
outfile = gzip.open(args.output, "wb", 6)
outfile.write(b"contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tinsert_len\tcoverage_bp\tcoverage_inserts\n")
Expand Down
5 changes: 3 additions & 2 deletions harpy/bin/check_bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
sys.exit(1)

args = parser.parse_args()
if not os.path.exists(args.input):
parser.error(f"{args.input} was not found")

bam_in = args.input
if bam_in.lower().endswith(".bam"):
Expand Down Expand Up @@ -71,6 +73,5 @@

alnfile.close()


values = [str(i) for i in [os.path.basename(bam_in), NAME_MISMATCH, N_READS, NO_MI, NO_BX, BAD_BX, BX_NOT_LAST]]
print("\t".join(values), file = sys.stdout)
sys.stdout.write("\t".join(values) + "\n")
4 changes: 3 additions & 1 deletion harpy/bin/check_fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
sys.exit(1)

args = parser.parse_args()
if not os.path.exists(args.input):
parser.error(f"{args.input} was not found")

fq_in = args.input

Expand Down Expand Up @@ -61,4 +63,4 @@
NO_BX += 1

values = [str(i) for i in [os.path.basename(fq_in), N_READS, NO_BX, BAD_BX, BAD_SAM_SPEC, BX_NOT_LAST]]
print("\t".join(values), file = sys.stdout)
sys.stdout.write("\t".join(values) + "\n")
62 changes: 49 additions & 13 deletions harpy/bin/concatenate_bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
from pathlib import Path
import argparse
from itertools import product
import pysam

parser = argparse.ArgumentParser(
Expand All @@ -12,27 +13,28 @@
"""
Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample.
This is a means of accomplishing the same as \'samtools cat\', except all MI (Molecule Identifier) tags are updated
so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide
all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option.
so individuals don't have overlapping MI tags (which would mess up all the linked-read info). You can either provide
all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option. Use the \'--bx\'
option to also rewrite BX tags such that they are unique between samples too.
""",
usage = "concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam",
usage = "concatenate_bam.py [--bx] -o output.bam file_1.bam..file_N.bam",
exit_on_error = False
)
parser.add_argument("alignments", nargs='*', help = "SAM or BAM files")
parser.add_argument("-o", "--out", required = True, type = str, help = "Name of BAM output file")
parser.add_argument("-b", "--bamlist", required = False, type = str, help = "List of SAM or BAM files to concatenate")
parser.add_argument("--bx", dest = "bx_unique", action='store_true', help="Also rewrite BX tags for uniqueness")

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()

if (args.alignments and args.bamlist):
print("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):", file = sys.stderr)
print("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n", file = sys.stderr)

print("Alternatively, provide the files after \'-o output.bam\':", file = sys.stderr)
print("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam", file = sys.stderr)

sys.stderr.write("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):\n")
sys.stderr.write("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n\n")
sys.stderr.write("Alternatively, provide the files after \'-o output.bam\':\n",)
sys.stderr.write("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam\n")
sys.exit(1)

if args.bamlist:
Expand All @@ -45,6 +47,17 @@
else:
aln_list = args.alignments

# validate files exist
err = []
for i in aln_list:
if not os.path.exists(i):
err.append(i)
if err:
parser.error("Some input files were not found on the system:\n" + ", ".join(err))

# Get the max number of unique haplotag barcodes
haplotag_limit = 96**4

# instantiate output file
if aln_list[0].lower().endswith(".bam"):
if not os.path.exists(f"{aln_list[0]}.bai"):
Expand All @@ -69,10 +82,18 @@
header['RG'][0]['ID'] = Path(args.out).stem
header['RG'][0]['SM'] = Path(args.out).stem

# set up a generator for the BX tags if --bx was invoked
if args.bx_unique:
bc_range = [f"{i}".zfill(2) for i in range(1,97)]
bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)

with pysam.AlignmentFile(args.out, "wb", header = header) as bam_out:
# current available unused MI tag
MI_NEW = 1

if args.bx_unique:
BX_NEW = "".join(next(bc_generator))
else:
BX_NEW = None
# iterate through the bam files
for xam in aln_list:
# create MI dict for this sample
Expand All @@ -90,14 +111,29 @@
mi = record.get_tag("MI")
# if previously converted for this sample, use that
if mi in MI_LOCAL:
record.set_tag("MI", MI_LOCAL[mi])
record.set_tag("MI", MI_LOCAL[mi][0])
if args.bx_unique:
record.set_tag("BX", MI_LOCAL[mi][1])
else:
record.set_tag("MI", MI_NEW)
if args.bx_unique:
record.set_tag("BX", BX_NEW)
# add to sample conversion dict
MI_LOCAL[mi] = MI_NEW
MI_LOCAL[mi] = [MI_NEW, BX_NEW]
# increment to next unique MI
MI_NEW += 1
except:
if args.bx_unique:
if MI_NEW > haplotag_limit:
raise IndexError
BX_NEW = "".join(next(bc_generator))
except IndexError:
errtext = f"Error:\nNumber of unique molecules exceeds the number of possible unique haplotag barcodes (96^4 = {haplotag_limit}). "
errtext += "Consider pooling fewer individuals per group.\n\nIf this concatenation was performed as part of the harpy sv leviathan workflow, "
errtext += "there is a limitation in LEVIATHAN where it does not recognize hyphenated (deconvolved) linked-read barcodes, which necessitates using all possible unique standard haplotag barcodes. "
errtext += "Consider using the 'sv naibr' workflow, which uses unique MI tags instead.\n"
sys.stderr.write(errtext)
sys.exit(1)
except KeyError:
pass
bam_out.write(record)
if DELETE_INDEX:
Expand Down
22 changes: 11 additions & 11 deletions harpy/bin/count_bx.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#! /usr/bin/env python
"""parse a fastq file to count BX stats"""
import os
import re
import sys
import argparse
Expand All @@ -24,15 +25,14 @@
sys.exit(1)

args = parser.parse_args()
if not os.path.exists(args.input):
parser.error(f"{args.input} was not found")

N_READS = 0
N_BX = 0
N_VALID = 0
# haplotag = re.compile("([A-Z]\d{2,}){3,}")
haplotag = re.compile('A[0-9]{2}C[0-9]{2}B[0-9]{2}D[0-9]{2}')
# invalid = re.compile('[A-Z]00')
invalid = re.compile('[AaBbCcDd]00')
# inv_dict = {}
inv_dict = {
"A" : 0,
"B" : 0,
Expand All @@ -58,11 +58,11 @@
continue
N_VALID += 1

print(f"totalReads\t{N_READS}", file = sys.stdout)
print(f"bxTagCount\t{N_BX}", file = sys.stdout)
print(f"bxValid\t{N_VALID}", file = sys.stdout)
print(f"bxInvalid\t{N_BX - N_VALID}", file = sys.stdout)
print("A00\t",str(inv_dict["A"]), file = sys.stdout)
print("C00\t",str(inv_dict["C"]), file = sys.stdout)
print("B00\t",str(inv_dict["B"]), file = sys.stdout)
print("D00\t",str(inv_dict["D"]), file = sys.stdout)
sys.stdout.write(f"totalReads\t{N_READS}\n")
sys.stdout.write(f"bxTagCount\t{N_BX}\n")
sys.stdout.write(f"bxValid\t{N_VALID}\n")
sys.stdout.write(f"bxInvalid\t{N_BX - N_VALID}\n")
sys.stdout.write(f"A00\t{inv_dict['A']}\n")
sys.stdout.write(f"C00\t{inv_dict['C']}\n")
sys.stdout.write(f"B00\t{inv_dict['B']}\n")
sys.stdout.write(f"D00\t{inv_dict['D']}\n")
Loading

0 comments on commit ac9a13c

Please sign in to comment.