diff --git a/.github/filters.yml b/.github/filters.yml index b73bd48e4..b248390a2 100644 --- a/.github/filters.yml +++ b/.github/filters.yml @@ -57,6 +57,7 @@ bwa: &bwa - 'harpy/bin/count_bx.py' - 'harpy/bin/assign_mi.py' - 'harpy/bin/make_windows.py' + - 'harpy/bin/depth_windows.py' - 'test/fastq/**' ema: &ema - *common @@ -68,6 +69,7 @@ ema: &ema - 'harpy/bin/bx_stats.py' - 'harpy/bin/count_bx.py' - 'harpy/bin/make_windows.py' + - 'harpy/bin/depth_windows.py' - 'test/fastq/**' strobealign: &strobealign - *common @@ -80,6 +82,7 @@ strobealign: &strobealign - 'harpy/bin/bx_stats.py' - 'harpy/bin/count_bx.py' - 'harpy/bin/make_windows.py' + - 'harpy/bin/depth_windows.py' - 'test/fastq/**' mpileup: &mpileup - *common diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 82935dbcf..2a3a5c944 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -553,7 +553,7 @@ jobs: shell: micromamba-shell {0} if: always() run: | - cp test/bam/sample1.bam test/bam/pineapple.bam && rename_bam -d test/bam/pineapple.bam pineapple1 + cp test/bam/sample1.bam test/bam/pineapple.bam && rename_bam.py -d pineapple1 test/bam/pineapple.bam harpy phase --quiet --vcf-samples -o phasevcf --vcf test/vcf/test.bcf test/bam leviathan: diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py index e28a3ead2..77d01bf25 100644 --- a/harpy/_cli_types_generic.py +++ b/harpy/_cli_types_generic.py @@ -35,7 +35,8 @@ def convert(self, value, param, ctx): for i in parts: if int(i) % 2 == 0 or int(i) > 128: raise ValueError - return [int(i) for i in parts] + # make it a set as a failsafe against duplicates + return list(set(int(i) for i in parts)) except ValueError: self.fail(f"{value} is not 'auto' or odd integers <128 separated by a comma.", param, ctx) @@ -50,7 +51,8 @@ def convert(self, value, param, ctx): with open(value, "r") as cont_in: return [i.strip() for i in cont_in.readlines()] else: - return [i.strip() for i in value.split(',')] + # make it a set as a failsafe against duplicates + return list(set(i.strip() for i in value.split(','))) class InputFile(click.ParamType): """A class for a click type that verifies that a file exists and that it has an expected extension""" @@ -64,7 +66,7 @@ def convert(self, value, param, ctx): "vcf": ["vcf", "bcf", "vcf.gz"], "gff": [".gff",".gff3"] } - if self.filetype not in filedict.keys(): + if self.filetype not in filedict: self.fail(f"Extension validation for {self.filetype} is not yet implemented. This error should only appear during development; if you are a user and seeing this, please post an issue on GitHub: https://github.com/pdimens/harpy/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml") if not os.path.exists(value): self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) @@ -113,5 +115,4 @@ def convert(self, value, param, ctx): elif not os.access(f"{value}/config.yaml", os.R_OK): self.fail(f"{value}/config.yaml does not have read access. 
Please check the file permissions and try again.", param, ctx) return value - -### program-specific extra-params types + \ No newline at end of file diff --git a/harpy/_validations.py b/harpy/_validations.py index 728304954..8ca7fef1a 100644 --- a/harpy/_validations.py +++ b/harpy/_validations.py @@ -167,7 +167,7 @@ def check_RG(bamfile): samplename = Path(bamfile).stem samview = subprocess.run(f"samtools samples {bamfile}".split(), stdout = subprocess.PIPE).stdout.decode('utf-8').split() if samplename != samview[0]: - return os.path.basename(i), samview[0] + return os.path.basename(bamfile), samview[0] with harpy_progressbar(quiet) as progress: task_progress = progress.add_task("[green]Checking RG tags...", total=len(bamlist)) diff --git a/harpy/bin/10xtoHaplotag.py b/harpy/bin/10xtoHaplotag.py index 9855cb43f..8d50f3b7c 100755 --- a/harpy/bin/10xtoHaplotag.py +++ b/harpy/bin/10xtoHaplotag.py @@ -1,7 +1,8 @@ #! /usr/bin/env python """Convert 10X style barcodes into Haplotag style ones""" -import gzip +import os import sys +import gzip import argparse from itertools import zip_longest, product @@ -20,6 +21,12 @@ parser.print_help(sys.stderr) sys.exit(1) args = parser.parse_args() +err = [] +for i in [args.forward, args.reverse, args.barcodes]: + if not os.path.exists(i): + err.append(i) +if err: + parser.error("Some input files were not found on the system:\n" + ", ".join(err)) def process_record(fw_entry, rv_entry): """convert the 10X to haplotag""" diff --git a/harpy/bin/assign_mi.py b/harpy/bin/assign_mi.py index f3cad54b2..d0c4182f1 100755 --- a/harpy/bin/assign_mi.py +++ b/harpy/bin/assign_mi.py @@ -29,6 +29,8 @@ sys.exit(1) args = parser.parse_args() +if not os.path.exists(args.input): + parser.error(f"{args.input} was not found") def write_validbx(bam, alnrecord, mol_id): ''' @@ -62,10 +64,9 @@ def write_invalidbx(bam, alnrecord): at the end and writes it to the output bam file. Keeps existing MI tag if present. ''' - # will keeping an existing MI tag if present - # may create incorrect molecule associations by chance + # will not keep an existing MI tag if present # also remove DI because it's not necessary - tags = [j for j in alnrecord.get_tags() if j[0] not in ['DI', 'BX']] + tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']] _bx = alnrecord.get_tag("BX") # if hyphen is present, it's been deconvolved and shouldn't have been # and rm the hyphen part @@ -84,11 +85,7 @@ def write_missingbx(bam, alnrecord): at the end and writes it to the output bam file. Removes existing MI tag, if exists. ''' - # get all the tags except MI b/c it's being replaced (if exists) - # this won't write a new MI, but keeping an existing one - # may create incorrect molecule associations by chance - # also remove DI because it's not necessary - # removes BX... just in case. It's not supposed to be there to begin with + # removes MI and DI tags, writes new BX tag tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']] tags.append(("BX", "A00C00B00D00")) alnrecord.set_tags(tags) @@ -105,7 +102,7 @@ def write_missingbx(bam, alnrecord): MI = 0 if not os.path.exists(bam_input): - print(f"Error: {bam_input} not found", file = sys.stderr) + sys.stderr.write(f"Error: {bam_input} not found\n") sys.exit(1) if bam_input.lower().endswith(".bam"): diff --git a/harpy/bin/bx_stats.py b/harpy/bin/bx_stats.py index 495a9f259..b46176a5d 100755 --- a/harpy/bin/bx_stats.py +++ b/harpy/bin/bx_stats.py @@ -1,5 +1,6 @@ #! 
/usr/bin/env python """calculate linked-read metrics from barcode information""" +import os import re import sys import gzip @@ -29,6 +30,9 @@ sys.exit(1) args = parser.parse_args() +if not os.path.exists(args.input): + parser.error(f"{args.input} was not found") + alnfile = pysam.AlignmentFile(args.input) outfile = gzip.open(args.output, "wb", 6) outfile.write(b"contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tinsert_len\tcoverage_bp\tcoverage_inserts\n") diff --git a/harpy/bin/check_bam.py b/harpy/bin/check_bam.py index cf636d355..64fa4d84d 100755 --- a/harpy/bin/check_bam.py +++ b/harpy/bin/check_bam.py @@ -26,6 +26,8 @@ sys.exit(1) args = parser.parse_args() +if not os.path.exists(args.input): + parser.error(f"{args.input} was not found") bam_in = args.input if bam_in.lower().endswith(".bam"): @@ -71,6 +73,5 @@ alnfile.close() - values = [str(i) for i in [os.path.basename(bam_in), NAME_MISMATCH, N_READS, NO_MI, NO_BX, BAD_BX, BX_NOT_LAST]] -print("\t".join(values), file = sys.stdout) +sys.stdout.write("\t".join(values) + "\n") diff --git a/harpy/bin/check_fastq.py b/harpy/bin/check_fastq.py index 64fcb9eca..695bc6924 100755 --- a/harpy/bin/check_fastq.py +++ b/harpy/bin/check_fastq.py @@ -25,6 +25,8 @@ sys.exit(1) args = parser.parse_args() +if not os.path.exists(args.input): + parser.error(f"{args.input} was not found") fq_in = args.input @@ -61,4 +63,4 @@ NO_BX += 1 values = [str(i) for i in [os.path.basename(fq_in), N_READS, NO_BX, BAD_BX, BAD_SAM_SPEC, BX_NOT_LAST]] -print("\t".join(values), file = sys.stdout) +sys.stdout.write("\t".join(values) + "\n") diff --git a/harpy/bin/concatenate_bam.py b/harpy/bin/concatenate_bam.py index a56381ddb..b3e2418ae 100755 --- a/harpy/bin/concatenate_bam.py +++ b/harpy/bin/concatenate_bam.py @@ -4,6 +4,7 @@ import sys from pathlib import Path import argparse +from itertools import product import pysam parser = argparse.ArgumentParser( @@ -12,27 +13,28 @@ """ Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample. This is a means of accomplishing the same as \'samtools cat\', except all MI (Molecule Identifier) tags are updated - so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide - all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option. + so individuals don't have overlapping MI tags (which would mess up all the linked-read info). You can either provide + all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option. Use the \'--bx\' + option to also rewrite BX tags such that they are unique between samples too. 
""", - usage = "concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam", + usage = "concatenate_bam.py [--bx] -o output.bam file_1.bam..file_N.bam", exit_on_error = False ) parser.add_argument("alignments", nargs='*', help = "SAM or BAM files") parser.add_argument("-o", "--out", required = True, type = str, help = "Name of BAM output file") parser.add_argument("-b", "--bamlist", required = False, type = str, help = "List of SAM or BAM files to concatenate") +parser.add_argument("--bx", dest = "bx_unique", action='store_true', help="Also rewrite BX tags for uniqueness") + if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) args = parser.parse_args() if (args.alignments and args.bamlist): - print("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):", file = sys.stderr) - print("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n", file = sys.stderr) - - print("Alternatively, provide the files after \'-o output.bam\':", file = sys.stderr) - print("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam", file = sys.stderr) - + sys.stderr.write("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):\n") + sys.stderr.write("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n\n") + sys.stderr.write("Alternatively, provide the files after \'-o output.bam\':\n",) + sys.stderr.write("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam\n") sys.exit(1) if args.bamlist: @@ -45,6 +47,17 @@ else: aln_list = args.alignments +# validate files exist +err = [] +for i in aln_list: + if not os.path.exists(i): + err.append(i) +if err: + parser.error("Some input files were not found on the system:\n" + ", ".join(err)) + +# Get the max number of unique haplotag barcodes +haplotag_limit = 96**4 + # instantiate output file if aln_list[0].lower().endswith(".bam"): if not os.path.exists(f"{aln_list[0]}.bai"): @@ -69,10 +82,18 @@ header['RG'][0]['ID'] = Path(args.out).stem header['RG'][0]['SM'] = Path(args.out).stem +# set up a generator for the BX tags if --bx was invoked +if args.bx_unique: + bc_range = [f"{i}".zfill(2) for i in range(1,97)] + bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range) + with pysam.AlignmentFile(args.out, "wb", header = header) as bam_out: # current available unused MI tag MI_NEW = 1 - + if args.bx_unique: + BX_NEW = "".join(next(bc_generator)) + else: + BX_NEW = None # iterate through the bam files for xam in aln_list: # create MI dict for this sample @@ -90,14 +111,29 @@ mi = record.get_tag("MI") # if previously converted for this sample, use that if mi in MI_LOCAL: - record.set_tag("MI", MI_LOCAL[mi]) + record.set_tag("MI", MI_LOCAL[mi][0]) + if args.bx_unique: + record.set_tag("BX", MI_LOCAL[mi][1]) else: record.set_tag("MI", MI_NEW) + if args.bx_unique: + record.set_tag("BX", BX_NEW) # add to sample conversion dict - MI_LOCAL[mi] = MI_NEW + MI_LOCAL[mi] = [MI_NEW, BX_NEW] # increment to next unique MI MI_NEW += 1 - except: + if args.bx_unique: + if MI_NEW > haplotag_limit: + raise IndexError + BX_NEW = "".join(next(bc_generator)) + except IndexError: + errtext = f"Error:\nNumber of unique molecules exceeds the number of possible unique haplotag barcodes (96^4 = {haplotag_limit}). 
" + errtext += "Consider pooling fewer individuals per group.\n\nIf this concatenation was performed as part of the harpy sv leviathan workflow, " + errtext += "there is a limitation in LEVIATHAN where it does not recognize hyphenated (deconvolved) linked-read barcodes, which necessitates using all possible unique standard haplotag barcodes. " + errtext += "Consider using the 'sv naibr' workflow, which uses unique MI tags instead.\n" + sys.stderr.write(errtext) + sys.exit(1) + except KeyError: pass bam_out.write(record) if DELETE_INDEX: diff --git a/harpy/bin/count_bx.py b/harpy/bin/count_bx.py index 3341800c6..5bf64efd1 100755 --- a/harpy/bin/count_bx.py +++ b/harpy/bin/count_bx.py @@ -1,5 +1,6 @@ #! /usr/bin/env python """parse a fastq file to count BX stats""" +import os import re import sys import argparse @@ -24,15 +25,14 @@ sys.exit(1) args = parser.parse_args() +if not os.path.exists(args.input): + parser.error(f"{args.input} was not found") N_READS = 0 N_BX = 0 N_VALID = 0 -# haplotag = re.compile("([A-Z]\d{2,}){3,}") haplotag = re.compile('A[0-9]{2}C[0-9]{2}B[0-9]{2}D[0-9]{2}') -# invalid = re.compile('[A-Z]00') invalid = re.compile('[AaBbCcDd]00') -# inv_dict = {} inv_dict = { "A" : 0, "B" : 0, @@ -58,11 +58,11 @@ continue N_VALID += 1 -print(f"totalReads\t{N_READS}", file = sys.stdout) -print(f"bxTagCount\t{N_BX}", file = sys.stdout) -print(f"bxValid\t{N_VALID}", file = sys.stdout) -print(f"bxInvalid\t{N_BX - N_VALID}", file = sys.stdout) -print("A00\t",str(inv_dict["A"]), file = sys.stdout) -print("C00\t",str(inv_dict["C"]), file = sys.stdout) -print("B00\t",str(inv_dict["B"]), file = sys.stdout) -print("D00\t",str(inv_dict["D"]), file = sys.stdout) +sys.stdout.write(f"totalReads\t{N_READS}\n") +sys.stdout.write(f"bxTagCount\t{N_BX}\n") +sys.stdout.write(f"bxValid\t{N_VALID}\n") +sys.stdout.write(f"bxInvalid\t{N_BX - N_VALID}\n") +sys.stdout.write(f"A00\t{inv_dict['A']}\n") +sys.stdout.write(f"C00\t{inv_dict['C']}\n") +sys.stdout.write(f"B00\t{inv_dict['B']}\n") +sys.stdout.write(f"D00\t{inv_dict['D']}\n") diff --git a/harpy/bin/deconvolve_alignments.py b/harpy/bin/deconvolve_alignments.py new file mode 100755 index 000000000..007ee1ba5 --- /dev/null +++ b/harpy/bin/deconvolve_alignments.py @@ -0,0 +1,210 @@ +#! /usr/bin/env python +"""deconvolve BX-tagged barcodes and assign molecular identifier (MI) tags to alignments based on distance and barcode""" +import os +import re +import sys +import argparse +import pysam + +parser = argparse.ArgumentParser( + prog = 'deconvolve_alignments.py', + description = + """ + Deconvolve BX-tagged barcodes and assign an MI:i: (Molecular Identifier) + tag to each barcoded record based on a molecular distance cutoff. Unmapped records + are discarded in the output. Records without a BX:Z: tag or + with an invalid barcode (00 as one of its segments) are presevered + but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED. 
+ """, + usage = "deconvolve_alignments.py -c cutoff -o output.bam input.bam", + exit_on_error = False + ) + +parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules (default: 100000)") +parser.add_argument('-o', '--output', help = "Output bam file") +parser.add_argument('input', help = "Input coordinate-sorted bam/sam file") + +if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + +args = parser.parse_args() +if not os.path.exists(args.input): + parser.error(f"{args.input} was not found") + +def write_validbx(bam, alnrecord, bx_tag, mol_id): + ''' + bam: the output bam + alnrecord: the pysam alignment record + mol_id: the "mol_id" entry of a barcode dictionary + Formats an alignment record to include the MI tag + and the BX at the end and writes it to the output + bam file. Replaces existing MI tag, if exists. + ''' + # get all the tags except MI b/c it's being replaced (if exists) + # will manually parse BX, so omit that too + # also remove DI because it's not necessary + tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']] + tags.append(("MI", mol_id)) + tags.append(("BX", bx_tag)) + alnrecord.set_tags(tags) + # write record to output file + bam.write(alnrecord) + +def write_invalidbx(bam, alnrecord): + ''' + bam: the output bam + alnrecord: the pysam alignment record + Formats an alignment record to include the BX + at the end and writes it to the output + bam file. Keeps existing MI tag if present. + ''' + # will not keep an existing MI tag if present + # also remove DI because it's not necessary + tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']] + _bx = alnrecord.get_tag("BX") + # if hyphen is present, it's been deconvolved and shouldn't have been + # and rm the hyphen part + bx_clean = _bx.split("-")[0] + tags.append(("BX", bx_clean)) + # update the record's tags + alnrecord.set_tags(tags) + # write record to output file + bam.write(alnrecord) + +def write_missingbx(bam, alnrecord): + ''' + bam: the output bam + alnrecord: the pysam alignment record + Formats an alignment record to include invalid BX + at the end and writes it to the output + bam file. Removes existing MI tag, if exists. 
+    '''
+    # removes MI and DI tags, writes new BX tag
+    tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
+    tags.append(("BX", "A00C00B00D00"))
+    alnrecord.set_tags(tags)
+    # write record to output file
+    bam.write(alnrecord)
+
+bam_input = args.input
+# initialize the dict
+d = {}
+# LAST_CONTIG keeps track of the last contig so we can
+# clear the dict when it's a new contig/chromosome
+LAST_CONTIG = False
+# MI is the name of the current molecule, starting at 1 (0+1)
+MI = 0
+
+if not os.path.exists(bam_input):
+    sys.stderr.write(f"Error: {bam_input} not found\n")
+    sys.exit(1)
+
+if bam_input.lower().endswith(".bam") and not os.path.exists(bam_input + ".bai"):
+    try:
+        pysam.index(bam_input)
+    except (OSError, pysam.SamtoolsError) as e:
+        sys.stderr.write(f"Error indexing BAM file: {e}\n")
+        sys.exit(1)
+# initialize input/output files
+alnfile = pysam.AlignmentFile(bam_input)
+outfile = pysam.AlignmentFile(args.output, "wb", template = alnfile)
+
+for record in alnfile.fetch():
+    chrm = record.reference_name
+    bp = record.query_alignment_length
+    # check if the current chromosome is different from the previous one
+    # if so, empty the dict (a consideration for RAM usage)
+    if LAST_CONTIG is not False and chrm != LAST_CONTIG:
+        d = {}
+    if record.is_unmapped:
+        # skip, don't output
+        LAST_CONTIG = chrm
+        continue
+
+    try:
+        bx = record.get_tag("BX")
+        # do a regex search to find X00 pattern in the BX
+        if re.search("[ABCD]0{2,4}", bx):
+            # if found, invalid
+            write_invalidbx(outfile, record)
+            LAST_CONTIG = chrm
+            continue
+    except KeyError:
+        # There is no bx tag
+        write_missingbx(outfile, record)
+        LAST_CONTIG = chrm
+        continue
+
+    aln = record.get_blocks()
+    if not aln:
+        # unaligned, skip and don't output
+        LAST_CONTIG = chrm
+        continue
+
+    # logic to accommodate split records
+    # start position of first alignment
+    pos_start = aln[0][0]
+    # end position of last alignment
+    pos_end = aln[-1][1]
+
+    # create bx entry if it's not present
+    if bx not in d:
+        # increment MI b/c it's a new molecule
+        MI += 1
+        d[bx] = {
+            "lastpos" : pos_end,
+            "current_suffix": 0,
+            "mol_id": MI
+        }
+        # write and move on
+        write_validbx(outfile, record, bx, MI)
+        LAST_CONTIG = chrm
+        continue
+
+    # store the original barcode as `orig` b/c we might need to suffix it
+    orig = bx
+    # if there is a suffix, append it to the barcode name
+    if d[orig]["current_suffix"] > 0:
+        bx = orig + "-" + str(d[orig]["current_suffix"])
+
+    # distance from last alignment = current aln start - previous aln end
+    dist = pos_start - d[bx]["lastpos"]
+    # if the distance between alignments is > cutoff, it's a different molecule
+    # so we'll +1 the suffix of the original barcode and relabel this one as
+    # BX + suffix. Since it's a new entry, we initialize it and move on
+    if dist > args.cutoff:
+        # increment MI b/c it's a new molecule
+        MI += 1
+        # increment original barcode's suffix
+        d[orig]["current_suffix"] += 1
+        bx = orig + "-" + str(d[orig]["current_suffix"])
+        # add new entry for new suffixed barcode with unique MI
+        d[bx] = {
+            "lastpos" : pos_end,
+            "current_suffix": 0,
+            "mol_id": MI
+        }
+        # write and move on
+        write_validbx(outfile, record, bx, MI)
+        LAST_CONTIG = chrm
+        continue
+
+    if record.is_reverse or (record.is_forward and not record.is_paired):
+        # set the last position to be the end of current alignment
+        d[bx]["lastpos"] = pos_end
+
+    # if it hasn't moved on by now, it's a record for an
+    # existing barcode/molecule. Write the record.
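To make the cutoff branching above concrete, a hypothetical single-end trace (cutoff = 100000; single-end so "lastpos" advances on every record; positions are illustrative, not from real data):

    # aln 1, BX=A01C01B01D01, blocks 1000-1150    -> new barcode: MI=1, lastpos=1150
    # aln 2, same BX, blocks 5000-5150            -> dist = 5000-1150 = 3850 <= cutoff: keeps MI=1
    # aln 3, same BX, blocks 250000-250150        -> dist = 250000-5150 = 244850 > cutoff:
    #        suffix increments, record written as BX=A01C01B01D01-1 with MI=2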
+    write_validbx(outfile, record, bx, d[bx]["mol_id"])
+
+    # update the chromosome tracker
+    LAST_CONTIG = chrm
+
+alnfile.close()
+# index the output file
+try:
+    pysam.index(args.output)
+except (OSError, pysam.SamtoolsError) as e:
+    sys.stderr.write(f"Error indexing output BAM file: {e}\n")
+    sys.exit(1)
diff --git a/harpy/bin/depth_windows.py b/harpy/bin/depth_windows.py
index e387dffb3..c1380a231 100755
--- a/harpy/bin/depth_windows.py
+++ b/harpy/bin/depth_windows.py
@@ -5,11 +5,24 @@
 
 parser = argparse.ArgumentParser(
     prog = 'depth_windows.py',
-    description = 'Reads the output of samtools depth -a from stdin and calculates a windowed mean.'
+    description = 'Reads the output of samtools depth -a from stdin and calculates a windowed mean',
+    usage = "samtools depth -a file.bam | depth_windows.py windowsize > output.txt",
     )
-parser.add_argument('windowsize', type= int, help = "The window size to use to calcualte mean depth over (non-overlapping)")
+parser.add_argument('windowsize', type= int, help = "The window size to calculate mean depth over (non-overlapping)")
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
 
 args = parser.parse_args()
+if args.windowsize < 1:
+    parser.error("Error: window size must be greater than 0")
+if args.windowsize == 1:
+    # just print the input to output
+    for line in sys.stdin:
+        sys.stdout.write(line)
+    sys.exit(0)
+
+
 _SUM = 0
 START = 1
 END = args.windowsize
@@ -23,7 +36,9 @@
     # the contig has changed, make the END POSITION the last POSITION, print output
     if LAST_CONTIG and contig != LAST_CONTIG:
         WINSIZE = (POSITION + 1) - START
-        print(f"{LAST_CONTIG}\t{POSITION}\t{_SUM / WINSIZE}", file = sys.stdout)
+        if WINSIZE > 0:
+            depth = _SUM / WINSIZE
+            sys.stdout.write(f"{LAST_CONTIG}\t{POSITION}\t{depth}\n")
         # reset the window START/END and sum
         _SUM = 0
         START = 1
@@ -33,7 +48,8 @@
     _SUM += int(line[2])
 
     if POSITION == END:
-        print(f"{contig}\t{END}\t{_SUM / args.windowsize}", file = sys.stdout)
+        depth = _SUM / args.windowsize
+        sys.stdout.write(f"{contig}\t{END}\t{depth}\n")
         # reset the window START/END and sum
         _SUM = 0
         START = END + 1
diff --git a/harpy/bin/haplotag_acbd.py b/harpy/bin/haplotag_acbd.py
index 9b49df23f..d3a26e48e 100755
--- a/harpy/bin/haplotag_acbd.py
+++ b/harpy/bin/haplotag_acbd.py
@@ -1,8 +1,23 @@
 #! 
/usr/bin/env python """Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes""" +import os import sys +import argparse -outdir = sys.argv[1].rstrip("/") +parser = argparse.ArgumentParser( + prog = 'haplotag_acbd.py', + description ="Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes", + usage = "haplotag_acbd.py output_directory", + exit_on_error = False + ) +parser.add_argument("output_directory", type = str, help = "Directory to create barcode files") +if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + +args = parser.parse_args() +outdir = args.output_directory.rstrip("/") +os.makedirs(outdir, exist_ok = True) BX = { "A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"], diff --git a/harpy/bin/infer_sv.py b/harpy/bin/infer_sv.py index 121fc91fe..718316a29 100755 --- a/harpy/bin/infer_sv.py +++ b/harpy/bin/infer_sv.py @@ -1,7 +1,8 @@ #! /usr/bin/env python """Create column in NAIBR bedpe output inferring the SV type from the orientation""" -import argparse +import os import sys +import argparse parser = argparse.ArgumentParser( prog = 'infer_sv.py', @@ -11,13 +12,15 @@ ) parser.add_argument("bedfile", help = "Input bedpe file containing the output of NAIBR.") -parser.add_argument("-f", "--fail", dest = "failfile", required = False, type=str, metavar = "", help="output variants who fail filtering into separate file") +parser.add_argument("-f", "--fail", dest = "failfile", type=str, metavar = "fail.bedpe", help="output variants who fail filtering into separate file") if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) args = parser.parse_args() +if not os.path.exists(args.bedfile): + parser.error(f"{args.bedfile} was not found") conversions = { "+-": "deletion", diff --git a/harpy/bin/make_windows.py b/harpy/bin/make_windows.py index 30aa070d7..e4a61fb43 100755 --- a/harpy/bin/make_windows.py +++ b/harpy/bin/make_windows.py @@ -1,28 +1,41 @@ #! /usr/bin/env python """Create a BED file of fixed intervals from a fasta or fai file""" -import argparse +import os import sys +import gzip +import argparse parser = argparse.ArgumentParser( prog='make_windows.py', - description='Create a BED file of fixed intervals from a fasta or fai file (generated with samtools faidx). Nearly identical to bedtools makewindows, except the intervals are nonoverlapping.' + description='Create a BED file of fixed intervals from a fasta or fai file (generated with samtools faidx). 
Nearly identical to bedtools makewindows, except the intervals are nonoverlapping.',
+    usage = "make_windows.py -w <windowsize> -m <0,1> input.fasta > output.bed",
     )
-parser.add_argument("-i", dest = "infile", required = True, type=str, metavar = "", help="input fasta or fasta.fai file")
-parser.add_argument("-o", dest = "outfile", required = True, type=str, metavar = "", help="output BED file name")
-parser.add_argument("-w", dest = "window", type=int, metavar = "", default = 10000, help="interval size (default: %(default)s)")
-parser.add_argument("-m", dest = "mode", type=int, metavar = "0 or 1 based", default = 1, help="0 or 1 based intervals (default: %(default)s)")
+parser.add_argument("input", type=str, help="input fasta or fasta.fai file")
+parser.add_argument("-w", "--window", type=int, default = 10000, help="interval size (default: %(default)s)")
+parser.add_argument("-m", "--mode", type=int, default = 1, choices = [0,1], help="0 or 1 based intervals (default: %(default)s)")
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
 args = parser.parse_args()
-testname = args.infile.lower()
-outbed = open(args.outfile, "w", encoding="utf-8")
+if not os.path.exists(args.input):
+    parser.error(f"{args.input} was not found")
 
-def readinput(infile, filestream):
-    """automatically read as compressed or not"""
-    if infile.endswith("gz"):
-        return filestream.readline().decode()
-    return filestream.readline()
+if args.window < 1:
+    parser.error("window size cannot be less than 1")
 
-def makewindows(_contig, _c_len, windowsize, outfile):
+testname = args.input.lower()
+
+def is_gzip(file_path):
+    """helper function to determine if a file is gzipped"""
+    try:
+        with gzip.open(file_path, 'rt') as f:
+            f.read(10)
+        return True
+    except gzip.BadGzipFile:
+        return False
+
+def makewindows(_contig, _c_len, windowsize):
     """create a file of the specified windows"""
     start = args.mode
     end = min(_c_len, windowsize)
@@ -35,32 +48,26 @@ def makewindows(_contig, _c_len, windowsize, outfile):
         starts.append(start)
     # write to output file
     for (startpos, endpos) in zip (starts,ends):
-        outfile.write(f"{_contig}\t{startpos}\t{endpos}\n")
+        sys.stdout.write(f"{_contig}\t{startpos}\t{endpos}\n")
 
 if testname.endswith("fai"):
-    with open(args.infile, "r", encoding="utf-8") as fai:
+    with open(args.input, "r", encoding="utf-8") as fai:
         while True:
-            # Get next line from file
             line = fai.readline()
-            # if line is empty
-            # end of file is reached
            if not line:
                 break
-            # split the line by tabs
             lsplit = line.split("\t")
             contig = lsplit[0]
             c_len = int(lsplit[1])
             c_len += args.mode
-            makewindows(contig, c_len, args.window, outbed)
-    outbed.close()
+            makewindows(contig, c_len, args.window)
 
-elif testname.endswith("fasta") or testname.endswith("fa") or testname.endswith("gz"):
-    if testname.endswith("gz"):
-        import gzip
-        fopen = gzip.open(args.infile, "r")
+elif testname.endswith("fasta") or testname.endswith("fa") or testname.endswith("fasta.gz") or testname.endswith("fa.gz"):
+    if is_gzip(args.input):
+        fopen = gzip.open(args.input, "rt")
     else:
-        fopen = open(args.infile, "r", encoding="utf-8")
-    line = readinput(testname, fopen)
+        fopen = open(args.input, "r", encoding="utf-8")
+    line = fopen.readline()
     while True:
         C_LEN=0
         if not line:
@@ -71,7 +78,7 @@
             # keep reading until hitting another ">"
             HEADER = False
             while not HEADER:
-                line = readinput(testname, fopen)
+                line = fopen.readline()
                 if not line:
                     break
                 line = line.rstrip("\n")
@@ -82,9 +89,7 @@ def 
makewindows(_contig, _c_len, windowsize, outfile): C_LEN += len(line)-1 HEADER = False C_LEN += args.mode - makewindows(contig, C_LEN, args.window, outbed) - outbed.close() + makewindows(contig, C_LEN, args.window) fopen.close() else: - print("Input file not recognized as ending in one of .fai, .fasta, .fa, or .gz (case insensitive).", file = sys.stderr) - sys.exit(1) + parser.error(f"{args.input} is not recognized as ending in one of .fai, .fasta[.gz], .fa[.gz] (case insensitive).\n") diff --git a/harpy/bin/molecule_coverage.py b/harpy/bin/molecule_coverage.py index b323d1e5d..4283cbe65 100755 --- a/harpy/bin/molecule_coverage.py +++ b/harpy/bin/molecule_coverage.py @@ -1,6 +1,7 @@ #! /usr/bin/env python """Using the ranges of BX molecule start/stop positions, calculates "molecular coverage" across the genome.""" +import os import sys import gzip import argparse @@ -19,11 +20,11 @@ FASTA fai index (like the kind created using samtools faidx) to know the actual sizes of the contigs. """, - usage = "molecule_coverage.py -f genome.fasta.fai STATSFILE > output.cov", + usage = "molecule_coverage.py -f genome.fasta.fai statsfile > output.cov", exit_on_error = False ) -parser.add_argument('-f', '--fai', type = str, help = "FASTA index (.fai) file of genome used for alignment") +parser.add_argument('-f', '--fai', required = True, type = str, help = "FASTA index (.fai) file of genome used for alignment") parser.add_argument('statsfile', help = "stats file produced by harpy via bx_stats.py") if len(sys.argv) == 1: @@ -31,6 +32,12 @@ sys.exit(1) args = parser.parse_args() +err = [] +for i in [args.statsfile, args.fai]: + if not os.path.exists(i): + err.append(i) +if err: + parser.error("Input files were not found:\n" + ", ".join(err)) # main program contigs = {} @@ -69,7 +76,7 @@ def write_coverage(counter_obj, contigname): if val.strip() == "end": IDX_END = idx if IDX_CONTIG is None or IDX_START is None or IDX_END is None: - print("Error: Required columns 'contig', 'start', or 'end' not found in header", file=sys.stderr) + sys.stderr.write("Error: Required columns 'contig', 'start', or 'end' not found in header\n") sys.exit(1) while True: line = statsfile.readline() diff --git a/harpy/bin/parse_phaseblocks.py b/harpy/bin/parse_phaseblocks.py index 1453c718f..ce681478b 100755 --- a/harpy/bin/parse_phaseblocks.py +++ b/harpy/bin/parse_phaseblocks.py @@ -1,26 +1,32 @@ #! 
/usr/bin/env python """Parse a phase block file from HapCut2 to pull out summary information""" -import argparse + +import os import sys +import argparse parser = argparse.ArgumentParser( prog='parse_phaseblocks.py', - description='Parse a phase block file from HapCut2 to pull out summary information' + description='Parse a phase block file from HapCut2 to pull out summary information', + usage = "parse_phaseblocks.py input > output.txt" ) -parser.add_argument("-i", dest = "infile", required = True, type=str, metavar = "", help="input HapCut2 phase blocks file") +parser.add_argument("input", type=str, help="input HapCut2 phase blocks file") +if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + args = parser.parse_args() +if not os.path.exists(args.input): + parser.error(f"{args.input} was not found\n") -samplename = args.infile.replace(".blocks", "") +samplename = args.input.replace(".blocks", "") -with open(args.infile, "r", encoding="utf-8") as blocks: +with open(args.input, "r", encoding="utf-8") as blocks: FIRST_LINE = True while True: - # Get next line from file line = blocks.readline() - # if line is empty, end of file is reached if not line: break - lsplit = line.split() if lsplit[0] == "BLOCK:": n_snp = int(lsplit[6]) @@ -31,6 +37,6 @@ pos_start = int(lsplit[4]) contig = lsplit[3] FIRST_LINE = False - print(f"{samplename}\t{contig}\t{n_snp}\t{pos_start}\t{len_block}", file = sys.stdout) + sys.stdout.write(f"{samplename}\t{contig}\t{n_snp}\t{pos_start}\t{len_block}\n") else: continue diff --git a/harpy/bin/rename_bam b/harpy/bin/rename_bam.py similarity index 77% rename from harpy/bin/rename_bam rename to harpy/bin/rename_bam.py index ea5936b79..4b90fb305 100755 --- a/harpy/bin/rename_bam +++ b/harpy/bin/rename_bam.py @@ -6,10 +6,11 @@ parser = argparse.ArgumentParser( prog='rename_bam.py', - description='Rename a sam/bam file and modify the @RG tag of the alignment file to reflect the change for both ID and SM. This process creates a new file \'newname.bam\' and you may use -d to delete the original file. Requires samtools.' + description='Rename a sam/bam file and modify the @RG tag of the alignment file to reflect the change for both ID and SM. This process creates a new file \'newname.bam\' and you may use -d to delete the original file. 
Requires samtools.', + usage = "rename_bam.py [-d] new_name input.bam" ) -parser.add_argument("input", type=str, metavar = "input.bam", help="input bam or sam file") -parser.add_argument("name", type=str, metavar = "new_name", help="new sample name") +parser.add_argument("name", type=str, help="new sample name") +parser.add_argument("input", type=str, help="input bam or sam file") parser.add_argument("-d", "--delete", dest = "delete", action='store_true', help="delete the original file") if len(sys.argv) == 1: @@ -19,7 +20,7 @@ args = parser.parse_args() if not os.path.exists(args.input): - parser.error(f"The file {args.input} does not exist :(") + parser.error(f"{args.input} was not found") outdir = os.path.dirname(args.input) diff --git a/harpy/reports/align_stats.Rmd b/harpy/reports/align_stats.Rmd index d63fd478c..e56b680b9 100644 --- a/harpy/reports/align_stats.Rmd +++ b/harpy/reports/align_stats.Rmd @@ -585,7 +585,8 @@ plot_contigs <- snakemake@params$contigs contigs <- group_by(tb, contig) %>% summarize(size = max(position)) %>% arrange(desc(size)) -if (plot_contigs == "default"){ + +if (all(plot_contigs == "default")){ # limit the data to only the 30 largest contigs if (nrow(contigs) > 30){ .contigs <- contigs[1:30, ] diff --git a/harpy/reports/hapcut.Rmd b/harpy/reports/hapcut.Rmd index 7f4caacd9..15a4129a7 100644 --- a/harpy/reports/hapcut.Rmd +++ b/harpy/reports/hapcut.Rmd @@ -189,7 +189,7 @@ dropdown_buttons <- function(CONTIGS, CUTOFF = 0){ ```{r plotting_contigs_setup} plotting_contigs <- snakemake@params$contigs -if (plotting_contigs == "default"){ +if (all(plotting_contigs == "default")){ .contigs <- group_by(df, contig) %>% summarize(size = max(pos_end)) %>% arrange(desc(size)) diff --git a/harpy/reports/leviathan.Rmd b/harpy/reports/leviathan.Rmd index 7c885f5ae..236165ada 100644 --- a/harpy/reports/leviathan.Rmd +++ b/harpy/reports/leviathan.Rmd @@ -208,7 +208,7 @@ fa.sizes <- read.table(fai, header = F) %>% arrange(desc(2)) colnames(fa.sizes) <- c("contig", "size") plot_contigs <- snakemake@params$contigs -if (plot_contigs == "default"){ +if (all(plot_contigs == "default")){ # make sure that the contigs with SV are the ones being plotted, not the others fa.sizes <- fa.sizes[fa.sizes$contig %in% unique(sv$contig), 1:2] # limit the data to only the 30 of the largest present contigs diff --git a/harpy/reports/leviathan_pop.Rmd b/harpy/reports/leviathan_pop.Rmd index d099aab01..895f94bdc 100644 --- a/harpy/reports/leviathan_pop.Rmd +++ b/harpy/reports/leviathan_pop.Rmd @@ -221,7 +221,7 @@ zooming on the plot. 
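The all() wrappers added across these reports all guard the same pitfall: snakemake@params$contigs may be a character vector of user-supplied contig names, and in R a vectorized comparison inside if() fails. A minimal sketch of the failure mode (values are illustrative):

    # plot_contigs <- c("chr1", "chr2")
    # plot_contigs == "default"            -> c(FALSE, FALSE), a length-2 logical
    # if (plot_contigs == "default") ...   -> error in R >= 4.2 ("condition has length > 1")
    # if (all(plot_contigs == "default"))  -> a single FALSE, branches correctly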
fa.sizes <- read.table(faidx, header = F)[,1:2] %>% arrange(desc(2)) colnames(fa.sizes) <- c("contig", "size") plot_contigs <- snakemake@params$contigs -if (plot_contigs == "default"){ +if (all(plot_contigs == "default")){ # make sure that the contigs with SV are the ones being plotted, not the others fa.sizes <- fa.sizes[fa.sizes$contig %in% unique(sv$contig), ] # limit the data to only the 30 of the largest present contigs diff --git a/harpy/reports/naibr.Rmd b/harpy/reports/naibr.Rmd index 919da4ff5..22ebeb11f 100644 --- a/harpy/reports/naibr.Rmd +++ b/harpy/reports/naibr.Rmd @@ -291,7 +291,7 @@ ggplot(aes(x, y, fill = colour))+ fa.sizes <- read.table(fai, header = F) %>% arrange(desc(2)) colnames(fa.sizes) <- c("contig", "size") plot_contigs <- snakemake@params$contigs -if (plot_contigs == "default"){ +if (all(plot_contigs == "default")){ # make sure that the contigs with SV are the ones being plotted, not the others fa.sizes <- fa.sizes[fa.sizes$contig %in% unique(sv$Chr1), 1:2] # limit the data to only the 30 of the largest present contigs diff --git a/harpy/reports/naibr_pop.Rmd b/harpy/reports/naibr_pop.Rmd index c14366679..a92bae2c1 100644 --- a/harpy/reports/naibr_pop.Rmd +++ b/harpy/reports/naibr_pop.Rmd @@ -293,7 +293,7 @@ zooming on the plot. fa.sizes <- read.table(fai, header = F) %>% arrange(desc(2)) colnames(fa.sizes) <- c("contig", "size") plot_contigs <- snakemake@params$contigs -if (plot_contigs == "default"){ +if (all(plot_contigs == "default")){ # make sure that the contigs with SV are the ones being plotted, not the others fa.sizes <- fa.sizes[fa.sizes$contig %in% unique(sv$Chr1), 1:2] # limit the data to only the 30 of the largest present contigs diff --git a/harpy/snakefiles/phase.smk b/harpy/snakefiles/phase.smk index 7c1bcf540..3682e4545 100644 --- a/harpy/snakefiles/phase.smk +++ b/harpy/snakefiles/phase.smk @@ -247,7 +247,7 @@ rule summarize_blocks: """ echo -e "sample\\tcontig\\tn_snp\\tpos_start\\tblock_length" > {params} for i in {input}; do - parse_phaseblocks.py -i $i >> {params} + parse_phaseblocks.py $i >> {params} done gzip {params} """ diff --git a/harpy/snakefiles/sv_leviathan_pop.smk b/harpy/snakefiles/sv_leviathan_pop.smk index abe65382d..3bb0af094 100644 --- a/harpy/snakefiles/sv_leviathan_pop.smk +++ b/harpy/snakefiles/sv_leviathan_pop.smk @@ -81,7 +81,7 @@ rule concat_groups: container: None shell: - "concatenate_bam.py -o {output} -b {input.bamlist} 2> {log}" + "concatenate_bam.py --bx -o {output} -b {input.bamlist} 2> {log}" rule sort_groups: input: @@ -276,7 +276,7 @@ rule workflow_summary: summary = ["The harpy sv leviathan workflow ran using these parameters:"] summary.append(f"The provided genome: {bn}") concat = "The alignments were concatenated using:\n" - concat += "\tconcatenate_bam.py -o groupname.bam -b samples.list" + concat += "\tconcatenate_bam.py --bx -o groupname.bam -b samples.list" summary.append(concat) bc_idx = "The barcodes were indexed using:\n" bc_idx += "LRez index bam -p -b INPUT" diff --git a/harpy/snp.py b/harpy/snp.py index c7830b786..fe8672df2 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -99,14 +99,13 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e # setup regions checks regtype = validate_regions(regions, genome) + region = Path(f"{workflowdir}/positions.bed").resolve() if regtype == "windows": - region = Path(f"{workflowdir}/positions.bed").resolve() - os.system(f"make_windows.py -m 1 -i {genome} -o {region} -w {regions}") - elif regtype == "region": - region = regions - else: 
- region = Path(f"{workflowdir}/positions.bed").resolve() + os.system(f"make_windows.py -m 1 -w {regions} {genome} > {region}") + elif regtype == "file": os.system(f"cp -f {regions} {region}") + else: + region = regions fetch_rule(workflowdir, "snp_mpileup.smk") fetch_report(workflowdir, "bcftools_stats.Rmd") @@ -204,14 +203,13 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, # setup regions checks regtype = validate_regions(regions, genome) + region = Path(f"{workflowdir}/positions.bed").resolve().as_posix() if regtype == "windows": - region = Path(f"{workflowdir}/positions.bed").resolve().as_posix() - os.system(f"make_windows.py -m 0 -i {genome} -o {region} -w {regions}") - elif regtype == "region": - region = regions - else: - region = Path(f"{workflowdir}/positions.bed").resolve().as_posix() + os.system(f"make_windows.py -m 1 -w {regions} {genome} > {region}") + elif regtype == "file": os.system(f"cp -f {regions} {region}") + else: + region = regions fetch_rule(workflowdir, "snp_freebayes.smk") fetch_report(workflowdir, "bcftools_stats.Rmd")