Merge pull request #161 from pdimens/secondary_deconvolve

bugfix - divide-by-zero error
pdimens · Nov 8, 2024 · ac9a13c · ac9a13c
2 parents ec5b2b2 + 0bdd82a
commit ac9a13c
Show file tree

Hide file tree

Showing 28 changed files with 434 additions and 121 deletions.
diff --git a/.github/filters.yml b/.github/filters.yml
@@ -57,6 +57,7 @@ bwa: &bwa
   - 'harpy/bin/count_bx.py'
   - 'harpy/bin/assign_mi.py'
   - 'harpy/bin/make_windows.py'
+  - 'harpy/bin/depth_windows.py'
   - 'test/fastq/**'
 ema: &ema
   - *common
@@ -68,6 +69,7 @@ ema: &ema
   - 'harpy/bin/bx_stats.py'
   - 'harpy/bin/count_bx.py'
   - 'harpy/bin/make_windows.py'
+  - 'harpy/bin/depth_windows.py'
   - 'test/fastq/**'
 strobealign: &strobealign
   - *common
@@ -80,6 +82,7 @@ strobealign: &strobealign
   - 'harpy/bin/bx_stats.py'
   - 'harpy/bin/count_bx.py'
   - 'harpy/bin/make_windows.py'
+  - 'harpy/bin/depth_windows.py'
   - 'test/fastq/**'
 mpileup: &mpileup
   - *common

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -553,7 +553,7 @@ jobs:
         shell: micromamba-shell {0}
         if: always()
         run: |
-          cp test/bam/sample1.bam test/bam/pineapple.bam && rename_bam -d test/bam/pineapple.bam pineapple1
+          cp test/bam/sample1.bam test/bam/pineapple.bam && rename_bam.py -d pineapple1 test/bam/pineapple.bam
           harpy phase --quiet --vcf-samples -o phasevcf --vcf test/vcf/test.bcf  test/bam
 
   leviathan:

diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py
@@ -35,7 +35,8 @@ def convert(self, value, param, ctx):
             for i in parts:
                 if int(i) % 2 == 0 or int(i) > 128:
                     raise ValueError
-            return [int(i) for i in parts]
+            # make it a set as a failsafe against duplicates
+            return list(set(int(i) for i in parts))
         except ValueError:
             self.fail(f"{value} is not 'auto' or odd integers <128 separated by a comma.", param, ctx)
 
@@ -50,7 +51,8 @@ def convert(self, value, param, ctx):
             with open(value, "r") as cont_in:
                 return [i.strip() for i in cont_in.readlines()]
         else:
-            return [i.strip() for i in value.split(',')]
+            # make it a set as a failsafe against duplicates
+            return list(set(i.strip() for i in value.split(',')))
 
 class InputFile(click.ParamType):
     """A class for a click type that verifies that a file exists and that it has an expected extension"""
@@ -64,7 +66,7 @@ def convert(self, value, param, ctx):
             "vcf": ["vcf", "bcf", "vcf.gz"],
             "gff": [".gff",".gff3"]
         }
-        if self.filetype not in filedict.keys():
+        if self.filetype not in filedict:
             self.fail(f"Extension validation for {self.filetype} is not yet implemented. This error should only appear during development; if you are a user and seeing this, please post an issue on GitHub: https://github.com/pdimens/harpy/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml")
         if not os.path.exists(value):
             self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx)
@@ -113,5 +115,4 @@ def convert(self, value, param, ctx):
         elif not os.access(f"{value}/config.yaml", os.R_OK):
             self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx)
         return value
-
-### program-specific extra-params types
+
diff --git a/harpy/_validations.py b/harpy/_validations.py
@@ -167,7 +167,7 @@ def check_RG(bamfile):
         samplename = Path(bamfile).stem
         samview = subprocess.run(f"samtools samples {bamfile}".split(), stdout = subprocess.PIPE).stdout.decode('utf-8').split()
         if samplename != samview[0]:
-            return os.path.basename(i), samview[0]
+            return os.path.basename(bamfile), samview[0]
 
     with harpy_progressbar(quiet) as progress:
         task_progress = progress.add_task("[green]Checking RG tags...", total=len(bamlist))

diff --git a/harpy/bin/10xtoHaplotag.py b/harpy/bin/10xtoHaplotag.py
@@ -1,7 +1,8 @@
 #! /usr/bin/env python
 """Convert 10X style barcodes into Haplotag style ones"""
-import gzip
+import os
 import sys
+import gzip
 import argparse
 from itertools import zip_longest, product
 
@@ -20,6 +21,12 @@
     parser.print_help(sys.stderr)
     sys.exit(1)
 args = parser.parse_args()
+err = []
+for i in [args.forward, args.reverse, args.barcodes]:
+    if not os.path.exists(i):
+        err.append(i)
+if err:
+    parser.error("Some input files were not found on the system:\n" + ", ".join(err))
 
 def process_record(fw_entry, rv_entry):
     """convert the 10X to haplotag"""

diff --git a/harpy/bin/assign_mi.py b/harpy/bin/assign_mi.py
@@ -29,6 +29,8 @@
     sys.exit(1)
 
 args = parser.parse_args()
+if not os.path.exists(args.input):
+    parser.error(f"{args.input} was not found")
 
 def write_validbx(bam, alnrecord, mol_id):
     '''
@@ -62,10 +64,9 @@ def write_invalidbx(bam, alnrecord):
     at the end and writes it to the output
     bam file. Keeps existing MI tag if present.
     '''
-    # will keeping an existing MI tag if present
-    # may create incorrect molecule associations by chance
+    # will not keep an existing MI tag if present
     # also remove DI because it's not necessary
-    tags = [j for j in alnrecord.get_tags() if j[0] not in ['DI', 'BX']]
+    tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
     _bx = alnrecord.get_tag("BX")
     # if hyphen is present, it's been deconvolved and shouldn't have been
     # and rm the hyphen part
@@ -84,11 +85,7 @@ def write_missingbx(bam, alnrecord):
     at the end and writes it to the output
     bam file. Removes existing MI tag, if exists.
     '''
-    # get all the tags except MI b/c it's being replaced (if exists)
-    # this won't write a new MI, but keeping an existing one
-    # may create incorrect molecule associations by chance
-    # also remove DI because it's not necessary
-    # removes BX... just in case. It's not supposed to be there to begin with
+     # removes MI and DI tags, writes new BX tag
     tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
     tags.append(("BX", "A00C00B00D00"))
     alnrecord.set_tags(tags)
@@ -105,7 +102,7 @@ def write_missingbx(bam, alnrecord):
 MI = 0
 
 if not os.path.exists(bam_input):
-    print(f"Error: {bam_input} not found", file = sys.stderr)
+    sys.stderr.write(f"Error: {bam_input} not found\n")
     sys.exit(1)
 
 if bam_input.lower().endswith(".bam"):

diff --git a/harpy/bin/bx_stats.py b/harpy/bin/bx_stats.py
@@ -1,5 +1,6 @@
 #! /usr/bin/env python
 """calculate linked-read metrics from barcode information"""
+import os
 import re
 import sys
 import gzip
@@ -29,6 +30,9 @@
     sys.exit(1)
 
 args = parser.parse_args()
+if not os.path.exists(args.input):
+    parser.error(f"{args.input} was not found")
+
 alnfile = pysam.AlignmentFile(args.input)
 outfile = gzip.open(args.output, "wb", 6)
 outfile.write(b"contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tinsert_len\tcoverage_bp\tcoverage_inserts\n")

diff --git a/harpy/bin/check_bam.py b/harpy/bin/check_bam.py
@@ -26,6 +26,8 @@
     sys.exit(1)
 
 args = parser.parse_args()
+if not os.path.exists(args.input):
+    parser.error(f"{args.input} was not found")
 
 bam_in = args.input
 if bam_in.lower().endswith(".bam"):
@@ -71,6 +73,5 @@
 
 alnfile.close()
 
-
 values = [str(i) for i in [os.path.basename(bam_in), NAME_MISMATCH, N_READS, NO_MI, NO_BX, BAD_BX, BX_NOT_LAST]]
-print("\t".join(values), file = sys.stdout)
+sys.stdout.write("\t".join(values) + "\n")
diff --git a/harpy/bin/check_fastq.py b/harpy/bin/check_fastq.py
@@ -25,6 +25,8 @@
     sys.exit(1)
 
 args = parser.parse_args()
+if not os.path.exists(args.input):
+    parser.error(f"{args.input} was not found")
 
 fq_in = args.input
 
@@ -61,4 +63,4 @@
             NO_BX += 1
 
 values = [str(i) for i in [os.path.basename(fq_in), N_READS, NO_BX, BAD_BX, BAD_SAM_SPEC, BX_NOT_LAST]]
-print("\t".join(values), file = sys.stdout)
+sys.stdout.write("\t".join(values) + "\n")
diff --git a/harpy/bin/concatenate_bam.py b/harpy/bin/concatenate_bam.py
@@ -4,6 +4,7 @@
 import sys
 from pathlib import Path
 import argparse
+from itertools import product
 import pysam
 
 parser = argparse.ArgumentParser(
@@ -12,27 +13,28 @@
     """
     Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample.
     This is a means of accomplishing the same as \'samtools cat\', except all MI (Molecule Identifier) tags are updated
-    so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide
-    all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option.
+    so individuals don't have overlapping MI tags (which would mess up all the linked-read info). You can either provide
+    all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option. Use the \'--bx\'
+    option to also rewrite BX tags such that they are unique between samples too.
     """,
-    usage = "concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam",
+    usage = "concatenate_bam.py [--bx] -o output.bam file_1.bam..file_N.bam",
     exit_on_error = False
     )
 parser.add_argument("alignments", nargs='*', help = "SAM or BAM files")
 parser.add_argument("-o", "--out", required = True, type = str, help = "Name of BAM output file")
 parser.add_argument("-b", "--bamlist", required = False, type = str, help = "List of SAM or BAM files to concatenate")
+parser.add_argument("--bx", dest = "bx_unique", action='store_true', help="Also rewrite BX tags for uniqueness")
+
 if len(sys.argv) == 1:
     parser.print_help(sys.stderr)
     sys.exit(1)
 args = parser.parse_args()
 
 if (args.alignments and args.bamlist):
-    print("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):", file = sys.stderr)
-    print("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n", file = sys.stderr)
-
-    print("Alternatively, provide the files after \'-o output.bam\':", file = sys.stderr)
-    print("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam", file = sys.stderr)
-
+    sys.stderr.write("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):\n")
+    sys.stderr.write("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n\n")
+    sys.stderr.write("Alternatively, provide the files after \'-o output.bam\':\n",)
+    sys.stderr.write("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam\n")
     sys.exit(1)
 
 if args.bamlist:
@@ -45,6 +47,17 @@
     else:
         aln_list = args.alignments
 
+# validate files exist
+err = []
+for i in aln_list:
+    if not os.path.exists(i):
+        err.append(i)
+if err:
+    parser.error("Some input files were not found on the system:\n" + ", ".join(err))
+
+# Get the max number of unique haplotag barcodes
+haplotag_limit = 96**4
+
 # instantiate output file
 if aln_list[0].lower().endswith(".bam"):
     if not os.path.exists(f"{aln_list[0]}.bai"):
@@ -69,10 +82,18 @@
 header['RG'][0]['ID'] = Path(args.out).stem
 header['RG'][0]['SM'] = Path(args.out).stem
 
+# set up a generator for the BX tags if --bx was invoked
+if args.bx_unique:
+    bc_range = [f"{i}".zfill(2) for i in range(1,97)]
+    bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)
+
 with pysam.AlignmentFile(args.out, "wb", header = header) as bam_out:
     # current available unused MI tag
     MI_NEW = 1
-
+    if args.bx_unique:
+        BX_NEW = "".join(next(bc_generator))
+    else:
+        BX_NEW = None
     # iterate through the bam files
     for xam in aln_list:
         # create MI dict for this sample
@@ -90,14 +111,29 @@
                     mi = record.get_tag("MI")
                     # if previously converted for this sample, use that
                     if mi in MI_LOCAL:
-                        record.set_tag("MI", MI_LOCAL[mi])
+                        record.set_tag("MI", MI_LOCAL[mi][0])
+                        if args.bx_unique:
+                            record.set_tag("BX", MI_LOCAL[mi][1])
                     else:
                         record.set_tag("MI", MI_NEW)
+                        if args.bx_unique:
+                            record.set_tag("BX", BX_NEW)
                         # add to sample conversion dict
-                        MI_LOCAL[mi] = MI_NEW
+                        MI_LOCAL[mi] = [MI_NEW, BX_NEW]
                         # increment to next unique MI
                         MI_NEW += 1
-                except:
+                        if args.bx_unique:
+                            if MI_NEW > haplotag_limit:
+                                raise IndexError
+                            BX_NEW = "".join(next(bc_generator))
+                except IndexError:
+                    errtext = f"Error:\nNumber of unique molecules exceeds the number of possible unique haplotag barcodes (96^4 = {haplotag_limit}). "
+                    errtext += "Consider pooling fewer individuals per group.\n\nIf this concatenation was performed as part of the harpy sv leviathan workflow, "
+                    errtext += "there is a limitation in LEVIATHAN where it does not recognize hyphenated (deconvolved) linked-read barcodes, which necessitates using all possible unique standard haplotag barcodes. "
+                    errtext += "Consider using the 'sv naibr' workflow, which uses unique MI tags instead.\n"
+                    sys.stderr.write(errtext)
+                    sys.exit(1)
+                except KeyError:
                     pass
                 bam_out.write(record)
         if DELETE_INDEX:

diff --git a/harpy/bin/count_bx.py b/harpy/bin/count_bx.py
@@ -1,5 +1,6 @@
 #! /usr/bin/env python
 """parse a fastq file to count BX stats"""
+import os
 import re
 import sys
 import argparse
@@ -24,15 +25,14 @@
     sys.exit(1)
 
 args = parser.parse_args()
+if not os.path.exists(args.input):
+    parser.error(f"{args.input} was not found")
 
 N_READS = 0
 N_BX = 0
 N_VALID = 0
-# haplotag = re.compile("([A-Z]\d{2,}){3,}")
 haplotag = re.compile('A[0-9]{2}C[0-9]{2}B[0-9]{2}D[0-9]{2}')
-# invalid = re.compile('[A-Z]00')
 invalid = re.compile('[AaBbCcDd]00')
-# inv_dict = {}
 inv_dict = {
     "A" : 0,
     "B" : 0,
@@ -58,11 +58,11 @@
                     continue
                 N_VALID += 1
 
-print(f"totalReads\t{N_READS}", file = sys.stdout)
-print(f"bxTagCount\t{N_BX}", file = sys.stdout)
-print(f"bxValid\t{N_VALID}", file = sys.stdout)
-print(f"bxInvalid\t{N_BX - N_VALID}", file = sys.stdout)
-print("A00\t",str(inv_dict["A"]), file = sys.stdout)
-print("C00\t",str(inv_dict["C"]), file = sys.stdout)
-print("B00\t",str(inv_dict["B"]), file = sys.stdout)
-print("D00\t",str(inv_dict["D"]), file = sys.stdout)
+sys.stdout.write(f"totalReads\t{N_READS}\n")
+sys.stdout.write(f"bxTagCount\t{N_BX}\n")
+sys.stdout.write(f"bxValid\t{N_VALID}\n")
+sys.stdout.write(f"bxInvalid\t{N_BX - N_VALID}\n")
+sys.stdout.write(f"A00\t{inv_dict['A']}\n")
+sys.stdout.write(f"C00\t{inv_dict['C']}\n")
+sys.stdout.write(f"B00\t{inv_dict['B']}\n")
+sys.stdout.write(f"D00\t{inv_dict['D']}\n")