add --bx flag to concatenation
pdimens committed Nov 8, 2024
1 parent 876ba9c commit 0bdd82a
Showing 2 changed files with 39 additions and 9 deletions.
44 changes: 37 additions & 7 deletions harpy/bin/concatenate_bam.py
@@ -4,6 +4,7 @@
 import sys
 from pathlib import Path
 import argparse
+from itertools import product
 import pysam

 parser = argparse.ArgumentParser(
@@ -12,15 +13,18 @@
     """
     Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample.
     This is a means of accomplishing the same as \'samtools cat\', except all MI (Molecule Identifier) tags are updated
-    so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide
-    all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option.
+    so individuals don't have overlapping MI tags (which would mess up all the linked-read info). You can either provide
+    all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option. Use the \'--bx\'
+    option to also rewrite BX tags such that they are unique between samples too.
     """,
-    usage = "concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam",
+    usage = "concatenate_bam.py [--bx] -o output.bam file_1.bam..file_N.bam",
     exit_on_error = False
     )
 parser.add_argument("alignments", nargs='*', help = "SAM or BAM files")
 parser.add_argument("-o", "--out", required = True, type = str, help = "Name of BAM output file")
 parser.add_argument("-b", "--bamlist", required = False, type = str, help = "List of SAM or BAM files to concatenate")
+parser.add_argument("--bx", dest = "bx_unique", action='store_true', help="Also rewrite BX tags for uniqueness")
+
 if len(sys.argv) == 1:
     parser.print_help(sys.stderr)
     sys.exit(1)
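
Stepping outside the diff for a moment: below is a minimal, simplified sketch of the MI remapping the help text above describes, with toy dictionaries standing in for BAM records. The sample data, the merged list, and the print call are illustrative assumptions, not part of this commit.

```python
# Toy illustration: per-sample MI values are remapped onto one global counter,
# so molecules from different samples never end up sharing an MI tag.
MI_NEW = 1
merged = []
for sample in ({"read1": 1, "read2": 1, "read3": 2}, {"readA": 1, "readB": 2}):
    MI_LOCAL = {}  # per-sample map: original MI -> new, globally unique MI
    for read, mi in sample.items():
        if mi not in MI_LOCAL:
            MI_LOCAL[mi] = MI_NEW
            MI_NEW += 1
        merged.append((read, MI_LOCAL[mi]))
print(merged)
# [('read1', 1), ('read2', 1), ('read3', 2), ('readA', 3), ('readB', 4)]
```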
@@ -51,6 +55,9 @@
 if err:
     parser.error("Some input files were not found on the system:\n" + ", ".join(err))

+# Get the max number of unique haplotag barcodes
+haplotag_limit = 96**4
+
 # instantiate output file
 if aln_list[0].lower().endswith(".bam"):
     if not os.path.exists(f"{aln_list[0]}.bai"):
@@ -75,10 +82,18 @@
 header['RG'][0]['ID'] = Path(args.out).stem
 header['RG'][0]['SM'] = Path(args.out).stem

+# set up a generator for the BX tags if --bx was invoked
+if args.bx_unique:
+    bc_range = [f"{i}".zfill(2) for i in range(1,97)]
+    bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)
+
 with pysam.AlignmentFile(args.out, "wb", header = header) as bam_out:
     # current available unused MI tag
     MI_NEW = 1
-
+    if args.bx_unique:
+        BX_NEW = "".join(next(bc_generator))
+    else:
+        BX_NEW = None
     # iterate through the bam files
     for xam in aln_list:
         # create MI dict for this sample
@@ -96,14 +111,29 @@
                     mi = record.get_tag("MI")
                     # if previously converted for this sample, use that
                     if mi in MI_LOCAL:
-                        record.set_tag("MI", MI_LOCAL[mi])
+                        record.set_tag("MI", MI_LOCAL[mi][0])
+                        if args.bx_unique:
+                            record.set_tag("BX", MI_LOCAL[mi][1])
                     else:
                         record.set_tag("MI", MI_NEW)
+                        if args.bx_unique:
+                            record.set_tag("BX", BX_NEW)
                         # add to sample conversion dict
-                        MI_LOCAL[mi] = MI_NEW
+                        MI_LOCAL[mi] = [MI_NEW, BX_NEW]
                         # increment to next unique MI
                         MI_NEW += 1
-                except:
+                        if args.bx_unique:
+                            if MI_NEW > haplotag_limit:
+                                raise IndexError
+                            BX_NEW = "".join(next(bc_generator))
+                except IndexError:
+                    errtext = f"Error:\nNumber of unique molecules exceeds the number of possible unique haplotag barcodes (96^4 = {haplotag_limit}). "
+                    errtext += "Consider pooling fewer individuals per group.\n\nIf this concatenation was performed as part of the harpy sv leviathan workflow, "
+                    errtext += "there is a limitation in LEVIATHAN where it does not recognize hyphenated (deconvolved) linked-read barcodes, which necessitates using all possible unique standard haplotag barcodes. "
+                    errtext += "Consider using the 'sv naibr' workflow, which uses unique MI tags instead.\n"
+                    sys.stderr.write(errtext)
+                    sys.exit(1)
+                except KeyError:
                     pass
                 bam_out.write(record)
     if DELETE_INDEX:
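
For context on the 96^4 ceiling referenced in the error message above, a small sketch of what the --bx barcode generator yields; it reuses the same itertools.product construction added in this file, and the print calls are only for illustration.

```python
from itertools import product

# Same construction as in the diff above: haplotag barcodes have the form
# AxxCxxBxxDxx, where each xx segment runs from 01 to 96.
bc_range = [f"{i}".zfill(2) for i in range(1, 97)]
bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)

print("".join(next(bc_generator)))  # A01C01B01D01
print("".join(next(bc_generator)))  # A01C01B01D02 (the D segment advances first)
print(96**4)                        # 84934656 total combinations, i.e. haplotag_limit
```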
4 changes: 2 additions & 2 deletions harpy/snakefiles/sv_leviathan_pop.smk
@@ -81,7 +81,7 @@ rule concat_groups:
     container:
         None
     shell:
-        "concatenate_bam.py -o {output} -b {input.bamlist} 2> {log}"
+        "concatenate_bam.py --bx -o {output} -b {input.bamlist} 2> {log}"

 rule sort_groups:
     input:
@@ -276,7 +276,7 @@ rule workflow_summary:
         summary = ["The harpy sv leviathan workflow ran using these parameters:"]
         summary.append(f"The provided genome: {bn}")
         concat = "The alignments were concatenated using:\n"
-        concat += "\tconcatenate_bam.py -o groupname.bam -b samples.list"
+        concat += "\tconcatenate_bam.py --bx -o groupname.bam -b samples.list"
         summary.append(concat)
         bc_idx = "The barcodes were indexed using:\n"
         bc_idx += "LRez index bam -p -b INPUT"
