Skip to content

Commit

Permalink
Merge pull request #164 from pdimens/haplotag_lrsims
Browse files Browse the repository at this point in the history
fix param call
  • Loading branch information
pdimens authored Nov 19, 2024
2 parents a86e607 + d20ee09 commit b56bf1c
Show file tree
Hide file tree
Showing 13 changed files with 772 additions and 1,001,358 deletions.
2 changes: 1 addition & 1 deletion .github/filters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ simreads: &simreads
- 'extractReads.cpp'
- 'harpy/bin/inline_to_haplotag.py'
- 'harpy/bin/haplotag_barcodes.py'
- 'harpy/scripts/LRSIM_harpy.pl'
- 'harpy/scripts/HaploSim.pl'
assembly: &assembly
- *common
- *container
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ jobs:
- name: simulate linked reads
shell: micromamba-shell {0}
run: |
gunzip test/haplotag.bc.gz
haplotag_barcodes.py -n 14000000 > test/haplotag.bc
harpy simulate linkedreads --quiet -t 4 -n 2 -b test/haplotag.bc -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz
assembly:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ harpy.egg-info/
__pycache__/
.Benchmark/
extractReads
haplotag.bc
_Inline
205 changes: 89 additions & 116 deletions harpy/_validations.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
if contigs:
fasta_contig_match(contigs, genome)
if barcode_list:
validate_barcodefile(barcode_list)
validate_barcodefile(barcode_list, False, quiet)
fetch_rule(workflowdir, "align_ema.smk")
fetch_report(workflowdir, "align_stats.Rmd")
fetch_report(workflowdir, "align_bxstats.Rmd")
Expand Down
19 changes: 13 additions & 6 deletions harpy/bin/haplotag_barcodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,26 @@

parser = argparse.ArgumentParser(
prog = 'haplotag_barcodes.py',
description ="Generates a text file listing the haplotagging ACBD barcodes",
usage = "haplotag_barcodes.py > barcodes.txt"
description ="Prints haplotag linked-read barcodes to stdout",
usage = "haplotag_barcodes.py [-n] > barcodes.txt"
)

parser.add_argument("-n", "--number", default=96**4, type = int, help = "How many barcodes to print (min: 1, max: 84934656)")
args = parser.parse_args()

if args.number < 1 or args.number > 96**4:
parser.error("--number must be between 1 and 96^4 (84934656)")
# subtract 1 b/c of python 0-indexing
args.number -= 1

BX = {
"A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"],
"C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"],
"B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"],
"D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"]
}

bc_generator = product(BX["A"], BX["C"], BX["B"], BX["D"])
for BC in bc_generator:
sys.stdout.write("".join(BC) + "\n")
for i,BC in enumerate(bc_generator):
sys.stdout.write("".join(BC) + "\n")
if i >= args.number:
break
172 changes: 101 additions & 71 deletions harpy/bin/inline_to_haplotag.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,18 @@
import os
import sys
import gzip
import sqlite3
import argparse
from itertools import zip_longest, product
from itertools import zip_longest

parser = argparse.ArgumentParser(
prog = 'inline_to_haplotag.py',
description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z).',
usage = "inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix>",
description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z). Barcodes must all be the same length.',
usage = f"inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix>",
exit_on_error = False
)

parser.add_argument("-f", "--forward", required = True, type = str, help = "Forward reads of paired-end FASTQ file pair (gzipped)")
parser.add_argument("-r", "--reverse", required = True, type = str, help = "Reverse reads of paired-end FASTQ file pair (gzipped)")
parser.add_argument("-l", "--length", required = True, type = str, help = "Length of the barcodes (all must be one length)")
parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile files (e.g. <prefix>.R1.fq.gz)")
parser.add_argument("-b", "--barcodes", required = True, type=str, help="File listing the linked-read barcodes to convert to haplotag format, one barcode per line")
if len(sys.argv) == 1:
Expand All @@ -29,90 +28,121 @@
if err:
parser.error("Some input files were not found on the system:\n" + ", ".join(err))

def iter_fastq_records(file_handle):
"""Iterate over FASTQ records in a file.
file_handle: Opened gzip file handle
Yields: FASTQ record [header, seq, '+', qual]
Raises ValueError If file is not in FASTQ format
"""
record = []
for line in file_handle:
line = line.decode().rstrip("\n")
record.append(line)
if len(record) == 4:
# format sanity check
if not (record[0].startswith("@") and record[2] == "+"):
raise ValueError("Invalid FASTQ format")
yield record
record = []
if record:
raise ValueError("Incomplete FASTQ record at end of file")
def valid_record(fq_rec, FR):
"""fastq format sanity check"""
if not (fq_rec[0].startswith("@") and fq_rec[2] == "+"):
raise ValueError(f"Invalid FASTQ format for {FR} reads")

def insert_key_value(conn, key, value):
"""insert a key-value pair into sqlite database"""
cursor = conn.cursor()
cursor.execute('''
INSERT OR REPLACE INTO kv_store (key, value) VALUES (?, ?)
''', (key, value)) # Use parameterized queries to avoid SQL injection
conn.commit()

def validate_barcode(barcode):
"""Validate barcode format (A,C,G,T)."""
if not set(barcode).issubset({'A','C','G','T'}):
raise ValueError(f"Invalid barcode format: {barcode}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.")
def get_value_by_key(conn, key):
"""retrieve a value by key from sqlite database"""
cursor = conn.cursor()
cursor.execute('''
SELECT value FROM kv_store WHERE key = ?
''', (key,))
result = cursor.fetchone() # Fetch one row
if result:
# Return the value (first column of the result)
return result[0]
else:
# Return invalid ACBD haplotag if the key does not exist
return "A00C00B00D00"

def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc, bc_len):
def process_record(fw_entry, rv_entry, barcode_database, bc_len):
"""convert the barcode to haplotag"""
# [0] = header, [1] = seq, [2] = +, [3] = qual
# search for a valid barcode at all possible barcode lengths
if fw_entry:
valid_record(fw_entry, "forward")
bc_inline = fw_entry[1][:bc_len]
bc_hap = barcode_dict.get(bc_inline, "A00C00B00D00")
# the default barcode entry is None, meaning it hasnt been assigned a haplotag equivalent yet
if not bc_hap:
bc_hap = "".join(next(haplotag_bc))
barcode_dict[bc_inline] = bc_hap
_new_fw = fw_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}\n"
_new_fw += fw_entry[1][bc_len:] + "\n"
_new_fw += fw_entry[2] + "\n"
_new_fw += fw_entry[3][bc_len:] + "\n"
bc_hap = get_value_by_key(barcode_database, bc_inline)
fw_entry[0] = fw_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}"
fw_entry[1] = fw_entry[1][bc_len:]
fw_entry[3] = fw_entry[3][bc_len:]
_new_fw = "\n".join(fw_entry) + "\n"
if rv_entry:
_new_rv = rv_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}\n"
_new_rv += rv_entry[1] + "\n"
_new_rv += rv_entry[2] + "\n"
_new_rv += rv_entry[3] + "\n"
valid_record(rv_entry, "reverse")
rv_entry[0] = rv_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}"
_new_rv = "\n".join(rv_entry) + "\n"
else:
_new_rv = None
return _new_fw, _new_rv
else:
_new_fw = None
# no forward read, therefor no barcode to search for
_new_rv = rv_entry[0].split()[0] + f"\tBX:Z:A00C00B00D00\n"
_new_rv += rv_entry[1] + "\n"
_new_rv += rv_entry[2] + "\n"
_new_rv += rv_entry[3] + "\n"
return None, _new_rv
if rv_entry:
valid_record(rv_entry, "reverse")
rv_entry[0] = rv_entry[0].split()[0] + "\tBX:Z:A00C00B00D00"
_new_rv = "\n".join(rv_entry) + "\n"
else:
_new_rv = None
return _new_fw, _new_rv

bc_range = [f"{i}".zfill(2) for i in range(1,97)]
bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)
# Connect to an in-memory SQLite database
bc_db = sqlite3.connect(':memory:')
# Create the table to store key-value pairs
bc_db.cursor().execute('''
CREATE TABLE kv_store (
key TEXT PRIMARY KEY,
value TEXT
)
''')
bc_db.commit()

nucleotides = {'A','C','G','T'}
lengths = set()

bc_dict = {}
# read in barcodes
opener = gzip.open if args.barcodes.lower().endswith('.gz') else open
mode = 'rt' if args.barcodes.lower().endswith('.gz') else 'r'
with opener(args.barcodes, mode) as bc_file:
for line in bc_file:
barcode = line.rstrip().split()[0]
validate_barcode(barcode)
bc_dict[barcode] = None
try:
ATCG,ACBD = line.rstrip().split()
except ValueError:
sys.stderr.write(f"Invalid barcode entry: {line.rstrip()}\nExpected two entries: a nucleotide barcode and ACBD format barcode with a space/tab separatng them, e.g. ATATCAGA A01C22B13D93")
sys.exit(1)
if not set(ATCG).issubset(nucleotides):
sys.stderr.write(f"Invalid barcode format: {ATCG}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.\n")
sys.exit(1)

insert_key_value(bc_db, ATCG, ACBD)
#bc_dict[ATCG] = ACBD
lengths.add(len(ATCG))
if len(lengths) > 1:
sys.stderr.write("Can only search sequences for barcodes of a single length, but multiple barcode legnths detected: " + ",".join([str(i) for i in lengths]))
else:
bc_len = lengths.pop()

# simultaneously iterate the forward and reverse fastq files
fw_out = gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6)
rv_out = gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6)

with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i:
for fw_record, rv_record in zip_longest(iter_fastq_records(fw_i), iter_fastq_records(rv_i)):
new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator, args.length)
if new_fw:
fw_out.write(new_fw.encode("utf-8"))
if new_rv:
rv_out.write(new_rv.encode("utf-8"))

fw_out.close()
rv_out.close()
with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i,\
gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6) as fw_out,\
gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6) as rv_out:
record_F = []
record_R = []
for fw_record, rv_record in zip_longest(fw_i, rv_i):
try:
record_F.append(fw_record.decode().rstrip("\n"))
except AttributeError:
# if the file ends before the other one
pass
try:
record_R.append(rv_record.decode().rstrip("\n"))
except AttributeError:
pass
# sanity checks
if len(record_F) == 4 or len(record_R) == 4:
new_fw, new_rv = process_record(record_F, record_R, bc_db, bc_len)
if new_fw:
fw_out.write(new_fw.encode("utf-8"))
record_F = []
if new_rv:
rv_out.write(new_rv.encode("utf-8"))
record_R = []

with open(f"{args.prefix}.barcodes", "w") as bx_out:
for i,j in bc_dict.items():
if j:
bx_out.write(f"{i}\t{j}\n")
bc_db.cursor().close()
Loading

0 comments on commit b56bf1c

Please sign in to comment.