Merge pull request #164 from pdimens/haplotag_lrsims

fix param call
pdimens · Nov 19, 2024 · b56bf1c · b56bf1c
2 parents a86e607 + d20ee09
commit b56bf1c
Show file tree

Hide file tree

Showing 13 changed files with 772 additions and 1,001,358 deletions.
diff --git a/.github/filters.yml b/.github/filters.yml
@@ -155,7 +155,7 @@ simreads: &simreads
   - 'extractReads.cpp'
   - 'harpy/bin/inline_to_haplotag.py'
   - 'harpy/bin/haplotag_barcodes.py'
-  - 'harpy/scripts/LRSIM_harpy.pl'
+  - 'harpy/scripts/HaploSim.pl'
 assembly: &assembly
   - *common
   - *container

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -746,7 +746,7 @@ jobs:
       - name: simulate linked reads
         shell: micromamba-shell {0}
         run: |
-          gunzip test/haplotag.bc.gz
+          haplotag_barcodes.py -n 14000000 > test/haplotag.bc
           harpy simulate linkedreads --quiet -t 4 -n 2 -b test/haplotag.bc -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz
 
   assembly:

diff --git a/.gitignore b/.gitignore
@@ -26,4 +26,5 @@ harpy.egg-info/
 __pycache__/
 .Benchmark/
 extractReads
+haplotag.bc
 _Inline
diff --git a/harpy/_validations.py b/harpy/_validations.py
diff --git a/harpy/align.py b/harpy/align.py
@@ -210,7 +210,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
     if contigs:
         fasta_contig_match(contigs, genome)
     if barcode_list:
-        validate_barcodefile(barcode_list)
+        validate_barcodefile(barcode_list, False, quiet)
     fetch_rule(workflowdir, "align_ema.smk")
     fetch_report(workflowdir, "align_stats.Rmd")
     fetch_report(workflowdir, "align_bxstats.Rmd")

diff --git a/harpy/bin/haplotag_barcodes.py b/harpy/bin/haplotag_barcodes.py
@@ -6,19 +6,26 @@
 
 parser = argparse.ArgumentParser(
     prog = 'haplotag_barcodes.py',
-    description ="Generates a text file listing the haplotagging ACBD barcodes",
-    usage = "haplotag_barcodes.py > barcodes.txt"
+    description ="Prints haplotag linked-read barcodes to stdout",
+    usage = "haplotag_barcodes.py [-n] > barcodes.txt"
     )
-
+parser.add_argument("-n", "--number", default=96**4, type = int, help = "How many barcodes to print (min: 1, max: 84934656)")
 args = parser.parse_args()
 
+if args.number < 1 or args.number > 96**4:
+    parser.error("--number must be between 1 and 96^4 (84934656)")
+# subtract 1 b/c of python 0-indexing
+args.number -= 1
+
 BX = {
     "A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"],
     "C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"],
     "B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"],
     "D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"]
 }
-
+   
 bc_generator = product(BX["A"], BX["C"], BX["B"], BX["D"])
-for BC in bc_generator:
-    sys.stdout.write("".join(BC) + "\n")
+for i,BC in enumerate(bc_generator):
+    sys.stdout.write("".join(BC) + "\n")
+    if i >= args.number:
+        break
diff --git a/harpy/bin/inline_to_haplotag.py b/harpy/bin/inline_to_haplotag.py
@@ -3,19 +3,18 @@
 import os
 import sys
 import gzip
+import sqlite3
 import argparse
-from itertools import zip_longest, product
+from itertools import zip_longest
 
 parser = argparse.ArgumentParser(
     prog = 'inline_to_haplotag.py',
-    description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z).',
-    usage = "inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix>",
+    description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z). Barcodes must all be the same length.',
+    usage = f"inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix>",
     exit_on_error = False
     )
-
 parser.add_argument("-f", "--forward", required = True, type = str, help = "Forward reads of paired-end FASTQ file pair (gzipped)")
 parser.add_argument("-r", "--reverse", required = True, type = str, help = "Reverse reads of paired-end FASTQ file pair (gzipped)")
-parser.add_argument("-l", "--length", required = True, type = str, help = "Length of the barcodes (all must be one length)")
 parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile files (e.g. <prefix>.R1.fq.gz)")
 parser.add_argument("-b", "--barcodes", required = True, type=str, help="File listing the linked-read barcodes to convert to haplotag format, one barcode per line")
 if len(sys.argv) == 1:
@@ -29,90 +28,121 @@
 if err:
     parser.error("Some input files were not found on the system:\n" + ", ".join(err))
 
-def iter_fastq_records(file_handle):
-    """Iterate over FASTQ records in a file.
-    file_handle: Opened gzip file handle       
-    Yields: FASTQ record [header, seq, '+', qual]
-    Raises ValueError If file is not in FASTQ format
-    """
-    record = []
-    for line in file_handle:
-        line = line.decode().rstrip("\n")
-        record.append(line)
-        if len(record) == 4:
-            # format sanity check
-            if not (record[0].startswith("@") and record[2] == "+"):
-                raise ValueError("Invalid FASTQ format")
-            yield record
-            record = []
-    if record:
-        raise ValueError("Incomplete FASTQ record at end of file")
+def valid_record(fq_rec, FR):
+    """fastq format sanity check"""
+    if not (fq_rec[0].startswith("@") and fq_rec[2] == "+"):
+        raise ValueError(f"Invalid FASTQ format for {FR} reads")
+
+def insert_key_value(conn, key, value):
+    """insert a key-value pair into sqlite database"""
+    cursor = conn.cursor()
+    cursor.execute('''
+        INSERT OR REPLACE INTO kv_store (key, value) VALUES (?, ?)
+    ''', (key, value))  # Use parameterized queries to avoid SQL injection
+    conn.commit()
 
-def validate_barcode(barcode):
-    """Validate barcode format (A,C,G,T)."""
-    if not set(barcode).issubset({'A','C','G','T'}):
-        raise ValueError(f"Invalid barcode format: {barcode}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.")
+def get_value_by_key(conn, key):
+    """retrieve a value by key from sqlite database"""
+    cursor = conn.cursor()
+    cursor.execute('''
+        SELECT value FROM kv_store WHERE key = ?
+    ''', (key,))
+    result = cursor.fetchone()  # Fetch one row
+    if result:
+        # Return the value (first column of the result)
+        return result[0]  
+    else:
+        # Return invalid ACBD haplotag if the key does not exist
+        return "A00C00B00D00"  
 
-def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc, bc_len):
+def process_record(fw_entry, rv_entry, barcode_database, bc_len):
     """convert the barcode to haplotag"""
     # [0] = header, [1] = seq, [2] = +, [3] = qual
-    # search for a valid barcode at all possible barcode lengths
     if fw_entry:
+        valid_record(fw_entry, "forward")
         bc_inline = fw_entry[1][:bc_len]
-        bc_hap = barcode_dict.get(bc_inline, "A00C00B00D00")
-        # the default barcode entry is None, meaning it hasnt been assigned a haplotag equivalent yet
-        if not bc_hap:
-            bc_hap = "".join(next(haplotag_bc))
-            barcode_dict[bc_inline] = bc_hap
-        _new_fw  = fw_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}\n"
-        _new_fw += fw_entry[1][bc_len:] + "\n"
-        _new_fw += fw_entry[2] + "\n"
-        _new_fw += fw_entry[3][bc_len:] + "\n"
+        bc_hap = get_value_by_key(barcode_database, bc_inline)
+        fw_entry[0] = fw_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}"
+        fw_entry[1] = fw_entry[1][bc_len:]
+        fw_entry[3] = fw_entry[3][bc_len:]
+        _new_fw = "\n".join(fw_entry) + "\n"
         if rv_entry:
-            _new_rv  = rv_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}\n"
-            _new_rv += rv_entry[1] + "\n"
-            _new_rv += rv_entry[2] + "\n"
-            _new_rv += rv_entry[3] + "\n"
+            valid_record(rv_entry, "reverse")
+            rv_entry[0]  = rv_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}"
+            _new_rv = "\n".join(rv_entry) + "\n"
         else:
             _new_rv = None
-        return _new_fw, _new_rv
     else:
+        _new_fw = None
         # no forward read, therefor no barcode to search for
-        _new_rv  = rv_entry[0].split()[0] + f"\tBX:Z:A00C00B00D00\n"
-        _new_rv += rv_entry[1] + "\n"
-        _new_rv += rv_entry[2] + "\n"
-        _new_rv += rv_entry[3] + "\n"
-        return None, _new_rv
+        if rv_entry:
+            valid_record(rv_entry, "reverse")
+            rv_entry[0] = rv_entry[0].split()[0] + "\tBX:Z:A00C00B00D00"
+            _new_rv = "\n".join(rv_entry) + "\n"
+        else:
+            _new_rv = None
+    return _new_fw, _new_rv
 
-bc_range = [f"{i}".zfill(2) for i in range(1,97)]
-bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)
+# Connect to an in-memory SQLite database
+bc_db = sqlite3.connect(':memory:')
+# Create the table to store key-value pairs
+bc_db.cursor().execute('''
+    CREATE TABLE kv_store (
+        key TEXT PRIMARY KEY,
+        value TEXT
+    )
+''')
+bc_db.commit()
+
+nucleotides = {'A','C','G','T'}
+lengths = set()
 
-bc_dict = {}
 # read in barcodes
 opener = gzip.open if args.barcodes.lower().endswith('.gz') else open
 mode = 'rt' if args.barcodes.lower().endswith('.gz') else 'r'
 with opener(args.barcodes, mode) as bc_file:
     for line in bc_file:
-        barcode = line.rstrip().split()[0]
-        validate_barcode(barcode)
-        bc_dict[barcode] = None
+        try:
+            ATCG,ACBD = line.rstrip().split()
+        except ValueError:
+            sys.stderr.write(f"Invalid barcode entry: {line.rstrip()}\nExpected two entries: a nucleotide barcode and ACBD format barcode with a space/tab separatng them, e.g. ATATCAGA A01C22B13D93")
+            sys.exit(1)
+        if not set(ATCG).issubset(nucleotides):
+            sys.stderr.write(f"Invalid barcode format: {ATCG}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.\n")
+            sys.exit(1)
+
+        insert_key_value(bc_db, ATCG, ACBD)
+        #bc_dict[ATCG] = ACBD
+        lengths.add(len(ATCG))
+    if len(lengths) > 1:
+        sys.stderr.write("Can only search sequences for barcodes of a single length, but multiple barcode legnths detected: " + ",".join([str(i) for i in lengths]))
+    else:
+        bc_len = lengths.pop()
 
 # simultaneously iterate the forward and reverse fastq files
-fw_out = gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6)
-rv_out = gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6)
-
-with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i:
-    for fw_record, rv_record in zip_longest(iter_fastq_records(fw_i), iter_fastq_records(rv_i)):
-        new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator, args.length)
-        if new_fw:
-            fw_out.write(new_fw.encode("utf-8"))
-        if new_rv:
-            rv_out.write(new_rv.encode("utf-8"))
-
-fw_out.close()
-rv_out.close()
+with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i,\
+    gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6) as fw_out,\
+    gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6) as rv_out:
+    record_F = []
+    record_R = []
+    for fw_record, rv_record in zip_longest(fw_i, rv_i):
+        try:
+            record_F.append(fw_record.decode().rstrip("\n"))
+        except AttributeError:
+            # if the file ends before the other one
+            pass
+        try:
+            record_R.append(rv_record.decode().rstrip("\n"))
+        except AttributeError:
+            pass
+        # sanity checks
+        if len(record_F) == 4 or len(record_R) == 4:
+            new_fw, new_rv = process_record(record_F, record_R, bc_db, bc_len)
+            if new_fw:
+                fw_out.write(new_fw.encode("utf-8"))
+                record_F = []
+            if new_rv:
+                rv_out.write(new_rv.encode("utf-8"))
+                record_R = []
 
-with open(f"{args.prefix}.barcodes", "w") as bx_out:
-    for i,j in bc_dict.items():
-        if j:
-            bx_out.write(f"{i}\t{j}\n")
+bc_db.cursor().close()