Merge pull request #162 from pdimens/haplotag_lrsims

haplotagging barcodes as default
pdimens · Nov 14, 2024 · 9334a66 · 9334a66
2 parents 9c8a01d + 4802867
commit 9334a66
Show file tree

Hide file tree

Showing 13 changed files with 149 additions and 153 deletions.
diff --git a/.github/filters.yml b/.github/filters.yml
@@ -153,7 +153,8 @@ simreads: &simreads
   - 'harpy/snakefiles/simulate_linkedreads.smk'
   - 'test/genome**gz'
   - 'extractReads.cpp'
-  - 'harpy/bin/10xtoHaplotag.py'
+  - 'harpy/bin/inline_to_haplotag.py'
+  - 'harpy/bin/haplotag_barcodes.py'
   - 'harpy/scripts/LRSIM_harpy.pl'
 assembly: &assembly
   - *common

diff --git a/harpy/bin/10xtoHaplotag.py b/harpy/bin/10xtoHaplotag.py
diff --git a/harpy/bin/haplotag_barcodes.py b/harpy/bin/haplotag_barcodes.py
@@ -0,0 +1,24 @@
+#! /usr/bin/env python
+"""Generates a text file listing the haplotagging ACBD barcodes"""
+import sys
+import argparse
+from itertools import product
+
+parser = argparse.ArgumentParser(
+    prog = 'haplotag_barcodes.py',
+    description ="Generates a text file listing the haplotagging ACBD barcodes",
+    usage = "haplotag_barcodes.py > barcodes.txt"
+    )
+
+args = parser.parse_args()
+
+BX = {
+    "A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"],
+    "C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"],
+    "B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"],
+    "D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"]
+}
+
+bc_generator = product(BX["A"], BX["C"], BX["B"], BX["D"])
+for BC in bc_generator:
+    sys.stdout.write("".join(BC) + "\n")
diff --git a/harpy/bin/inline_to_haplotag.py b/harpy/bin/inline_to_haplotag.py
@@ -0,0 +1,100 @@
+#! /usr/bin/env python
+"""Convert inline barcodes into haplotag style ones"""
+import os
+import sys
+import gzip
+import argparse
+from itertools import zip_longest, product
+
+parser = argparse.ArgumentParser(
+    prog = 'inline_to_haplotag.py',
+    description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z).',
+    usage = "inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix> > barcodes.conversion.txt",
+    exit_on_error = False
+    )
+
+parser.add_argument("-f", "--forward", required = True, type = str, help = "Forward reads of paired-end FASTQ file pair (gzipped)")
+parser.add_argument("-r", "--reverse", required = True, type = str, help = "Reverse reads of paired-end FASTQ file pair (gzipped)")
+parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile FASTQ files (e.g. <prefix>.R1.fq.gz)")
+parser.add_argument("-b", "--barcodes", required = True, type=str, help="File listing the linked-read barcodes to convert to haplotag format, one barcode per line")
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+args = parser.parse_args()
+err = []
+for i in [args.forward, args.reverse, args.barcodes]:
+    if not os.path.exists(i):
+        err.append(i)
+if err:
+    parser.error("Some input files were not found on the system:\n" + ", ".join(err))
+
+def iter_fastq_records(file_handle):
+    """Iterate over FASTQ records in a file.
+    file_handle: Opened gzip file handle       
+    Yields: FASTQ record [header, seq, '+', qual]
+    Raises ValueError If file is not in FASTQ format
+    """
+    record = []
+    for line in file_handle:
+        line = line.decode().rstrip("\n")
+        record.append(line)
+        if len(record) == 4:
+            # format sanity check
+            if not (record[0].startswith("@") and record[2] == "+"):
+                raise ValueError("Invalid FASTQ format")
+            yield record
+            record = []
+    if record:
+        raise ValueError("Incomplete FASTQ record at end of file")
+
+def validate_barcode(barcode):
+    """Validate barcode format (A,C,G,T)."""
+    if not set(barcode).issubset({'A','C','G','T'}):
+        raise ValueError(f"Invalid barcode format: {barcode}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.")
+
+def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc):
+    """convert the barcode to haplotag"""
+    # [0] = header, [1] = seq, [2] = +, [3] = qual
+    bc10x = fw_entry[1][:16]
+    bchap = barcode_dict.get(bc10x, "A00C00B00D00")
+    if not bchap:
+        bchap = "".join(next(haplotag_bc))
+        barcode_dict[bc10x] = bchap
+    _new_fw  = fw_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n"
+    _new_fw += fw_entry[1][16:] + "\n"
+    _new_fw += fw_entry[2] + "\n"
+    _new_fw += fw_entry[3][16:] + "\n"
+    _new_rv  = rv_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n"
+    _new_rv += "\n".join(rv_entry[1:3])
+    return _new_fw, _new_rv
+
+bc_range = [f"{i}".zfill(2) for i in range(1,97)]
+bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)
+
+bc_dict = {}
+
+# read in barcodes
+opener = gzip.open if args.barcodes.lower().endswith('.gz') else open
+mode = 'rt' if args.barcodes.lower().endswith('.gz') else 'r'
+with opener(args.barcodes, mode) as bc_file:
+    for line in bc_file:
+        barcode = line.rstrip().split()[0]
+        validate_barcode(barcode)
+        bc_dict[barcode] = None
+
+# simultaneously iterate the forward and reverse fastq files
+fw_out = gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6)
+rv_out = gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6)
+
+with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i:
+    for fw_record, rv_record in zip_longest(iter_fastq_records(fw_i), iter_fastq_records(rv_i)):
+        new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator)
+        fw_out.write(new_fw.encode("utf-8"))
+        rv_out.write(new_rv.encode("utf-8"))
+
+fw_out.close()
+rv_out.close()
+
+for i,j in bc_dict.items():
+    if j:
+        sys.stdout.write(f"{i}\t{j}\n")
diff --git a/harpy/bin/make_windows.py b/harpy/bin/make_windows.py
@@ -52,10 +52,7 @@ def makewindows(_contig, _c_len, windowsize):
 
 if testname.endswith("fai"):
     with open(args.input, "r", encoding="utf-8") as fai:
-        while True:
-            line = fai.readline()
-            if not line:
-                break
+        for line in fai:
             lsplit = line.split("\t")
             contig = lsplit[0]
             c_len = int(lsplit[1])

diff --git a/harpy/bin/molecule_coverage.py b/harpy/bin/molecule_coverage.py
@@ -45,10 +45,7 @@
 
 # read the fasta index file as a dict of contig lengths
 with open(args.fai, "r", encoding= "utf-8") as fai:
-    while True:
-        line = fai.readline()
-        if not line:
-            break
+    for line in fai:
         splitline = line.split()
         contig = splitline[0]
         length = splitline[1]

diff --git a/harpy/bin/parse_phaseblocks.py b/harpy/bin/parse_phaseblocks.py
@@ -23,10 +23,7 @@
 
 with open(args.input, "r", encoding="utf-8") as blocks:
     FIRST_LINE = True
-    while True:
-        line = blocks.readline()
-        if not line:
-            break
+    for line in blocks:
         lsplit = line.split()
         if lsplit[0] == "BLOCK:":
             n_snp     = int(lsplit[6])

diff --git a/harpy/simulate.py b/harpy/simulate.py
@@ -126,7 +126,7 @@ def simulate():
 @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads")
 @click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "File of linked-read barcodes to add to reads")
 @click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True,  help = "Standard deviation of read-pair distance")
-@click.option('-m', '--molecules-per', type = click.IntRange(min = 1), default = 10, show_default=True,  help = "Average number of molecules per partition")
+@click.option('-m', '--molecules-per', type = click.IntRange(min = 1, max = 4700), default = 10, show_default=True,  help = "Average number of molecules per partition")
 @click.option('-l', '--molecule-length', type = click.IntRange(min = 2), default = 100, show_default=True,  help = "Mean molecule length (kbp)")
 @click.option('-r', '--mutation-rate', type = click.FloatRange(min = 0), default=0.001, show_default=True,  help = "Random mutation rate for simulating reads")
 @click.option('-d', '--outer-distance', type = click.IntRange(min = 100), default = 350, show_default= True, help = "Outer distance between paired-end reads (bp)")
@@ -146,12 +146,11 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r
     Create linked reads from a genome
  
     Two haplotype genomes (un/compressed fasta) need to be provided as inputs at the end of the command. If
-    you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/workflows/simulate/simulate-variants/#simulate-diploid-assembly).
+    you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/blog/simulate_diploid/).
 
-    If not providing a file for `--barcodes`, Harpy will download the `4M-with-alts-february-2016.txt`
-    file containing the standard 16-basepair 10X barcodes, which is available from 10X genomics and the
-    LRSIM [GitHub repository](https://github.com/aquaskyline/LRSIM/). The `--barcodes` file is
-    expected to have one 16-basepair barcode per line.
+    If not providing a file for `--barcodes`, Harpy will generate a file containing the original
+    (96^4) set of 24-basepair haplotagging barcodes (~2GB disk space). The `--barcodes` file is expected to have one
+    linked-read barcode per line, given as nucleotides.
     """
     output_dir = output_dir.rstrip("/")
     workflowdir = os.path.join(output_dir, 'workflow')
@@ -202,7 +201,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r
     start_text.add_column(header="value", justify="left")
     start_text.add_row("Genome Haplotype 1:", os.path.basename(genome_hap1))
     start_text.add_row("Genome Haplotype 2:", os.path.basename(genome_hap2))
-    start_text.add_row("Barcodes:", os.path.basename(barcodes) if barcodes else "Barcodes: 10X Default")
+    start_text.add_row("Barcodes:", os.path.basename(barcodes) if barcodes else "Barcodes: Haplotagging Default")
     start_text.add_row("Output Folder:", output_dir + "/")
     start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz")
     launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary")

diff --git a/harpy/snakefiles/simulate_linkedreads.smk b/harpy/snakefiles/simulate_linkedreads.smk
@@ -17,7 +17,7 @@ gen_hap1 = config["inputs"]["genome_hap1"]
 gen_hap2 = config["inputs"]["genome_hap2"]
 barcodes = config["inputs"].get("barcodes", None)
 envdir   = os.path.join(os.getcwd(), ".harpy_envs")
-barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/4M-with-alts-february-2016.txt"
+barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/haplotag_barcodes.txt"
 
 rule link_1st_geno:
     input:
@@ -50,12 +50,13 @@ rule index_genome:
         "samtools faidx --fai-idx {output} {input}"
 
 if not barcodes:
-    rule download_barcodes:
+    rule create_barcodes:
         output:
             barcodefile
-        run:
-            from urllib.request import urlretrieve
-            _ = urlretrieve("https://raw.githubusercontent.com/aquaskyline/LRSIM/master/4M-with-alts-february-2016.txt", output[0])
+        container:
+            None
+        shell:
+            "haplotag_barcodes.py > {output}"
 
 rule create_reads:
     input:
@@ -147,7 +148,7 @@ rule extract_reads:
     shell:
         "extractReads {input} {params} 2> {log}"
 
-rule convert_to_haplotag:
+rule demultiplex_barcodes:
     input:
         fw = outdir + "/10X/sim_hap{hap}_10x_R1_001.fastq.gz",
         rv = outdir + "/10X/sim_hap{hap}_10x_R2_001.fastq.gz",
@@ -162,7 +163,7 @@ rule convert_to_haplotag:
     container:
         None
     shell:
-        "10xtoHaplotag.py -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params} > {log}"
+        "inline_to_haplotag.py -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params} > {log}"
 
 rule workflow_summary:
     default_target: True
@@ -193,8 +194,8 @@ rule workflow_summary:
         lrsim = "LRSIM was started from step 3 (-u 3) with these parameters:\n"
         lrsim += f"\tLRSIM_harpy.pl -g genome1,genome2 -p {params.lrsproj_dir}/lrsim/sim -b BARCODES -r {params.lrsproj_dir} -i {params.lrsoutdist} -s {params.lrsdist_sd} -x {params.lrsn_pairs} -f {params.lrsmol_len} -t {params.lrsparts} -m {params.lrsmols_per} -z THREADS {params.lrsstatic}"
         summary.append(lrsim)
-        bxconvert = "10X style barcodes were converted in haplotag BX:Z tags using:\n"
-        bxconvert += "\t10xtoHaplotag.py"
+        bxconvert = "Inline barcodes were converted in haplotag BX:Z tags using:\n"
+        bxconvert += "\tinline_to_haplotag.py"
         summary.append(bxconvert)
         sm = "The Snakemake workflow was called via command line:\n"
         sm += f"\t{config['workflow_call']}"