Skip to content

Commit

Permalink
Merge pull request #162 from pdimens/haplotag_lrsims
Browse files Browse the repository at this point in the history
haplotagging barcodes as default
  • Loading branch information
pdimens authored Nov 14, 2024
2 parents 9c8a01d + 4802867 commit 9334a66
Show file tree
Hide file tree
Showing 13 changed files with 149 additions and 153 deletions.
3 changes: 2 additions & 1 deletion .github/filters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,8 @@ simreads: &simreads
- 'harpy/snakefiles/simulate_linkedreads.smk'
- 'test/genome**gz'
- 'extractReads.cpp'
- 'harpy/bin/10xtoHaplotag.py'
- 'harpy/bin/inline_to_haplotag.py'
- 'harpy/bin/haplotag_barcodes.py'
- 'harpy/scripts/LRSIM_harpy.pl'
assembly: &assembly
- *common
Expand Down
108 changes: 0 additions & 108 deletions harpy/bin/10xtoHaplotag.py

This file was deleted.

24 changes: 24 additions & 0 deletions harpy/bin/haplotag_barcodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#! /usr/bin/env python
"""Generates a text file listing the haplotagging ACBD barcodes"""
import sys
import argparse
from itertools import product

parser = argparse.ArgumentParser(
prog = 'haplotag_barcodes.py',
description ="Generates a text file listing the haplotagging ACBD barcodes",
usage = "haplotag_barcodes.py > barcodes.txt"
)

args = parser.parse_args()

BX = {
"A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"],
"C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"],
"B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"],
"D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"]
}

bc_generator = product(BX["A"], BX["C"], BX["B"], BX["D"])
for BC in bc_generator:
sys.stdout.write("".join(BC) + "\n")
100 changes: 100 additions & 0 deletions harpy/bin/inline_to_haplotag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#! /usr/bin/env python
"""Convert inline barcodes into haplotag style ones"""
import os
import sys
import gzip
import argparse
from itertools import zip_longest, product

parser = argparse.ArgumentParser(
prog = 'inline_to_haplotag.py',
description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z).',
usage = "inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix> > barcodes.conversion.txt",
exit_on_error = False
)

parser.add_argument("-f", "--forward", required = True, type = str, help = "Forward reads of paired-end FASTQ file pair (gzipped)")
parser.add_argument("-r", "--reverse", required = True, type = str, help = "Reverse reads of paired-end FASTQ file pair (gzipped)")
parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile FASTQ files (e.g. <prefix>.R1.fq.gz)")
parser.add_argument("-b", "--barcodes", required = True, type=str, help="File listing the linked-read barcodes to convert to haplotag format, one barcode per line")
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()
err = []
for i in [args.forward, args.reverse, args.barcodes]:
if not os.path.exists(i):
err.append(i)
if err:
parser.error("Some input files were not found on the system:\n" + ", ".join(err))

def iter_fastq_records(file_handle):
"""Iterate over FASTQ records in a file.
file_handle: Opened gzip file handle
Yields: FASTQ record [header, seq, '+', qual]
Raises ValueError If file is not in FASTQ format
"""
record = []
for line in file_handle:
line = line.decode().rstrip("\n")
record.append(line)
if len(record) == 4:
# format sanity check
if not (record[0].startswith("@") and record[2] == "+"):
raise ValueError("Invalid FASTQ format")
yield record
record = []
if record:
raise ValueError("Incomplete FASTQ record at end of file")

def validate_barcode(barcode):
"""Validate barcode format (A,C,G,T)."""
if not set(barcode).issubset({'A','C','G','T'}):
raise ValueError(f"Invalid barcode format: {barcode}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.")

def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc):
"""convert the barcode to haplotag"""
# [0] = header, [1] = seq, [2] = +, [3] = qual
bc10x = fw_entry[1][:16]
bchap = barcode_dict.get(bc10x, "A00C00B00D00")
if not bchap:
bchap = "".join(next(haplotag_bc))
barcode_dict[bc10x] = bchap
_new_fw = fw_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n"
_new_fw += fw_entry[1][16:] + "\n"
_new_fw += fw_entry[2] + "\n"
_new_fw += fw_entry[3][16:] + "\n"
_new_rv = rv_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n"
_new_rv += "\n".join(rv_entry[1:3])
return _new_fw, _new_rv

bc_range = [f"{i}".zfill(2) for i in range(1,97)]
bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)

bc_dict = {}

# read in barcodes
opener = gzip.open if args.barcodes.lower().endswith('.gz') else open
mode = 'rt' if args.barcodes.lower().endswith('.gz') else 'r'
with opener(args.barcodes, mode) as bc_file:
for line in bc_file:
barcode = line.rstrip().split()[0]
validate_barcode(barcode)
bc_dict[barcode] = None

# simultaneously iterate the forward and reverse fastq files
fw_out = gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6)
rv_out = gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6)

with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i:
for fw_record, rv_record in zip_longest(iter_fastq_records(fw_i), iter_fastq_records(rv_i)):
new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator)
fw_out.write(new_fw.encode("utf-8"))
rv_out.write(new_rv.encode("utf-8"))

fw_out.close()
rv_out.close()

for i,j in bc_dict.items():
if j:
sys.stdout.write(f"{i}\t{j}\n")
5 changes: 1 addition & 4 deletions harpy/bin/make_windows.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,7 @@ def makewindows(_contig, _c_len, windowsize):

if testname.endswith("fai"):
with open(args.input, "r", encoding="utf-8") as fai:
while True:
line = fai.readline()
if not line:
break
for line in fai:
lsplit = line.split("\t")
contig = lsplit[0]
c_len = int(lsplit[1])
Expand Down
5 changes: 1 addition & 4 deletions harpy/bin/molecule_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,7 @@

# read the fasta index file as a dict of contig lengths
with open(args.fai, "r", encoding= "utf-8") as fai:
while True:
line = fai.readline()
if not line:
break
for line in fai:
splitline = line.split()
contig = splitline[0]
length = splitline[1]
Expand Down
5 changes: 1 addition & 4 deletions harpy/bin/parse_phaseblocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,7 @@

with open(args.input, "r", encoding="utf-8") as blocks:
FIRST_LINE = True
while True:
line = blocks.readline()
if not line:
break
for line in blocks:
lsplit = line.split()
if lsplit[0] == "BLOCK:":
n_snp = int(lsplit[6])
Expand Down
13 changes: 6 additions & 7 deletions harpy/simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def simulate():
@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads")
@click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "File of linked-read barcodes to add to reads")
@click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True, help = "Standard deviation of read-pair distance")
@click.option('-m', '--molecules-per', type = click.IntRange(min = 1), default = 10, show_default=True, help = "Average number of molecules per partition")
@click.option('-m', '--molecules-per', type = click.IntRange(min = 1, max = 4700), default = 10, show_default=True, help = "Average number of molecules per partition")
@click.option('-l', '--molecule-length', type = click.IntRange(min = 2), default = 100, show_default=True, help = "Mean molecule length (kbp)")
@click.option('-r', '--mutation-rate', type = click.FloatRange(min = 0), default=0.001, show_default=True, help = "Random mutation rate for simulating reads")
@click.option('-d', '--outer-distance', type = click.IntRange(min = 100), default = 350, show_default= True, help = "Outer distance between paired-end reads (bp)")
Expand All @@ -146,12 +146,11 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r
Create linked reads from a genome
Two haplotype genomes (un/compressed fasta) need to be provided as inputs at the end of the command. If
you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/workflows/simulate/simulate-variants/#simulate-diploid-assembly).
you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/blog/simulate_diploid/).
If not providing a file for `--barcodes`, Harpy will download the `4M-with-alts-february-2016.txt`
file containing the standard 16-basepair 10X barcodes, which is available from 10X genomics and the
LRSIM [GitHub repository](https://github.com/aquaskyline/LRSIM/). The `--barcodes` file is
expected to have one 16-basepair barcode per line.
If not providing a file for `--barcodes`, Harpy will generate a file containing the original
(96^4) set of 24-basepair haplotagging barcodes (~2GB disk space). The `--barcodes` file is expected to have one
linked-read barcode per line, given as nucleotides.
"""
output_dir = output_dir.rstrip("/")
workflowdir = os.path.join(output_dir, 'workflow')
Expand Down Expand Up @@ -202,7 +201,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r
start_text.add_column(header="value", justify="left")
start_text.add_row("Genome Haplotype 1:", os.path.basename(genome_hap1))
start_text.add_row("Genome Haplotype 2:", os.path.basename(genome_hap2))
start_text.add_row("Barcodes:", os.path.basename(barcodes) if barcodes else "Barcodes: 10X Default")
start_text.add_row("Barcodes:", os.path.basename(barcodes) if barcodes else "Barcodes: Haplotagging Default")
start_text.add_row("Output Folder:", output_dir + "/")
start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz")
launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary")
Expand Down
19 changes: 10 additions & 9 deletions harpy/snakefiles/simulate_linkedreads.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ gen_hap1 = config["inputs"]["genome_hap1"]
gen_hap2 = config["inputs"]["genome_hap2"]
barcodes = config["inputs"].get("barcodes", None)
envdir = os.path.join(os.getcwd(), ".harpy_envs")
barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/4M-with-alts-february-2016.txt"
barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/haplotag_barcodes.txt"

rule link_1st_geno:
input:
Expand Down Expand Up @@ -50,12 +50,13 @@ rule index_genome:
"samtools faidx --fai-idx {output} {input}"

if not barcodes:
rule download_barcodes:
rule create_barcodes:
output:
barcodefile
run:
from urllib.request import urlretrieve
_ = urlretrieve("https://raw.githubusercontent.com/aquaskyline/LRSIM/master/4M-with-alts-february-2016.txt", output[0])
container:
None
shell:
"haplotag_barcodes.py > {output}"

rule create_reads:
input:
Expand Down Expand Up @@ -147,7 +148,7 @@ rule extract_reads:
shell:
"extractReads {input} {params} 2> {log}"

rule convert_to_haplotag:
rule demultiplex_barcodes:
input:
fw = outdir + "/10X/sim_hap{hap}_10x_R1_001.fastq.gz",
rv = outdir + "/10X/sim_hap{hap}_10x_R2_001.fastq.gz",
Expand All @@ -162,7 +163,7 @@ rule convert_to_haplotag:
container:
None
shell:
"10xtoHaplotag.py -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params} > {log}"
"inline_to_haplotag.py -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params} > {log}"

rule workflow_summary:
default_target: True
Expand Down Expand Up @@ -193,8 +194,8 @@ rule workflow_summary:
lrsim = "LRSIM was started from step 3 (-u 3) with these parameters:\n"
lrsim += f"\tLRSIM_harpy.pl -g genome1,genome2 -p {params.lrsproj_dir}/lrsim/sim -b BARCODES -r {params.lrsproj_dir} -i {params.lrsoutdist} -s {params.lrsdist_sd} -x {params.lrsn_pairs} -f {params.lrsmol_len} -t {params.lrsparts} -m {params.lrsmols_per} -z THREADS {params.lrsstatic}"
summary.append(lrsim)
bxconvert = "10X style barcodes were converted in haplotag BX:Z tags using:\n"
bxconvert += "\t10xtoHaplotag.py"
bxconvert = "Inline barcodes were converted in haplotag BX:Z tags using:\n"
bxconvert += "\tinline_to_haplotag.py"
summary.append(bxconvert)
sm = "The Snakemake workflow was called via command line:\n"
sm += f"\t{config['workflow_call']}"
Expand Down
Loading

0 comments on commit 9334a66

Please sign in to comment.