Skip to content

Commit

Permalink
Merge pull request #163 from pdimens/haplotag_lrsims
Browse files Browse the repository at this point in the history
better sim demux support
  • Loading branch information
pdimens authored Nov 15, 2024
2 parents 9334a66 + 4e819ab commit a86e607
Show file tree
Hide file tree
Showing 8 changed files with 1,000,106 additions and 53 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,9 @@ jobs:
path: .snakemake/singularity
- name: simulate linked reads
shell: micromamba-shell {0}
run: harpy simulate linkedreads --quiet -t 4 -n 2 -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz
run: |
gunzip test/haplotag.bc.gz
harpy simulate linkedreads --quiet -t 4 -n 2 -b test/haplotag.bc -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz
assembly:
needs: [changes, pkgbuild]
Expand Down
25 changes: 25 additions & 0 deletions harpy/_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,3 +522,28 @@ def validate(fastq):
futures = [executor.submit(validate, i) for i in fastq_list]
for future in as_completed(futures):
progress.update(task_progress, advance=1)

def validate_barcodefile(infile, return_len = False):
"""Does validations to make sure it's one length, one per line, and nucleotides"""
if is_gzip(infile):
print_error("Incorrect format", f"The input file must be in uncompressed format. Please decompress [blue]{infile}[/blue] and try again.")
sys.exit(1)
lengths = set()
nucleotides = {'A','C','G','T'}
line_num = 0
with open(infile, "r") as bc_file:
for line in bc_file:
line_num += 1
barcode = line.rstrip()
if len(barcode.split()) > 1:
print_error("Incorrect format", f"There must be one barcode per line, but multiple entries were detected on [bold]line {line_num}[/bold] in [blue]{infile}[/blue]")
sys.exit(1)
if not set(barcode).issubset(nucleotides) or barcode != barcode.upper():
print_error("Incorrect format", f"Invalid barcode format on [bold]line {line_num}[/bold]: [yellow]{barcode}[/yellow].\nBarcodes in [blue]{infile}[/blue] must be captial letters and only contain standard nucleotide characters [green]ATCG[/green].")
sys.exit(1)
lengths.add(len(barcode))
if len(lengths) > 1:
print_error("Incorrect format", f"Barcodes in [blue]{infile}[/blue] must all be a single length, but multiple lengths were detected: [yellow]" + ", ".join(lengths))
sys.exit(1)
if return_len:
return lengths.pop()
4 changes: 3 additions & 1 deletion harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ._launch import launch_snakemake
from ._parsers import parse_fastq_inputs
from ._printing import print_error, print_solution, print_notice
from ._validations import check_fasta, fasta_contig_match
from ._validations import check_fasta, fasta_contig_match, validate_barcodefile

@click.group(options_metavar='', context_settings={"help_option_names" : ["-h", "--help"]})
def align():
Expand Down Expand Up @@ -209,6 +209,8 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
check_fasta(genome)
if contigs:
fasta_contig_match(contigs, genome)
if barcode_list:
validate_barcodefile(barcode_list)
fetch_rule(workflowdir, "align_ema.smk")
fetch_report(workflowdir, "align_stats.Rmd")
fetch_report(workflowdir, "align_bxstats.Rmd")
Expand Down
62 changes: 40 additions & 22 deletions harpy/bin/inline_to_haplotag.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
parser = argparse.ArgumentParser(
prog = 'inline_to_haplotag.py',
description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z).',
usage = "inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix> > barcodes.conversion.txt",
usage = "inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix>",
exit_on_error = False
)

parser.add_argument("-f", "--forward", required = True, type = str, help = "Forward reads of paired-end FASTQ file pair (gzipped)")
parser.add_argument("-r", "--reverse", required = True, type = str, help = "Reverse reads of paired-end FASTQ file pair (gzipped)")
parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile FASTQ files (e.g. <prefix>.R1.fq.gz)")
parser.add_argument("-l", "--length", required = True, type = str, help = "Length of the barcodes (all must be one length)")
parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile files (e.g. <prefix>.R1.fq.gz)")
parser.add_argument("-b", "--barcodes", required = True, type=str, help="File listing the linked-read barcodes to convert to haplotag format, one barcode per line")
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
Expand Down Expand Up @@ -52,27 +53,41 @@ def validate_barcode(barcode):
if not set(barcode).issubset({'A','C','G','T'}):
raise ValueError(f"Invalid barcode format: {barcode}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.")

def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc):
def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc, bc_len):
"""convert the barcode to haplotag"""
# [0] = header, [1] = seq, [2] = +, [3] = qual
bc10x = fw_entry[1][:16]
bchap = barcode_dict.get(bc10x, "A00C00B00D00")
if not bchap:
bchap = "".join(next(haplotag_bc))
barcode_dict[bc10x] = bchap
_new_fw = fw_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n"
_new_fw += fw_entry[1][16:] + "\n"
_new_fw += fw_entry[2] + "\n"
_new_fw += fw_entry[3][16:] + "\n"
_new_rv = rv_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n"
_new_rv += "\n".join(rv_entry[1:3])
return _new_fw, _new_rv
# search for a valid barcode at all possible barcode lengths
if fw_entry:
bc_inline = fw_entry[1][:bc_len]
bc_hap = barcode_dict.get(bc_inline, "A00C00B00D00")
# the default barcode entry is None, meaning it hasnt been assigned a haplotag equivalent yet
if not bc_hap:
bc_hap = "".join(next(haplotag_bc))
barcode_dict[bc_inline] = bc_hap
_new_fw = fw_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}\n"
_new_fw += fw_entry[1][bc_len:] + "\n"
_new_fw += fw_entry[2] + "\n"
_new_fw += fw_entry[3][bc_len:] + "\n"
if rv_entry:
_new_rv = rv_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}\n"
_new_rv += rv_entry[1] + "\n"
_new_rv += rv_entry[2] + "\n"
_new_rv += rv_entry[3] + "\n"
else:
_new_rv = None
return _new_fw, _new_rv
else:
# no forward read, therefor no barcode to search for
_new_rv = rv_entry[0].split()[0] + f"\tBX:Z:A00C00B00D00\n"
_new_rv += rv_entry[1] + "\n"
_new_rv += rv_entry[2] + "\n"
_new_rv += rv_entry[3] + "\n"
return None, _new_rv

bc_range = [f"{i}".zfill(2) for i in range(1,97)]
bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)

bc_dict = {}

# read in barcodes
opener = gzip.open if args.barcodes.lower().endswith('.gz') else open
mode = 'rt' if args.barcodes.lower().endswith('.gz') else 'r'
Expand All @@ -88,13 +103,16 @@ def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc):

with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i:
for fw_record, rv_record in zip_longest(iter_fastq_records(fw_i), iter_fastq_records(rv_i)):
new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator)
fw_out.write(new_fw.encode("utf-8"))
rv_out.write(new_rv.encode("utf-8"))
new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator, args.length)
if new_fw:
fw_out.write(new_fw.encode("utf-8"))
if new_rv:
rv_out.write(new_rv.encode("utf-8"))

fw_out.close()
rv_out.close()

for i,j in bc_dict.items():
if j:
sys.stdout.write(f"{i}\t{j}\n")
with open(f"{args.prefix}.barcodes", "w") as bx_out:
for i,j in bc_dict.items():
if j:
bx_out.write(f"{i}\t{j}\n")
8 changes: 5 additions & 3 deletions harpy/scripts/LRSIM_harpy.pl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ sub main {
g => undef,
n => undef,
d => 2,
l => 16,
r => undef,
p => undef,
c => undef,
Expand All @@ -64,7 +65,7 @@ sub main {
0 => 100
);
&usage( \%opts ) if ( @ARGV < 1 );
getopts( 'hnoc:g:d:r:p:b:u:e:E:i:s:x:f:t:m:z:1:2:3:4:5:6:7:8:9:0:',
getopts( 'hnoc:g:d:l:r:p:b:u:e:E:i:s:x:f:t:m:z:1:2:3:4:5:6:7:8:9:0:',
\%opts );
&usage( \%opts ) if ( defined $opts{h} );

Expand Down Expand Up @@ -604,7 +605,7 @@ sub main {
&Log("Simulate reads start");

#Load barcodes
our $barcodeLength = 16;
our $barcodeLength = $opts{l};
our @barcodes = ();
our $barcodesMutexLock : shared = 0;
our $numBarcodes = 0;
Expand Down Expand Up @@ -646,7 +647,7 @@ sub main {
}

&Log("Load read positions haplotype $i");
my @defaultBarcodeQualAry = split //, "AAAFFFKKKKKKKKKK";
my @defaultBarcodeQualAry = split //, "AAAFFF" . "K" x (barcodeLength - 6);
my %faidx = ();
my @boundary = ();
my $genomeSize =
Expand Down Expand Up @@ -879,6 +880,7 @@ sub usage {
Linked reads parameters:
-b STRING Barcodes list
-l INT Barcode Length
-x INT # million reads pairs in total to simulated [$$opts{x}]
-f INT Mean molecule length in kbp [$$opts{f}]
-c STRING Input a list of fragment sizes
Expand Down
10 changes: 7 additions & 3 deletions harpy/simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ._launch import launch_snakemake
from ._misc import fetch_rule, fetch_script, snakemake_log
from ._printing import print_error
from ._validations import check_fasta
from ._validations import check_fasta, validate_barcodefile

@click.group(options_metavar='', context_settings={"help_option_names" : ["-h", "--help"]})
def simulate():
Expand Down Expand Up @@ -165,7 +165,8 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r

check_fasta(genome_hap1)
check_fasta(genome_hap2)

if barcodes:
bc_len = validate_barcodefile(barcodes, True)
os.makedirs(f"{workflowdir}/", exist_ok= True)
fetch_rule(workflowdir, "simulate_linkedreads.smk")
fetch_script(workflowdir, "LRSIM_harpy.pl")
Expand All @@ -183,10 +184,13 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r
"partitions" : partitions,
"molecules_per_partition" : molecules_per,
"workflow_call" : command.rstrip(),
'barcodes': {
"file": Path(barcodes).resolve().as_posix() if barcodes else f"{workflowdir}/input/haplotag_barcodes.txt",
"length": bc_len if barcodes else 24
},
"inputs" : {
"genome_hap1" : Path(genome_hap1).resolve().as_posix(),
"genome_hap2" : Path(genome_hap2).resolve().as_posix(),
**({'barcodes': Path(barcodes).resolve().as_posix()} if barcodes else {}),
}
}
with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
Expand Down
46 changes: 23 additions & 23 deletions harpy/snakefiles/simulate_linkedreads.smk
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ onerror:
outdir = config["output_directory"]
gen_hap1 = config["inputs"]["genome_hap1"]
gen_hap2 = config["inputs"]["genome_hap2"]
barcodes = config["inputs"].get("barcodes", None)
barcode_file = config["barcodes"]["file"]
barcode_len = config["barcodes"]["length"]
envdir = os.path.join(os.getcwd(), ".harpy_envs")
barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/haplotag_barcodes.txt"

rule link_1st_geno:
input:
Expand Down Expand Up @@ -49,10 +49,10 @@ rule index_genome:
shell:
"samtools faidx --fai-idx {output} {input}"

if not barcodes:
if barcode_file == f"{outdir}/workflow/input/haplotag_barcodes.txt":
rule create_barcodes:
output:
barcodefile
f"{outdir}/workflow/input/haplotag_barcodes.txt"
container:
None
shell:
Expand Down Expand Up @@ -94,33 +94,32 @@ rule lrsim:
hap2 = f"{outdir}/dwgsim_simulated/dwgsim.1.12.fastq",
fai1 = outdir + "/workflow/input/hap.0.fasta.fai",
fai2 = outdir + "/workflow/input/hap.1.fasta.fai",
barcodes = barcodefile
barcodes = barcode_file
output:
collect(outdir + "/lrsim/sim.{hap}.{ext}", hap = [0,1], ext = ["fp", "manifest"]),
temp(f"{outdir}/lrsim/.status")
log:
f"{outdir}/logs/LRSIM.log"
params:
lrsim = f"{outdir}/workflow/scripts/LRSIM_harpy.pl",
proj_dir = f"{outdir}",
outdist = config["outer_distance"],
dist_sd = config["distance_sd"],
n_pairs = config["read_pairs"],
mol_len = config["molecule_length"],
parts = config["partitions"],
mols_per = config["molecules_per_partition"],
static = "-o 1 -d 2 -u 4"
infiles = f"-g {outdir}/dwgsim_simulated/dwgsim.0.12.fastq,{outdir}/dwgsim_simulated/dwgsim.1.12.fastq",
inbarcodes = f"-b {barcode_file}",
proj_dir = f"-p {outdir}/lrsim/sim",
prefix = f"-r {outdir}",
outdist = f"-i {config['outer_distance']}",
dist_sd = f"-s {config['distance_sd']}",
n_pairs = f"-x {config['read_pairs']}",
mol_len = f"-f {config['molecule_length']}",
parts = f"-t {config['partitions']}",
mols_per = f"-m {config['molecules_per_partition']}",
bc_len = f"-l {barcode_len}",
static = "-o 1 -d 2 -u 4"
threads:
workflow.cores
conda:
f"{envdir}/simulations.yaml"
shell:
"""
perl {params.lrsim} -g {input.hap1},{input.hap2} -p {params.proj_dir}/lrsim/sim \\
-b {input.barcodes} -r {params.proj_dir} -i {params.outdist} \\
-s {params.dist_sd} -x {params.n_pairs} -f {params.mol_len} \\
-t {params.parts} -m {params.mols_per} -z {threads} {params.static} 2> {log}
"""
"perl {params} -z {threads} 2> {log}"

rule sort_manifest:
input:
Expand Down Expand Up @@ -152,18 +151,19 @@ rule demultiplex_barcodes:
input:
fw = outdir + "/10X/sim_hap{hap}_10x_R1_001.fastq.gz",
rv = outdir + "/10X/sim_hap{hap}_10x_R2_001.fastq.gz",
barcodes = barcodefile
barcodes = barcode_file
output:
fw = outdir + "/sim_hap{hap}_haplotag.R1.fq.gz",
rv = outdir + "/sim_hap{hap}_haplotag.R2.fq.gz"
log:
outdir + "/logs/10XtoHaplotag/hap{hap}"
outdir + "/sim_hap/{hap}_haplotag.barcodes"
params:
lambda wc: f"""{outdir}/sim_hap{wc.get("hap")}_haplotag"""
outdir = outdir,
bc_len = barcode_len
container:
None
shell:
"inline_to_haplotag.py -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params} > {log}"
"inline_to_haplotag.py -l {params.bc_len} -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params.outdir}/sim_hap{wildcards.hap}_haplotag"

rule workflow_summary:
default_target: True
Expand Down
Loading

0 comments on commit a86e607

Please sign in to comment.