diff --git a/.github/filters.yml b/.github/filters.yml index b248390a2..8cf5f9407 100644 --- a/.github/filters.yml +++ b/.github/filters.yml @@ -153,7 +153,8 @@ simreads: &simreads - 'harpy/snakefiles/simulate_linkedreads.smk' - 'test/genome**gz' - 'extractReads.cpp' - - 'harpy/bin/10xtoHaplotag.py' + - 'harpy/bin/inline_to_haplotag.py' + - 'harpy/bin/haplotag_barcodes.py' - 'harpy/scripts/LRSIM_harpy.pl' assembly: &assembly - *common diff --git a/harpy/bin/10xtoHaplotag.py b/harpy/bin/10xtoHaplotag.py deleted file mode 100755 index 8d50f3b7c..000000000 --- a/harpy/bin/10xtoHaplotag.py +++ /dev/null @@ -1,108 +0,0 @@ -#! /usr/bin/env python -"""Convert 10X style barcodes into Haplotag style ones""" -import os -import sys -import gzip -import argparse -from itertools import zip_longest, product - -parser = argparse.ArgumentParser( - prog = '10xtoHaplotag.py', - description = 'Converts 10x linked reads to haplotag linked reads with barcodes in BX:Z: and OX:Z: header tags.', - usage = "10xtoHaplotag.py -f -r -b -p > barcodes.conversion.txt", - exit_on_error = False - ) - -parser.add_argument("-f", "--forward", required = True, type = str, help = "Forward reads of paired-end FASTQ file pair (gzipped)") -parser.add_argument("-r", "--reverse", required = True, type = str, help = "Reverse reads of paired-end FASTQ file pair (gzipped)") -parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile FASTQ files (e.g. .R1.fq.gz)") -parser.add_argument("-b", "--barcodes", required = True, type=str, help="File listing the 10X barcodes to convert to haplotag format, one barcode per line") -if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) -args = parser.parse_args() -err = [] -for i in [args.forward, args.reverse, args.barcodes]: - if not os.path.exists(i): - err.append(i) -if err: - parser.error("Some input files were not found on the system:\n" + ", ".join(err)) - -def process_record(fw_entry, rv_entry): - """convert the 10X to haplotag""" - # [0] = header, [1] = seq, [2] = +, [3] = qual - bc10x = fw_entry[1][:16] - bchap = bc_dict.get(bc10x, "A00C00B00D00") - if not bchap: - bchap = "".join(next(bc_generator)) - bc_dict[bc10x] = bchap - _new_fw = fw_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n" - _new_fw += fw_entry[1][16:] + "\n" - _new_fw += fw_entry[2] + "\n" - _new_fw += fw_entry[3][16:] + "\n" - _new_rv = rv_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n" - _new_rv += "\n".join(rv_entry[1:3]) - return _new_fw, _new_rv - -bc_range = [f"{i}".zfill(2) for i in range(1,97)] -bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range) - -bc_dict = {} - -# read in barcodes -with open(args.barcodes, "r") as bc_file: - while True: - # Get next line from file - if args.barcodes.endswith("gz"): - line = bc_file.readline().decode() - else: - line = bc_file.readline() - # if line is empty - # end of file is reached - if not line: - break - bc = line.rstrip("\n").split() - _10X = str(bc[0]) - bc_dict[_10X] = None - -# simultaneously iterate the forward and reverse fastq files -fw_reads = args.forward -rv_reads = args.reverse - -fw_out = gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6) -rv_out = gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6) - -with gzip.open(fw_reads, "r") as fw_i, gzip.open(rv_reads, "r") as rv_i: - # store the fq records here - fw_record = [] - rv_record = [] - i = 0 - for fw, rv in zip_longest(fw_i, rv_i): - fw_line = fw.decode().rstrip("\n") - rv_line = rv.decode().rstrip("\n") - if fw_line.startswith("@") and i > 0: - i += 1 - # process the full record - new_fw, new_rv = process_record(fw_record, rv_record) - # write new record to files - fw_out.write(new_fw.encode("utf-8")) - rv_out.write(new_rv.encode("utf-8")) - # reset the record - fw_record = [fw_line] - rv_record = [rv_line] - elif fw_line.startswith("@") and i == 0: - # its the first record don't write anything - i += 1 - fw_record = [fw_line] - rv_record = [rv_line] - else: - # just append the other lines to the record - fw_record.append(fw_line) - rv_record.append(rv_line) - -fw_out.close() -rv_out.close() - -for i,j in bc_dict.items(): - if j: - print(f"{i}\t{j}", file = sys.stdout) diff --git a/harpy/bin/haplotag_barcodes.py b/harpy/bin/haplotag_barcodes.py new file mode 100755 index 000000000..e1fb6fc14 --- /dev/null +++ b/harpy/bin/haplotag_barcodes.py @@ -0,0 +1,24 @@ +#! /usr/bin/env python +"""Generates a text file listing the haplotagging ACBD barcodes""" +import sys +import argparse +from itertools import product + +parser = argparse.ArgumentParser( + prog = 'haplotag_barcodes.py', + description ="Generates a text file listing the haplotagging ACBD barcodes", + usage = "haplotag_barcodes.py > barcodes.txt" + ) + +args = parser.parse_args() + +BX = { + "A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"], + "C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"], + "B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"], + "D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"] +} + +bc_generator = product(BX["A"], BX["C"], BX["B"], BX["D"]) +for BC in bc_generator: + sys.stdout.write("".join(BC) + "\n") \ No newline at end of file diff --git a/harpy/bin/inline_to_haplotag.py b/harpy/bin/inline_to_haplotag.py new file mode 100755 index 000000000..da314cfdb --- /dev/null +++ b/harpy/bin/inline_to_haplotag.py @@ -0,0 +1,100 @@ +#! /usr/bin/env python +"""Convert inline barcodes into haplotag style ones""" +import os +import sys +import gzip +import argparse +from itertools import zip_longest, product + +parser = argparse.ArgumentParser( + prog = 'inline_to_haplotag.py', + description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z).', + usage = "inline_to_haplotag.py -f -r -b -p > barcodes.conversion.txt", + exit_on_error = False + ) + +parser.add_argument("-f", "--forward", required = True, type = str, help = "Forward reads of paired-end FASTQ file pair (gzipped)") +parser.add_argument("-r", "--reverse", required = True, type = str, help = "Reverse reads of paired-end FASTQ file pair (gzipped)") +parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile FASTQ files (e.g. .R1.fq.gz)") +parser.add_argument("-b", "--barcodes", required = True, type=str, help="File listing the linked-read barcodes to convert to haplotag format, one barcode per line") +if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) +args = parser.parse_args() +err = [] +for i in [args.forward, args.reverse, args.barcodes]: + if not os.path.exists(i): + err.append(i) +if err: + parser.error("Some input files were not found on the system:\n" + ", ".join(err)) + +def iter_fastq_records(file_handle): + """Iterate over FASTQ records in a file. + file_handle: Opened gzip file handle + Yields: FASTQ record [header, seq, '+', qual] + Raises ValueError If file is not in FASTQ format + """ + record = [] + for line in file_handle: + line = line.decode().rstrip("\n") + record.append(line) + if len(record) == 4: + # format sanity check + if not (record[0].startswith("@") and record[2] == "+"): + raise ValueError("Invalid FASTQ format") + yield record + record = [] + if record: + raise ValueError("Incomplete FASTQ record at end of file") + +def validate_barcode(barcode): + """Validate barcode format (A,C,G,T).""" + if not set(barcode).issubset({'A','C','G','T'}): + raise ValueError(f"Invalid barcode format: {barcode}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.") + +def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc): + """convert the barcode to haplotag""" + # [0] = header, [1] = seq, [2] = +, [3] = qual + bc10x = fw_entry[1][:16] + bchap = barcode_dict.get(bc10x, "A00C00B00D00") + if not bchap: + bchap = "".join(next(haplotag_bc)) + barcode_dict[bc10x] = bchap + _new_fw = fw_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n" + _new_fw += fw_entry[1][16:] + "\n" + _new_fw += fw_entry[2] + "\n" + _new_fw += fw_entry[3][16:] + "\n" + _new_rv = rv_entry[0].split()[0] + f"\tOX:Z:{bc10x}\tBX:Z:{bchap}\n" + _new_rv += "\n".join(rv_entry[1:3]) + return _new_fw, _new_rv + +bc_range = [f"{i}".zfill(2) for i in range(1,97)] +bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range) + +bc_dict = {} + +# read in barcodes +opener = gzip.open if args.barcodes.lower().endswith('.gz') else open +mode = 'rt' if args.barcodes.lower().endswith('.gz') else 'r' +with opener(args.barcodes, mode) as bc_file: + for line in bc_file: + barcode = line.rstrip().split()[0] + validate_barcode(barcode) + bc_dict[barcode] = None + +# simultaneously iterate the forward and reverse fastq files +fw_out = gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6) +rv_out = gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6) + +with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i: + for fw_record, rv_record in zip_longest(iter_fastq_records(fw_i), iter_fastq_records(rv_i)): + new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator) + fw_out.write(new_fw.encode("utf-8")) + rv_out.write(new_rv.encode("utf-8")) + +fw_out.close() +rv_out.close() + +for i,j in bc_dict.items(): + if j: + sys.stdout.write(f"{i}\t{j}\n") diff --git a/harpy/bin/make_windows.py b/harpy/bin/make_windows.py index e4a61fb43..1d0548331 100755 --- a/harpy/bin/make_windows.py +++ b/harpy/bin/make_windows.py @@ -52,10 +52,7 @@ def makewindows(_contig, _c_len, windowsize): if testname.endswith("fai"): with open(args.input, "r", encoding="utf-8") as fai: - while True: - line = fai.readline() - if not line: - break + for line in fai: lsplit = line.split("\t") contig = lsplit[0] c_len = int(lsplit[1]) diff --git a/harpy/bin/molecule_coverage.py b/harpy/bin/molecule_coverage.py index 4283cbe65..711fa17c1 100755 --- a/harpy/bin/molecule_coverage.py +++ b/harpy/bin/molecule_coverage.py @@ -45,10 +45,7 @@ # read the fasta index file as a dict of contig lengths with open(args.fai, "r", encoding= "utf-8") as fai: - while True: - line = fai.readline() - if not line: - break + for line in fai: splitline = line.split() contig = splitline[0] length = splitline[1] diff --git a/harpy/bin/parse_phaseblocks.py b/harpy/bin/parse_phaseblocks.py index ce681478b..bb4d52e0f 100755 --- a/harpy/bin/parse_phaseblocks.py +++ b/harpy/bin/parse_phaseblocks.py @@ -23,10 +23,7 @@ with open(args.input, "r", encoding="utf-8") as blocks: FIRST_LINE = True - while True: - line = blocks.readline() - if not line: - break + for line in blocks: lsplit = line.split() if lsplit[0] == "BLOCK:": n_snp = int(lsplit[6]) diff --git a/harpy/simulate.py b/harpy/simulate.py index f7c36840f..fe23383c4 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -126,7 +126,7 @@ def simulate(): @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads") @click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "File of linked-read barcodes to add to reads") @click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True, help = "Standard deviation of read-pair distance") -@click.option('-m', '--molecules-per', type = click.IntRange(min = 1), default = 10, show_default=True, help = "Average number of molecules per partition") +@click.option('-m', '--molecules-per', type = click.IntRange(min = 1, max = 4700), default = 10, show_default=True, help = "Average number of molecules per partition") @click.option('-l', '--molecule-length', type = click.IntRange(min = 2), default = 100, show_default=True, help = "Mean molecule length (kbp)") @click.option('-r', '--mutation-rate', type = click.FloatRange(min = 0), default=0.001, show_default=True, help = "Random mutation rate for simulating reads") @click.option('-d', '--outer-distance', type = click.IntRange(min = 100), default = 350, show_default= True, help = "Outer distance between paired-end reads (bp)") @@ -146,12 +146,11 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r Create linked reads from a genome Two haplotype genomes (un/compressed fasta) need to be provided as inputs at the end of the command. If - you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/workflows/simulate/simulate-variants/#simulate-diploid-assembly). + you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/blog/simulate_diploid/). - If not providing a file for `--barcodes`, Harpy will download the `4M-with-alts-february-2016.txt` - file containing the standard 16-basepair 10X barcodes, which is available from 10X genomics and the - LRSIM [GitHub repository](https://github.com/aquaskyline/LRSIM/). The `--barcodes` file is - expected to have one 16-basepair barcode per line. + If not providing a file for `--barcodes`, Harpy will generate a file containing the original + (96^4) set of 24-basepair haplotagging barcodes (~2GB disk space). The `--barcodes` file is expected to have one + linked-read barcode per line, given as nucleotides. """ output_dir = output_dir.rstrip("/") workflowdir = os.path.join(output_dir, 'workflow') @@ -202,7 +201,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r start_text.add_column(header="value", justify="left") start_text.add_row("Genome Haplotype 1:", os.path.basename(genome_hap1)) start_text.add_row("Genome Haplotype 2:", os.path.basename(genome_hap2)) - start_text.add_row("Barcodes:", os.path.basename(barcodes) if barcodes else "Barcodes: 10X Default") + start_text.add_row("Barcodes:", os.path.basename(barcodes) if barcodes else "Barcodes: Haplotagging Default") start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary") diff --git a/harpy/snakefiles/simulate_linkedreads.smk b/harpy/snakefiles/simulate_linkedreads.smk index 43adf3905..d17c460ee 100644 --- a/harpy/snakefiles/simulate_linkedreads.smk +++ b/harpy/snakefiles/simulate_linkedreads.smk @@ -17,7 +17,7 @@ gen_hap1 = config["inputs"]["genome_hap1"] gen_hap2 = config["inputs"]["genome_hap2"] barcodes = config["inputs"].get("barcodes", None) envdir = os.path.join(os.getcwd(), ".harpy_envs") -barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/4M-with-alts-february-2016.txt" +barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/haplotag_barcodes.txt" rule link_1st_geno: input: @@ -50,12 +50,13 @@ rule index_genome: "samtools faidx --fai-idx {output} {input}" if not barcodes: - rule download_barcodes: + rule create_barcodes: output: barcodefile - run: - from urllib.request import urlretrieve - _ = urlretrieve("https://raw.githubusercontent.com/aquaskyline/LRSIM/master/4M-with-alts-february-2016.txt", output[0]) + container: + None + shell: + "haplotag_barcodes.py > {output}" rule create_reads: input: @@ -147,7 +148,7 @@ rule extract_reads: shell: "extractReads {input} {params} 2> {log}" -rule convert_to_haplotag: +rule demultiplex_barcodes: input: fw = outdir + "/10X/sim_hap{hap}_10x_R1_001.fastq.gz", rv = outdir + "/10X/sim_hap{hap}_10x_R2_001.fastq.gz", @@ -162,7 +163,7 @@ rule convert_to_haplotag: container: None shell: - "10xtoHaplotag.py -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params} > {log}" + "inline_to_haplotag.py -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params} > {log}" rule workflow_summary: default_target: True @@ -193,8 +194,8 @@ rule workflow_summary: lrsim = "LRSIM was started from step 3 (-u 3) with these parameters:\n" lrsim += f"\tLRSIM_harpy.pl -g genome1,genome2 -p {params.lrsproj_dir}/lrsim/sim -b BARCODES -r {params.lrsproj_dir} -i {params.lrsoutdist} -s {params.lrsdist_sd} -x {params.lrsn_pairs} -f {params.lrsmol_len} -t {params.lrsparts} -m {params.lrsmols_per} -z THREADS {params.lrsstatic}" summary.append(lrsim) - bxconvert = "10X style barcodes were converted in haplotag BX:Z tags using:\n" - bxconvert += "\t10xtoHaplotag.py" + bxconvert = "Inline barcodes were converted in haplotag BX:Z tags using:\n" + bxconvert += "\tinline_to_haplotag.py" summary.append(bxconvert) sm = "The Snakemake workflow was called via command line:\n" sm += f"\t{config['workflow_call']}" diff --git a/harpy/snakefiles/simulate_snpindel.smk b/harpy/snakefiles/simulate_snpindel.smk index f1891da3f..fe56e53b4 100644 --- a/harpy/snakefiles/simulate_snpindel.smk +++ b/harpy/snakefiles/simulate_snpindel.smk @@ -118,10 +118,7 @@ rule diploid_snps: run: rng = random.Random(randomseed) if randomseed else random.Random() with open(input[0], "r") as in_vcf, open(output[0], "w") as hap1, open(output[1], "w") as hap2: - while True: - line = in_vcf.readline() - if not line: - break + for line in in_vcf: if line.startswith("#") or rng.uniform(0, 1) >= params.het: # write header lines and homozygous variants to both files hap1.write(line) diff --git a/harpy/snakefiles/simulate_variants.smk b/harpy/snakefiles/simulate_variants.smk index fdb5dd89d..2e95bbb53 100644 --- a/harpy/snakefiles/simulate_variants.smk +++ b/harpy/snakefiles/simulate_variants.smk @@ -80,10 +80,7 @@ rule diploid_variants: run: rng = random.Random(randomseed) if randomseed else random.Random() with open(input[0], "r") as in_vcf, open(output[0], "w") as hap1, open(output[1], "w") as hap2: - while True: - line = in_vcf.readline() - if not line: - break + for line in in_vcf: if line.startswith("#") or rng.uniform(0, 1) >= params.het: # write header lines and homozygous variants to both files hap1.write(line) diff --git a/harpy/snakefiles/snp_freebayes.smk b/harpy/snakefiles/snp_freebayes.smk index 8ba9cacd0..e315a53b6 100644 --- a/harpy/snakefiles/snp_freebayes.smk +++ b/harpy/snakefiles/snp_freebayes.smk @@ -39,10 +39,7 @@ if regiontype == "region": else: with open(regioninput, "r") as reg_in: intervals = set() - while True: - line = reg_in.readline() - if not line: - break + for line in reg_in: cont,startpos,endpos = line.split() intervals.add(f"{cont}:{startpos}-{endpos}") regions = dict(zip(intervals, intervals)) diff --git a/harpy/snakefiles/snp_mpileup.smk b/harpy/snakefiles/snp_mpileup.smk index 93c1a67ef..b6de042a5 100644 --- a/harpy/snakefiles/snp_mpileup.smk +++ b/harpy/snakefiles/snp_mpileup.smk @@ -35,10 +35,7 @@ if regiontype == "region": else: with open(regioninput, "r") as reg_in: intervals = set() - while True: - line = reg_in.readline() - if not line: - break + for line in reg_in: cont,startpos,endpos = line.split() intervals.add(f"{cont}:{startpos}-{endpos}") regions = dict(zip(intervals, intervals))