-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #162 from pdimens/haplotag_lrsims
haplotagging barcodes as default
- Loading branch information
Showing
13 changed files
with
149 additions
and
153 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#! /usr/bin/env python | ||
"""Generates a text file listing the haplotagging ACBD barcodes""" | ||
import sys | ||
import argparse | ||
from itertools import product | ||
|
||
# Command-line interface. No options are registered on the parser and the
# parsed namespace is not consulted by the code visible in this script.
_cli_settings = dict(
    prog="haplotag_barcodes.py",
    description="Generates a text file listing the haplotagging ACBD barcodes",
    usage="haplotag_barcodes.py > barcodes.txt",
)
parser = argparse.ArgumentParser(**_cli_settings)

args = parser.parse_args()
|
||
# Barcode sequences for the haplotagging segments, keyed by segment letter.
# Each value is a list of six-nucleotide strings. The keys are inserted in the
# order A, C, B, D, matching the "ACBD" naming in the module docstring.
BX = {
    "A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"],
    "C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"],
    "B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"],
    "D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"]
}
|
||
# Lazily enumerate every A x C x B x D segment combination and stream each
# concatenated barcode to stdout, one per line.
bc_generator = product(*(BX[segment] for segment in "ACBD"))
sys.stdout.writelines("".join(segments) + "\n" for segments in bc_generator)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#! /usr/bin/env python | ||
"""Convert inline barcodes into haplotag style ones""" | ||
import os | ||
import sys | ||
import gzip | ||
import argparse | ||
from itertools import zip_longest, product | ||
|
||
# Command-line interface: two gzipped FASTQ inputs, a barcode list and an
# output prefix, all of them required.
parser = argparse.ArgumentParser(
    prog='inline_to_haplotag.py',
    description='Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z).',
    usage="inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix> > barcodes.conversion.txt",
    exit_on_error=False,
)

parser.add_argument(
    "-f", "--forward", required=True, type=str,
    help="Forward reads of paired-end FASTQ file pair (gzipped)",
)
parser.add_argument(
    "-r", "--reverse", required=True, type=str,
    help="Reverse reads of paired-end FASTQ file pair (gzipped)",
)
parser.add_argument(
    "-p", "--prefix", required=True, type=str,
    help="Prefix for outfile FASTQ files (e.g. <prefix>.R1.fq.gz)",
)
parser.add_argument(
    "-b", "--barcodes", required=True, type=str,
    help="File listing the linked-read barcodes to convert to haplotag format, one barcode per line",
)

# Invoked with no arguments at all: show the help text and exit non-zero.
if not sys.argv[1:]:
    parser.print_help(sys.stderr)
    sys.exit(1)

args = parser.parse_args()

# Collect every input path that does not exist so they are reported together.
err = [
    path
    for path in (args.forward, args.reverse, args.barcodes)
    if not os.path.exists(path)
]
if err:
    parser.error("Some input files were not found on the system:\n" + ", ".join(err))
|
||
def iter_fastq_records(file_handle):
    """Iterate over FASTQ records in a file.

    Lines are consumed four at a time and each group is checked for the
    FASTQ markers before being yielded.

    Args:
        file_handle: Iterable of lines, typically an opened gzip file handle.
            Lines may be ``bytes`` (binary mode) or ``str`` (text mode).

    Yields:
        list[str]: ``[header, seq, separator, qual]`` with line endings
        (``\\n`` or ``\\r\\n``) removed.

    Raises:
        ValueError: If a record's first line does not start with "@" or its
            third line does not start with "+", or if the input ends partway
            through a record.
    """
    record = []
    for line in file_handle:
        if isinstance(line, bytes):
            line = line.decode()
        # Strip "\r" as well so CRLF files do not leave a stray carriage
        # return on every field.
        record.append(line.rstrip("\r\n"))
        if len(record) == 4:
            # Format sanity check. The separator line may legally repeat the
            # read identifier after the "+", so only its first character is
            # tested.
            if not (record[0].startswith("@") and record[2].startswith("+")):
                raise ValueError("Invalid FASTQ format")
            yield record
            record = []
    if record:
        raise ValueError("Incomplete FASTQ record at end of file")
|
||
def validate_barcode(barcode):
    """Validate barcode format (A,C,G,T).

    Args:
        barcode: Barcode string to check.

    Returns:
        None. The function only signals failure by raising.

    Raises:
        ValueError: If the barcode is empty or contains any character other
            than uppercase A, C, G or T.
    """
    # An empty string is vacuously a subset of the alphabet, so it has to be
    # rejected explicitly.
    if not barcode or not set(barcode).issubset({'A', 'C', 'G', 'T'}):
        raise ValueError(f"Invalid barcode format: {barcode}. Barcodes must be capital letters and only contain standard nucleotide values ATCG.")
|
||
def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc, bc_len=16):
    """Convert the inline barcode of a read pair to a haplotag barcode.

    The first ``bc_len`` bases of the forward read are taken as the inline
    barcode. That prefix is trimmed from the forward sequence and quality
    strings and recorded in both read headers as ``OX:Z``, alongside the
    assigned haplotag barcode as ``BX:Z``.

    Args:
        fw_entry: Forward FASTQ record as ``[header, seq, '+', qual]``.
        rv_entry: Reverse FASTQ record as ``[header, seq, '+', qual]``.
        barcode_dict: Maps each known inline barcode to its haplotag barcode,
            or to ``None`` if none has been assigned yet. Updated in place
            when a new assignment is made.
        haplotag_bc: Iterator yielding tuples of haplotag barcode segments;
            advanced once per newly assigned barcode.
        bc_len: Length of the inline barcode at the start of the forward
            read. Defaults to 16.

    Returns:
        tuple[str, str]: The rewritten forward and reverse records, each a
        complete four-line FASTQ entry ending in a newline.
    """
    # [0] = header, [1] = seq, [2] = +, [3] = qual
    bc10x = fw_entry[1][:bc_len]
    # Barcodes absent from the dict get the "A00C00B00D00" placeholder; listed
    # barcodes still mapped to None get the next unused haplotag barcode.
    bchap = barcode_dict.get(bc10x, "A00C00B00D00")
    if not bchap:
        bchap = "".join(next(haplotag_bc))
        barcode_dict[bc10x] = bchap
    tags = f"\tOX:Z:{bc10x}\tBX:Z:{bchap}"
    _new_fw = "\n".join((
        fw_entry[0].split()[0] + tags,
        fw_entry[1][bc_len:],
        fw_entry[2],
        fw_entry[3][bc_len:],
    )) + "\n"
    # The reverse read carries no inline barcode, so its sequence, separator
    # and quality lines are emitted unchanged. All three are required, plus
    # the trailing newline, or the next record fuses onto this one.
    _new_rv = "\n".join((
        rv_entry[0].split()[0] + tags,
        rv_entry[1],
        rv_entry[2],
        rv_entry[3],
    )) + "\n"
    return _new_fw, _new_rv
|
||
# Zero-padded two-digit segment indices, "01" through "96".
bc_range = [f"{index:02d}" for index in range(1, 97)]
# Lazy stream of (letter, index, letter, index, ...) tuples in A, C, B, D
# order; joining a tuple gives one barcode string such as "A01C01B01D01".
bc_generator = product(
    *(part for letter in "ACBD" for part in (letter, bc_range))
)

# Inline barcode -> assigned haplotag barcode; filled in further below.
bc_dict = dict()
|
||
# read in barcodes
# The barcode list may be plain text or gzipped; the opener and mode are
# chosen from the file extension.
opener = gzip.open if args.barcodes.lower().endswith('.gz') else open
mode = 'rt' if args.barcodes.lower().endswith('.gz') else 'r'
with opener(args.barcodes, mode) as bc_file:
    for line in bc_file:
        fields = line.split()
        # Skip blank or whitespace-only lines (e.g. a trailing newline at the
        # end of the file) rather than failing on an empty split.
        if not fields:
            continue
        # Only the first whitespace-delimited column is used as the barcode.
        barcode = fields[0]
        validate_barcode(barcode)
        # None marks a listed barcode with no haplotag barcode assigned yet.
        bc_dict[barcode] = None
|
||
# simultaneously iterate the forward and reverse fastq files
# All four handles are managed by `with`, so the gzip outputs are closed even
# if a record fails validation partway through.
with gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6) as fw_out, \
        gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6) as rv_out:
    with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i:
        for fw_record, rv_record in zip_longest(iter_fastq_records(fw_i), iter_fastq_records(rv_i)):
            # zip_longest pads the shorter input with None; report that as a
            # pairing error instead of letting it fail inside process_record.
            if fw_record is None or rv_record is None:
                raise ValueError("Forward and reverse FASTQ files contain different numbers of records")
            new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator)
            fw_out.write(new_fw.encode("utf-8"))
            rv_out.write(new_rv.encode("utf-8"))
|
||
# Emit the inline -> haplotag conversion table on stdout, tab-separated, for
# the barcodes that were actually assigned a haplotag barcode.
sys.stdout.writelines(
    f"{inline_bc}\t{haplotag}\n"
    for inline_bc, haplotag in bc_dict.items()
    if haplotag
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.