From b0aa3bc343c85369889a52797595db366b06f190 Mon Sep 17 00:00:00 2001 From: Georgios Date: Wed, 16 Oct 2019 09:30:49 +0200 Subject: [PATCH] initial files --- aladin.py | 228 +++++++++++++++++++++++++++++++++++++++++++++ corrector.pl | 158 +++++++++++++++++++++++++++++++ depot/AladinFun.py | 110 ++++++++++++++++++++++ detection.py | 100 ++++++++++++++++++++ extraction.py | 51 ++++++++++ fragmenter.py | 56 +++++++++++ 6 files changed, 703 insertions(+) create mode 100755 aladin.py create mode 100755 corrector.pl create mode 100644 depot/AladinFun.py create mode 100755 detection.py create mode 100755 extraction.py create mode 100755 fragmenter.py diff --git a/aladin.py b/aladin.py new file mode 100755 index 0000000..89cf520 --- /dev/null +++ b/aladin.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 + +""" +aladin.py + Usage: + ./aladin.py -r -i [-o ] [-d ] [-m ] [-l ] [-f ] [-s ] [-t ] [--cleanup] + + Options: + -h, --help show options + -r, --reference reference input + -i, --reads Reads input + -o, --dir_output creates a directory for all output files [default: results] + -d, --data_format (N) Nanopore or (P) Pacbio [default: N] + -m, --mode (M) Mitochondion or (C) Chloroplast [default: M] + -l, --length break reads into chunks of this length [default: 4000] + -f, --fraction fraction of length at which the end of the sequence gets split into a new sequence [default: 0.1] + -s, --genome_size set expected size in kb [default: 20] + -t, --threads minimap2 threads [default: 3] + --cleanup remove intermediate files + --version print version +""" + +import os +import re +import shutil +import subprocess +from Bio import SeqIO +from docopt import docopt +from depot.AladinFun import * +from detection import return_pos +from extraction import extract_reads +from fragmenter import fragment_fasta + +def run_nucmer(assembly_file, contig): + FERR = open("ALADIN.err", 'a') + coords_file = "nucmer/nucmer.coords" + delta_file = "nucmer/nucmer.delta" + coords_out = open(coords_file, 'w') + subprocess.call(["nucmer", "--maxmatch", "-c", "100", "--delta", + delta_file, assembly_file, assembly_file], + stdout=FERR, stderr=FERR) + subprocess.call(["show-coords", "-B", "-r", + delta_file], + stdout=coords_out, stderr=FERR) + coords_file = open_files(coords_file) + length_coords = [] + contigs = [] + trim_list = [] + for line in coords_file: + elems = line.split("\t") + q_contig_name = elems[0] + contig_length = int(elems[2]) + s_contig_name = elems[5] + q_start = int(elems[6]) + q_end = int(elems[7]) + s_start = int(elems[8]) + s_end = int(elems[9]) + strand = elems[17] + #check that the other hit is close to the end + if q_contig_name == s_contig_name and strand == "Plus" and q_contig_name == contig: + if q_start < 5 and s_start > 1000 and (s_end + 100) > contig_length: + if q_end not in trim_list: + length_coords.append(q_end) + contigs.append(q_contig_name) + trim_list.append(q_end) + elif s_start < 5 and q_start > 1000 and (q_end + 100) > contig_length: + if s_end not in trim_list: + length_coords.append(s_end) + contigs.append(q_contig_name) + trim_list.append(s_end) + return (contigs, length_coords, len(length_coords)) + +def run_pipeline(data_format, mode, reference, reads, genome_size, length, fraction, threads): + # Directory structure + get_outdir("map") + get_outdir("nucmer") + + # File naming + FERR = open("ALADIN.err", 'w') + sam_file = "map/aln.sam" + extract_reads_file = "map/read_ids.txt.fasta" + extract_reads_frag_file = "map/read_ids.txt.fasta.frag.fasta" + assembly_file = "canu/extract.contigs.fasta" + aragorn_file = "canu/aragorn.txt" + gfa_file = "canu/extract.contigs.gfa" + + #Set data format + if data_format == "N": + format_mini = "map-ont" + format_canu = "-nanopore-raw" + elif data_format == "P": + format_mini = "map-pb" + format_canu = "-pacbio-raw" + + # Map reads + print(status_d['0']) + sam_out = open(sam_file, 'w') + subprocess.call(["minimap2", "--sam-hit-only", "-t", threads, "--secondary=no", "-ax", + format_mini, reference, reads], + stdout=sam_out, stderr=FERR) + fichier_vide(sam_file) + + # Extract and fragment reads + print(status_d['1']) + read_ids = [] + sam_input = open_files(sam_file) + for line in sam_input: + elems = line.split("\t") + if not line.startswith("@") and str(elems[4]) == "60": + read_ids.append(elems[0]) + + read_ids = list(set(read_ids)) + extract_reads(reads, read_ids, extract_reads_file) + fragment_fasta(extract_reads_file, length, fraction) + + # Run canu + print(status_d['2']) + subprocess.call(["canu", "-d", "canu", "-p", "extract", genome_size, "corOutCoverage=200", "correctedErrorRate=0.15", + format_canu, extract_reads_frag_file], + stdout=FERR, stderr=FERR) + fichier_vide(assembly_file) + + print(status_d['3']) + + # Run aragorn to select contig + aragorn_out = open(aragorn_file, 'w') + if mode == "M": + subprocess.call(["aragorn", "-l", "-gc5", "-mt", "-w", assembly_file], + stdout=aragorn_out, stderr=FERR) + else: + subprocess.call(["aragorn", "-l", "-gc1", "-m", "-t", "-w", assembly_file], + stdout=aragorn_out, stderr=FERR) + aragorn_input = open_files(aragorn_file) + trna_contigs = [] + trna_genes = [] + for line in aragorn_input: + if line.startswith(">") and not line.startswith(">end"): + elem = line.split(" ") + header = elem[0].replace(">","") + trna_contigs.append(header) + elif "genes found" in line: + elem = line.split(" ") + num_trnas = int(elem[0]) + trna_genes.append(num_trnas) + aragorn_contig = trna_contigs[trna_genes.index(max(trna_genes))] + + print("[!] Found contig " + str(aragorn_contig) + " with " + str(max(trna_genes)) + " mtRNAs") + + num_circ = 0 + length_canu = 0 + trim_len = 0 + assembly = {} + + # Check canu output + for record in SeqIO.parse(assembly_file, "fasta"): + elem = str(record.id).split(" ") + if re.search(aragorn_contig,str(record.id)): + if re.search("suggestCircular=yes", str(record.description)): + assembly[elem[0]] = str(record.seq) + num_circ = 1 + + contigs, length_coords, nucmer_cases = run_nucmer(assembly_file, aragorn_contig) + mito_contig = aragorn_contig + + if num_circ == 1: + length_canu = return_pos(gfa_file, aragorn_contig) + trim_len = length_canu + if str(nucmer_cases) == "0": + print(warn_d['2'] % trim_len) + elif str(nucmer_cases) == "1": + abs_trim = abs(int(length_canu) - int(length_coords[0])) + if abs_trim < 11: + print(status_d['4'] % (trim_len, length_coords[0])) + else: + print(warn_d['1'] % abs_trim) + else: + print(warn_d['3']) + else: + print(warn_d['0']) + if str(nucmer_cases) == "0": + warn_d('4') + elif str(nucmer_cases) == "1": + trim_len = length_coords[0] + print(status_d['5'] % trim_len) + else: + warn_d('5') + + circ_file = open("Canu_trimmed.fasta", 'w') + circ_file.write(">" + mito_contig + "\n") + circ_file.write(assembly[mito_contig][int(trim_len):] + "\n") + circ_file.close() + + FERR.close() + + +############### Main ############### + +def main(): + args = docopt(__doc__, version='1.01') + reference = os.path.abspath(str(args['--reference'])) + reads = os.path.abspath(str(args['--reads'])) + dir_output = get_outdir(args['--dir_output']) + length = int(args['--length']) + fraction = float(args['--fraction']) + threads = str(args['--threads']) + cleanup = args['--cleanup'] + data_format = str(args['--data_format']) + mode = str(args['--mode']) + genome_size = int(args['--genome_size']) + genome_size = "genomeSize=" + str(genome_size) + "k" + + os.chdir(dir_output) + + check_programs("minimap2", "canu", "nucmer", "show-coords", "aragorn") + + run_pipeline(data_format, mode, reference, reads, genome_size, length, fraction, threads) + + if cleanup: + shutil.rmtree("canu") + shutil.rmtree("map") + shutil.rmtree("nucmer") + + print(status_d['6']) + resultats_dir = os.getcwd() + print("Results be in "+resultats_dir+"/Canu_trimmed.fasta") + +if __name__ == "__main__": + main() diff --git a/corrector.pl b/corrector.pl new file mode 100755 index 0000000..a2aecf8 --- /dev/null +++ b/corrector.pl @@ -0,0 +1,158 @@ +#!/usr/bin/env perl + +# To do +# Multi-fasta +# Parallel +### + +use strict; +use warnings; +use Bio::SeqIO; +use Data::Dumper; +use List::Util 'sum'; +use Getopt::Long qw(:config pass_through no_ignore_case no_auto_abbrev); + +my ($samfile,$fastafile,$prefix) = ("","","corrected"); + +GetOptions ( + "s|samfile:s" => \$samfile, + "f|fastafile:s" => \$fastafile, + "p|prefix:s" => \$prefix, +); + +if (not $fastafile or not $samfile) { + print STDERR "Usage: + -s|samfile + -f|fastafile + -p|prefix +"; +exit; +} + +my @ref_bases; +my $header; +my $seqio = Bio::SeqIO->new(-file => $fastafile, '-format' => 'Fasta'); +while(my $seq = $seqio->next_seq) { + my $string = $seq->seq; + $header = $seq->id; + @ref_bases = split("",$string); +} + +open IN, "$samfile"; +my %ref_positions; +my %ins_positions; +my $i = 0; +my $k = 0; + +while (my $line=) { + next if $line=~/^\@/; + my @p =split(/\t/,$line); + my $read_name = $p[0]; + my $sam_flag = $p[1]; + my $read_start = $p[3]; + my $cigar = $p[5]; + my $read = $p[9]; + if ($sam_flag != 0 && $sam_flag != 16){ + next; + } #avoid multimapping + + # Parse CIGAR string + my @type_counts = split(/\D+/,$cigar); + my @types = split(/\d+/,$cigar); + shift @types; + $i = 0; + my @cs; + while ($i$prefix.fasta"; +open CORRLST, ">$prefix.lst"; + +my $len = scalar(@ref_bases)+1; + +my $final_ref = ""; +$i = 1; +while ($i<$len) { + + my $nocov = 0; + unless (defined($ref_positions{$i})) {$ref_positions{$i}{$ref_bases[$i-1]}=1;print CORRLST "[N] at position $i\n";$nocov=1;} # No LR cov + + my @bases = (sort {$ref_positions{$i}{$b} <=> $ref_positions{$i}{$a}} keys %{$ref_positions{$i}})[0..1]; + my $base = $bases[0]; + my $sbase = $bases[1]; + my $total_match = sum values %{$ref_positions{$i}}; + + unless (defined($insertions{$i})) {$insertions{$i}=0;} # No insertions at the position + my $perc = int(($insertions{$i}/$total_match)*100); + #if ($insertions{$i}>0) {print CORRLST "pos $i $total_match:" . Dumper($ins_base{$i});} + if ($perc > 50) { + my $max_value = (sort {$ins_base{$i}{$b} <=> $ins_base{$i}{$a}} keys %{$ins_base{$i}})[0]; + $final_ref .= uc($max_value); + print CORRLST "[I] at position $i: $max_value\n"; + } + + # Check that deletion is the most probable + if ($base eq "-") { + if (int(($ref_positions{$i}{$base}/$total_match)*100) > 50) { + $base = ""; + print CORRLST "[D] at position $i\n"; + } + else { + $base = uc($sbase); + } + } + + # Write the substitution output + if ($base ne $ref_bases[$i-1] && $base ne "") { + print CORRLST "[S] at position $i: $ref_bases[$i-1] to $base\n"; + } + # Lowercase if no coverage + if ($nocov) {$final_ref .= lc($base);} + else {$final_ref .= uc($base);} + $i++; + + +} + +print CORRFASTA ">$header\n$final_ref\n"; diff --git a/depot/AladinFun.py b/depot/AladinFun.py new file mode 100644 index 0000000..5cd53a6 --- /dev/null +++ b/depot/AladinFun.py @@ -0,0 +1,110 @@ +import sys +import os +import shutil +import gzip +from Bio import SeqIO + +# Reporting +def error(message, *argv): + if argv is None: + sys.exit(error_d[message]) + else: + sys.exit(error_d[message] % (argv)) + +error_d = { + '0': '[ERROR:0]\t: File %s does not exist.', + '1': '[ERROR:1]\t: File %s is empty.', + '2': '[ERROR:2]\t: %s not found! Please install and add to the PATH variable.' +} + +status_d = { + '0': '[+] Mapping reads with minimap2.', + '1': '[+] Extracting reads.', + '2': '[+] Running Canu.', + '3': '[+] Finding the mitochondrion.', + '4': '[+] Canu and nucmer agree (%s, %s) . Using canu gfa to trim.', + '5': '[+] Using nucmer coordinates to trim (%s bases).', + '6': '[+] Pipeline finished.' +} + +warn_d = { + '0': '[-] No circular tag present, or non-informative gfa in Canu assembly. Will attempt to find the circular contig with nucmer.', + '1': '[-] Using canu gfa to trim despite %s bases difference between canu and nucmer. Validate your results', + '2': '[-] Using canu gfa to trim (%s bases) despite nucmer not finding any case of circularisation. Validate your results', + '3': '[-] Using canu gfa to trim despite nucmer finding multiple cases of circularisation. Validate your results', + '4': '[-] Nor Canu nor nucmer found any circular contig.', + '5': '[-] No circular tag in canu while nucmer found multiple cases of circularisation.' +} + + +# Check if empty or does not exist +def fichier_vide(fichier): + if os.path.isfile(fichier) is True: + if os.path.getsize(fichier) <= 200: + error('1', fichier) + else: + error('0', fichier) + +# Check if program is in path + +def check_programs(*arg): + error_list = [] + for program in arg: + if which(program) is False: + error_list.append(program) + if error_list: + programs = ', '.join(error_list) + error('2', programs) + +def which(program): + if shutil.which(program): + return program + else: + return False + +# Directory checking + +def get_outdir(out_directory, add_dir=""): + """generates output directory in case it does not exist.""" + if type(out_directory) != str: + print("\t[!] {} is NOT a directory! Please specify an output directory".format(out_directory)) + sys.exit() + elif os.path.isfile(out_directory): + print("\t[!] {} is a File! Please specify an output directory".format(out_directory)) + sys.exit() + elif not os.path.exists(os.path.join(out_directory, add_dir)): + os.mkdir(os.path.join(out_directory, add_dir)) + return os.path.abspath(os.path.join(out_directory, add_dir)) + else: + return os.path.abspath(os.path.join(out_directory, add_dir)) + +def check_indir(input_dir): + if not os.path.exists(input_dir): + print("\t[!] FATAL ERROR: '{}' directory not found".format(input_dir)) + sys.exit() + elif not os.path.isdir(input_dir): + print("\t[!] FATAL ERROR: '{}' is not a directory".format(input_dir)) + sys.exit() + else: + return input_dir + +# Can open gzip files + +def open_files(fname): + if fname.endswith('.gz'): + return gzip.open(fname, 'rt', encoding="latin-1") + else: + return open(fname, 'rt', encoding="latin-1") + +# Check if fasta + +def is_fasta(filename): + with open_files(filename) as handle: + if handle.readline().startswith(">"): + return True + +def return_format(filename): + if is_fasta(filename): + return "fasta" + else: + return "fastq" diff --git a/detection.py b/detection.py new file mode 100755 index 0000000..f28b9f8 --- /dev/null +++ b/detection.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +""" +detection.py + Usage: + ./detection.py -g -f -o -i + + Options: + -h, --help show options + -g, --gfa_file gfa file + -f, --fasta_file fasta file + -o, --output_file output file + -i, --contig_id contig id + --version print version +""" + +import re +from docopt import docopt + + +def forma_cigar(cigar): + cigar = cigar.rstrip() + type_counts = re.split("\D+", cigar) + types = re.split("\d+", cigar) + types.pop(0) + length_cigar = 0 + for c, t in zip(type_counts, types): + if t != "I": + length_cigar += int(c) + return length_cigar + +def forma_cigar_minus(cigar): + cigar = cigar.rstrip() + type_counts = re.split("\D+", cigar) + types = re.split("\d+", cigar) + types.pop(0) + length_cigar = 0 + for c, t in zip(type_counts, types): + if t != "D": + length_cigar += int(c) + return length_cigar + +def return_pos(file_gfa, contig): + + file_opened = open(file_gfa, "r") + cigar_plus = "" + length_cigar = 0 + + for line in file_opened: + if line.startswith("L"): + elem = line.split("\t") + if elem[1] == contig and elem[1] == elem[3] and elem[2] == "+" and elem[4] == "+": + cigar_plus = elem[5] + length_cigar = forma_cigar(cigar_plus) + file_opened.close() + + file_opened = open(file_gfa, "r") + if length_cigar == 0: + for line in file_opened: + if line.startswith("L"): + elem = line.split("\t") + if elem[1] == contig and elem[1] == elem[3] and elem[2] == "-" and elem[4] == "-": + cigar_minus = elem[5] + length_cigar = forma_cigar_minus(cigar_minus) + + file_opened.close() + length_plus = length_cigar + return length_plus + +############### Main ############### + +def main(): + args = docopt(__doc__, version='1.00') + file_gfa = str(args['--gfa_file']) + file_fasta = str(args['--fasta_file']) + file_output = str(args['--output_file']) + contig = str(args['--contig_id']) + + length_plus = return_pos(file_gfa, contig) + sequence = "" + + file_seq = open(file_fasta, "r") + + for line in file_seq: + line = line.rstrip("\n") + if line.startswith(">"): + header = line + else: + sequence += line + + file_seq.close() + + file_new_seq = open(file_output, 'w') + file_new_seq.write(header + "\n") + file_new_seq.write(sequence[length_plus:] + "\n") + file_new_seq.close() + + +if __name__ == "__main__": + main() diff --git a/extraction.py b/extraction.py new file mode 100755 index 0000000..7830afd --- /dev/null +++ b/extraction.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +""" +extraction.py + Usage: + ./extraction.py -r -i + + Options: + -h, --help show options + -r, --reads reads file + -i, --ids read ids + --version print version +""" + +from Bio import SeqIO +from docopt import docopt +from depot.AladinFun import open_files, return_format + + +def extract_reads(reads, read_ids, outfile): + + file_in = open_files(reads) + file_out = open(outfile, 'w') + + for record in SeqIO.parse(file_in, return_format(reads)): + if record.id in read_ids: + file_out.write(">" + str(record.id) + "\n") + file_out.write(str(record.seq) + "\n") + + file_in.close() + file_out.close() + +############### Main ############### +def main(): + args = docopt(__doc__, version='1.00') + reads = str(args['--reads']) + ids = str(args['--ids']) + + file_list = open_files(ids) + + read_ids = [] + for line in file_list: + line = line.rstrip("\n") + read_ids.append(line) + + file_list.close() + outfile = "read_ids.txt.fasta" + extract_reads(reads, read_ids, outfile) + +if __name__ == "__main__": + main() diff --git a/fragmenter.py b/fragmenter.py new file mode 100755 index 0000000..22da529 --- /dev/null +++ b/fragmenter.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +""" +fragmenter.py + Usage: + ./fragmenter.py -i [-l ] [-f ] + + Options: + -h, --help show options + -i, --input_file file to fragment + -l, --length split reads into chunks of this length [default: 4000] + -f, --fraction fraction of length at which the end of the sequence gets split into a new sequence [default: 0.1] + --version print version +""" + +from Bio import SeqIO +from docopt import docopt +from depot.AladinFun import open_files, return_format + +def fragment_fasta(input_file, length_frag, fraction_select): + file_in = open_files(input_file) + + outfile = input_file+".frag.fasta" + file_out = open(outfile, 'w') + + for record in SeqIO.parse(file_in, return_format(input_file)): + name = record.id + seq = str(record.seq) + if length_frag < 1: + file_out.write(">" + name + "\n") + file_out.write(seq + "\n") + else: + subseqs = [seq[i:i+length_frag] for i in range(0, len(seq), length_frag)] + if len(subseqs) > 1 and len(subseqs[-1]) < (fraction_select*length_frag): + toadd = subseqs.pop() + subseqs[-1] += toadd + i = 0 + for subseq in subseqs: + file_out.write(">" + name + "_" + str(i) + "\n") + file_out.write(subseq + "\n") + i += 1 + + file_in.close() + file_out.close() + +############### Main ############### +def main(): + args = docopt(__doc__, version='1.00') + input_file = str(args['--input_file']) + length_frag = int(args['--length']) + fraction_select = float(args['--fraction']) + + fragment_fasta(input_file, length_frag, fraction_select) + +if __name__ == "__main__": + main()