Skip to content

Commit

Permalink
initial files
Browse files Browse the repository at this point in the history
  • Loading branch information
GDKO committed Oct 16, 2019
1 parent f0c8075 commit b0aa3bc
Show file tree
Hide file tree
Showing 6 changed files with 703 additions and 0 deletions.
228 changes: 228 additions & 0 deletions aladin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
#!/usr/bin/env python3

"""
aladin.py
Usage:
./aladin.py -r <FILE> -i <FILE> [-o <DIR>] [-d <STR>] [-m <STR>] [-l <INT>] [-f <FLOAT>] [-s <STR>] [-t <INT>] [--cleanup]
Options:
-h, --help show options
-r, --reference <FILE> reference input
-i, --reads <FILE> Reads input
-o, --dir_output <DIR> creates a directory for all output files [default: results]
-d, --data_format <STR> (N) Nanopore or (P) Pacbio [default: N]
-m, --mode <STR> (M) Mitochondion or (C) Chloroplast [default: M]
-l, --length <INT> break reads into chunks of this length [default: 4000]
-f, --fraction <FLOAT> fraction of length at which the end of the sequence gets split into a new sequence [default: 0.1]
-s, --genome_size <INT> set expected size in kb [default: 20]
-t, --threads <INT> minimap2 threads [default: 3]
--cleanup remove intermediate files
--version print version
"""

import os
import re
import shutil
import subprocess
from Bio import SeqIO
from docopt import docopt
from depot.AladinFun import *
from detection import return_pos
from extraction import extract_reads
from fragmenter import fragment_fasta

def run_nucmer(assembly_file, contig):
FERR = open("ALADIN.err", 'a')
coords_file = "nucmer/nucmer.coords"
delta_file = "nucmer/nucmer.delta"
coords_out = open(coords_file, 'w')
subprocess.call(["nucmer", "--maxmatch", "-c", "100", "--delta",
delta_file, assembly_file, assembly_file],
stdout=FERR, stderr=FERR)
subprocess.call(["show-coords", "-B", "-r",
delta_file],
stdout=coords_out, stderr=FERR)
coords_file = open_files(coords_file)
length_coords = []
contigs = []
trim_list = []
for line in coords_file:
elems = line.split("\t")
q_contig_name = elems[0]
contig_length = int(elems[2])
s_contig_name = elems[5]
q_start = int(elems[6])
q_end = int(elems[7])
s_start = int(elems[8])
s_end = int(elems[9])
strand = elems[17]
#check that the other hit is close to the end
if q_contig_name == s_contig_name and strand == "Plus" and q_contig_name == contig:
if q_start < 5 and s_start > 1000 and (s_end + 100) > contig_length:
if q_end not in trim_list:
length_coords.append(q_end)
contigs.append(q_contig_name)
trim_list.append(q_end)
elif s_start < 5 and q_start > 1000 and (q_end + 100) > contig_length:
if s_end not in trim_list:
length_coords.append(s_end)
contigs.append(q_contig_name)
trim_list.append(s_end)
return (contigs, length_coords, len(length_coords))

def run_pipeline(data_format, mode, reference, reads, genome_size, length, fraction, threads):
# Directory structure
get_outdir("map")
get_outdir("nucmer")

# File naming
FERR = open("ALADIN.err", 'w')
sam_file = "map/aln.sam"
extract_reads_file = "map/read_ids.txt.fasta"
extract_reads_frag_file = "map/read_ids.txt.fasta.frag.fasta"
assembly_file = "canu/extract.contigs.fasta"
aragorn_file = "canu/aragorn.txt"
gfa_file = "canu/extract.contigs.gfa"

#Set data format
if data_format == "N":
format_mini = "map-ont"
format_canu = "-nanopore-raw"
elif data_format == "P":
format_mini = "map-pb"
format_canu = "-pacbio-raw"

# Map reads
print(status_d['0'])
sam_out = open(sam_file, 'w')
subprocess.call(["minimap2", "--sam-hit-only", "-t", threads, "--secondary=no", "-ax",
format_mini, reference, reads],
stdout=sam_out, stderr=FERR)
fichier_vide(sam_file)

# Extract and fragment reads
print(status_d['1'])
read_ids = []
sam_input = open_files(sam_file)
for line in sam_input:
elems = line.split("\t")
if not line.startswith("@") and str(elems[4]) == "60":
read_ids.append(elems[0])

read_ids = list(set(read_ids))
extract_reads(reads, read_ids, extract_reads_file)
fragment_fasta(extract_reads_file, length, fraction)

# Run canu
print(status_d['2'])
subprocess.call(["canu", "-d", "canu", "-p", "extract", genome_size, "corOutCoverage=200", "correctedErrorRate=0.15",
format_canu, extract_reads_frag_file],
stdout=FERR, stderr=FERR)
fichier_vide(assembly_file)

print(status_d['3'])

# Run aragorn to select contig
aragorn_out = open(aragorn_file, 'w')
if mode == "M":
subprocess.call(["aragorn", "-l", "-gc5", "-mt", "-w", assembly_file],
stdout=aragorn_out, stderr=FERR)
else:
subprocess.call(["aragorn", "-l", "-gc1", "-m", "-t", "-w", assembly_file],
stdout=aragorn_out, stderr=FERR)
aragorn_input = open_files(aragorn_file)
trna_contigs = []
trna_genes = []
for line in aragorn_input:
if line.startswith(">") and not line.startswith(">end"):
elem = line.split(" ")
header = elem[0].replace(">","")
trna_contigs.append(header)
elif "genes found" in line:
elem = line.split(" ")
num_trnas = int(elem[0])
trna_genes.append(num_trnas)
aragorn_contig = trna_contigs[trna_genes.index(max(trna_genes))]

print("[!] Found contig " + str(aragorn_contig) + " with " + str(max(trna_genes)) + " mtRNAs")

num_circ = 0
length_canu = 0
trim_len = 0
assembly = {}

# Check canu output
for record in SeqIO.parse(assembly_file, "fasta"):
elem = str(record.id).split(" ")
if re.search(aragorn_contig,str(record.id)):
if re.search("suggestCircular=yes", str(record.description)):
assembly[elem[0]] = str(record.seq)
num_circ = 1

contigs, length_coords, nucmer_cases = run_nucmer(assembly_file, aragorn_contig)
mito_contig = aragorn_contig

if num_circ == 1:
length_canu = return_pos(gfa_file, aragorn_contig)
trim_len = length_canu
if str(nucmer_cases) == "0":
print(warn_d['2'] % trim_len)
elif str(nucmer_cases) == "1":
abs_trim = abs(int(length_canu) - int(length_coords[0]))
if abs_trim < 11:
print(status_d['4'] % (trim_len, length_coords[0]))
else:
print(warn_d['1'] % abs_trim)
else:
print(warn_d['3'])
else:
print(warn_d['0'])
if str(nucmer_cases) == "0":
warn_d('4')
elif str(nucmer_cases) == "1":
trim_len = length_coords[0]
print(status_d['5'] % trim_len)
else:
warn_d('5')

circ_file = open("Canu_trimmed.fasta", 'w')
circ_file.write(">" + mito_contig + "\n")
circ_file.write(assembly[mito_contig][int(trim_len):] + "\n")
circ_file.close()

FERR.close()


############### Main ###############

def main():
args = docopt(__doc__, version='1.01')
reference = os.path.abspath(str(args['--reference']))
reads = os.path.abspath(str(args['--reads']))
dir_output = get_outdir(args['--dir_output'])
length = int(args['--length'])
fraction = float(args['--fraction'])
threads = str(args['--threads'])
cleanup = args['--cleanup']
data_format = str(args['--data_format'])
mode = str(args['--mode'])
genome_size = int(args['--genome_size'])
genome_size = "genomeSize=" + str(genome_size) + "k"

os.chdir(dir_output)

check_programs("minimap2", "canu", "nucmer", "show-coords", "aragorn")

run_pipeline(data_format, mode, reference, reads, genome_size, length, fraction, threads)

if cleanup:
shutil.rmtree("canu")
shutil.rmtree("map")
shutil.rmtree("nucmer")

print(status_d['6'])
resultats_dir = os.getcwd()
print("Results be in "+resultats_dir+"/Canu_trimmed.fasta")

if __name__ == "__main__":
main()
158 changes: 158 additions & 0 deletions corrector.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#!/usr/bin/env perl

# To do
# Multi-fasta
# Parallel
###

use strict;
use warnings;
use Bio::SeqIO;
use Data::Dumper;
use List::Util 'sum';
use Getopt::Long qw(:config pass_through no_ignore_case no_auto_abbrev);

my ($samfile,$fastafile,$prefix) = ("","","corrected");

GetOptions (
"s|samfile:s" => \$samfile,
"f|fastafile:s" => \$fastafile,
"p|prefix:s" => \$prefix,
);

if (not $fastafile or not $samfile) {
print STDERR "Usage:
-s|samfile <samfile>
-f|fastafile <fastafile>
-p|prefix <string>
";
exit;
}

my @ref_bases;
my $header;
my $seqio = Bio::SeqIO->new(-file => $fastafile, '-format' => 'Fasta');
while(my $seq = $seqio->next_seq) {
my $string = $seq->seq;
$header = $seq->id;
@ref_bases = split("",$string);
}

open IN, "$samfile";
my %ref_positions;
my %ins_positions;
my $i = 0;
my $k = 0;

while (my $line=<IN>) {
next if $line=~/^\@/;
my @p =split(/\t/,$line);
my $read_name = $p[0];
my $sam_flag = $p[1];
my $read_start = $p[3];
my $cigar = $p[5];
my $read = $p[9];
if ($sam_flag != 0 && $sam_flag != 16){
next;
} #avoid multimapping

# Parse CIGAR string
my @type_counts = split(/\D+/,$cigar);
my @types = split(/\d+/,$cigar);
shift @types;
$i = 0;
my @cs;
while ($i<scalar(@types)) {
$k = 0;
while ($k<$type_counts[$i]) {
push @cs, $types[$i];
$k++;
}
$i++;
}

# Create hash with M/D and I
my @rb = split("",$read);
$i = 0;
$k = 0;
while ($i<scalar(@cs)) {
if ($cs[$i] eq "M") {
$ref_positions{$read_start+$i}{$rb[$k]}++;
}
elsif ($cs[$i] eq "D") {
$ref_positions{$read_start+$i}{"-"}++;
$k--;
}
elsif ($cs[$i] eq "I") {
$ins_positions{$read_start+$i}{$read_name}.=$rb[$k];
$read_start--;
}
else {
$read_start--;
}
$i++;
$k++;
}
}

my %insertions;
my %ins_base;
# Count the insertions
foreach my $key (keys %ins_positions) {
foreach my $skey (keys %{$ins_positions{$key}}){
$ins_base{$key}{$ins_positions{$key}{$skey}}++;
}
$insertions{$key} = sum values %{$ins_base{$key}};
}


open CORRFASTA, ">$prefix.fasta";
open CORRLST, ">$prefix.lst";

my $len = scalar(@ref_bases)+1;

my $final_ref = "";
$i = 1;
while ($i<$len) {

my $nocov = 0;
unless (defined($ref_positions{$i})) {$ref_positions{$i}{$ref_bases[$i-1]}=1;print CORRLST "[N] at position $i\n";$nocov=1;} # No LR cov

my @bases = (sort {$ref_positions{$i}{$b} <=> $ref_positions{$i}{$a}} keys %{$ref_positions{$i}})[0..1];
my $base = $bases[0];
my $sbase = $bases[1];
my $total_match = sum values %{$ref_positions{$i}};

unless (defined($insertions{$i})) {$insertions{$i}=0;} # No insertions at the position
my $perc = int(($insertions{$i}/$total_match)*100);
#if ($insertions{$i}>0) {print CORRLST "pos $i $total_match:" . Dumper($ins_base{$i});}
if ($perc > 50) {
my $max_value = (sort {$ins_base{$i}{$b} <=> $ins_base{$i}{$a}} keys %{$ins_base{$i}})[0];
$final_ref .= uc($max_value);
print CORRLST "[I] at position $i: $max_value\n";
}

# Check that deletion is the most probable
if ($base eq "-") {
if (int(($ref_positions{$i}{$base}/$total_match)*100) > 50) {
$base = "";
print CORRLST "[D] at position $i\n";
}
else {
$base = uc($sbase);
}
}

# Write the substitution output
if ($base ne $ref_bases[$i-1] && $base ne "") {
print CORRLST "[S] at position $i: $ref_bases[$i-1] to $base\n";
}
# Lowercase if no coverage
if ($nocov) {$final_ref .= lc($base);}
else {$final_ref .= uc($base);}
$i++;


}

print CORRFASTA ">$header\n$final_ref\n";
Loading

0 comments on commit b0aa3bc

Please sign in to comment.