From b0aa3bc343c85369889a52797595db366b06f190 Mon Sep 17 00:00:00 2001
From: Georgios <gdkoutsovoulos@gmail.com>
Date: Wed, 16 Oct 2019 09:30:49 +0200
Subject: [PATCH] initial files

---
 aladin.py          | 228 +++++++++++++++++++++++++++++++++++++++++++++
 corrector.pl       | 158 +++++++++++++++++++++++++++++++
 depot/AladinFun.py | 110 ++++++++++++++++++++++
 detection.py       | 100 ++++++++++++++++++++
 extraction.py      |  51 ++++++++++
 fragmenter.py      |  56 +++++++++++
 6 files changed, 703 insertions(+)
 create mode 100755 aladin.py
 create mode 100755 corrector.pl
 create mode 100644 depot/AladinFun.py
 create mode 100755 detection.py
 create mode 100755 extraction.py
 create mode 100755 fragmenter.py
diff --git a/aladin.py b/aladin.py
new file mode 100755
index 0000000..89cf520
--- /dev/null
+++ b/aladin.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+
+"""
+aladin.py
+    Usage:
+    ./aladin.py -r <FILE> -i <FILE> [-o <DIR>] [-d <STR>] [-m <STR>] [-l <INT>] [-f <FLOAT>] [-s <STR>] [-t <INT>] [--cleanup]
+
+    Options:
+        -h, --help                     show options
+        -r, --reference <FILE>         reference input
+        -i, --reads <FILE>             Reads input
+        -o, --dir_output <DIR>         creates a directory for all output files [default: results]
+        -d, --data_format <STR>        (N) Nanopore or (P) Pacbio [default: N]
+        -m, --mode <STR>               (M) Mitochondion or (C) Chloroplast [default: M]
+        -l, --length <INT>             break reads into chunks of this length [default: 4000]
+        -f, --fraction <FLOAT>         fraction of length at which the end of the sequence gets split into a new sequence [default: 0.1]
+        -s, --genome_size <INT>        set expected size in kb [default: 20]
+        -t, --threads <INT>            minimap2 threads [default: 3]
+        --cleanup                      remove intermediate files
+        --version                      print version
+"""
+
+import os
+import re
+import shutil
+import subprocess
+from Bio import SeqIO
+from docopt import docopt
+from depot.AladinFun import *
+from detection import return_pos
+from extraction import extract_reads
+from fragmenter import fragment_fasta
+
+def run_nucmer(assembly_file, contig):
+    FERR = open("ALADIN.err", 'a')
+    coords_file = "nucmer/nucmer.coords"
+    delta_file = "nucmer/nucmer.delta"
+    coords_out = open(coords_file, 'w')
+    subprocess.call(["nucmer", "--maxmatch", "-c", "100", "--delta",
+                     delta_file, assembly_file, assembly_file],
+                     stdout=FERR, stderr=FERR)
+    subprocess.call(["show-coords", "-B", "-r",
+                     delta_file],
+                     stdout=coords_out, stderr=FERR)
+    coords_file = open_files(coords_file)
+    length_coords = []
+    contigs = []
+    trim_list = []
+    for line in coords_file:
+        elems = line.split("\t")
+        q_contig_name = elems[0]
+        contig_length = int(elems[2])
+        s_contig_name = elems[5]
+        q_start = int(elems[6])
+        q_end = int(elems[7])
+        s_start = int(elems[8])
+        s_end = int(elems[9])
+        strand = elems[17]
+        #check that the other hit is close to the end
+        if q_contig_name  == s_contig_name and strand == "Plus" and q_contig_name == contig:
+            if q_start < 5 and s_start > 1000 and (s_end + 100) > contig_length:
+                if q_end not in trim_list:
+                    length_coords.append(q_end)
+                    contigs.append(q_contig_name)
+                    trim_list.append(q_end)
+            elif s_start < 5 and q_start > 1000 and (q_end + 100) > contig_length:
+                if s_end not in trim_list:
+                    length_coords.append(s_end)
+                    contigs.append(q_contig_name)
+                    trim_list.append(s_end)
+    return (contigs, length_coords, len(length_coords))
+
+def run_pipeline(data_format, mode, reference, reads, genome_size, length, fraction, threads):
+    # Directory structure
+    get_outdir("map")
+    get_outdir("nucmer")
+
+    # File naming
+    FERR = open("ALADIN.err", 'w')
+    sam_file = "map/aln.sam"
+    extract_reads_file = "map/read_ids.txt.fasta"
+    extract_reads_frag_file = "map/read_ids.txt.fasta.frag.fasta"
+    assembly_file = "canu/extract.contigs.fasta"
+    aragorn_file = "canu/aragorn.txt"
+    gfa_file = "canu/extract.contigs.gfa"
+
+    #Set data format
+    if data_format == "N":
+        format_mini = "map-ont"
+        format_canu = "-nanopore-raw"
+    elif data_format == "P":
+        format_mini = "map-pb"
+        format_canu = "-pacbio-raw"
+
+    # Map reads
+    print(status_d['0'])
+    sam_out = open(sam_file, 'w')
+    subprocess.call(["minimap2", "--sam-hit-only", "-t", threads, "--secondary=no", "-ax",
+                     format_mini, reference, reads],
+                     stdout=sam_out, stderr=FERR)
+    fichier_vide(sam_file)
+
+    # Extract and fragment reads
+    print(status_d['1'])
+    read_ids = []
+    sam_input = open_files(sam_file)
+    for line in sam_input:
+        elems = line.split("\t")
+        if not line.startswith("@") and str(elems[4]) == "60":
+            read_ids.append(elems[0])
+
+    read_ids = list(set(read_ids))
+    extract_reads(reads, read_ids, extract_reads_file)
+    fragment_fasta(extract_reads_file, length, fraction)
+
+    # Run canu
+    print(status_d['2'])
+    subprocess.call(["canu", "-d", "canu", "-p", "extract", genome_size, "corOutCoverage=200", "correctedErrorRate=0.15",
+                     format_canu, extract_reads_frag_file],
+                     stdout=FERR, stderr=FERR)
+    fichier_vide(assembly_file)
+
+    print(status_d['3'])
+
+    # Run aragorn to select contig
+    aragorn_out = open(aragorn_file, 'w')
+    if mode == "M":
+        subprocess.call(["aragorn", "-l", "-gc5", "-mt", "-w", assembly_file],
+                        stdout=aragorn_out, stderr=FERR)
+    else:
+        subprocess.call(["aragorn", "-l", "-gc1", "-m", "-t", "-w", assembly_file],
+                        stdout=aragorn_out, stderr=FERR)
+    aragorn_input = open_files(aragorn_file)
+    trna_contigs = []
+    trna_genes = []
+    for line in aragorn_input:
+        if line.startswith(">") and not line.startswith(">end"):
+            elem = line.split(" ")
+            header = elem[0].replace(">","")
+            trna_contigs.append(header)
+        elif "genes found" in line:
+            elem = line.split(" ")
+            num_trnas = int(elem[0])
+            trna_genes.append(num_trnas)
+    aragorn_contig = trna_contigs[trna_genes.index(max(trna_genes))]
+
+    print("[!] Found contig " + str(aragorn_contig) + " with " +  str(max(trna_genes)) + " mtRNAs")
+
+    num_circ = 0
+    length_canu = 0
+    trim_len = 0
+    assembly = {}
+
+    # Check canu output
+    for record in SeqIO.parse(assembly_file, "fasta"):
+        elem = str(record.id).split(" ")
+        if re.search(aragorn_contig,str(record.id)):
+            if re.search("suggestCircular=yes", str(record.description)):
+                assembly[elem[0]] = str(record.seq)
+                num_circ = 1
+
+    contigs, length_coords, nucmer_cases = run_nucmer(assembly_file, aragorn_contig)
+    mito_contig = aragorn_contig
+
+    if num_circ == 1:
+        length_canu = return_pos(gfa_file, aragorn_contig)
+        trim_len = length_canu
+        if str(nucmer_cases) == "0":
+            print(warn_d['2'] % trim_len)
+        elif str(nucmer_cases) == "1":
+            abs_trim = abs(int(length_canu) - int(length_coords[0]))
+            if abs_trim < 11:
+                print(status_d['4'] % (trim_len, length_coords[0]))
+            else:
+                print(warn_d['1'] % abs_trim)
+        else:
+            print(warn_d['3'])
+    else:
+        print(warn_d['0'])
+        if str(nucmer_cases) == "0":
+            warn_d('4')
+        elif str(nucmer_cases) == "1":
+            trim_len = length_coords[0]
+            print(status_d['5'] % trim_len)
+        else:
+            warn_d('5')
+
+    circ_file = open("Canu_trimmed.fasta", 'w')
+    circ_file.write(">" + mito_contig + "\n")
+    circ_file.write(assembly[mito_contig][int(trim_len):] + "\n")
+    circ_file.close()
+
+    FERR.close()
+
+
+############### Main ###############
+
+def main():
+    args = docopt(__doc__, version='1.01')
+    reference = os.path.abspath(str(args['--reference']))
+    reads = os.path.abspath(str(args['--reads']))
+    dir_output = get_outdir(args['--dir_output'])
+    length = int(args['--length'])
+    fraction = float(args['--fraction'])
+    threads = str(args['--threads'])
+    cleanup = args['--cleanup']
+    data_format = str(args['--data_format'])
+    mode = str(args['--mode'])
+    genome_size = int(args['--genome_size'])
+    genome_size = "genomeSize=" + str(genome_size) + "k"
+
+    os.chdir(dir_output)
+
+    check_programs("minimap2", "canu", "nucmer", "show-coords", "aragorn")
+
+    run_pipeline(data_format, mode, reference, reads, genome_size, length, fraction, threads)
+
+    if cleanup:
+        shutil.rmtree("canu")
+        shutil.rmtree("map")
+        shutil.rmtree("nucmer")
+
+    print(status_d['6'])
+    resultats_dir = os.getcwd()
+    print("Results be in "+resultats_dir+"/Canu_trimmed.fasta")
+
+if __name__ == "__main__":
+    main()
diff --git a/corrector.pl b/corrector.pl
new file mode 100755
index 0000000..a2aecf8
--- /dev/null
+++ b/corrector.pl
@@ -0,0 +1,158 @@
+#!/usr/bin/env perl
+
+# To do
+# Multi-fasta
+# Parallel
+###
+
+use strict;
+use warnings;
+use Bio::SeqIO;
+use Data::Dumper;
+use List::Util 'sum';
+use Getopt::Long qw(:config pass_through no_ignore_case no_auto_abbrev);
+
+my ($samfile,$fastafile,$prefix) = ("","","corrected");
+
+GetOptions (
+ "s|samfile:s" => \$samfile,
+ "f|fastafile:s" => \$fastafile,
+ "p|prefix:s" => \$prefix,
+);
+
+if (not $fastafile or not $samfile) {
+  print STDERR "Usage:
+  -s|samfile <samfile>
+  -f|fastafile <fastafile>
+  -p|prefix <string>
+";
+exit;
+}
+
+my @ref_bases;
+my $header;
+my $seqio = Bio::SeqIO->new(-file => $fastafile, '-format' => 'Fasta');
+while(my $seq = $seqio->next_seq) {
+    my $string = $seq->seq;
+    $header = $seq->id;
+    @ref_bases = split("",$string);
+}
+
+open IN, "$samfile";
+my %ref_positions;
+my %ins_positions;
+my $i = 0;
+my $k = 0;
+
+while (my $line=<IN>) {
+  next if $line=~/^\@/;
+  my @p =split(/\t/,$line);
+  my $read_name = $p[0];
+  my $sam_flag = $p[1];
+  my $read_start = $p[3];
+  my $cigar = $p[5];
+  my $read = $p[9];
+  if ($sam_flag != 0 && $sam_flag != 16){
+    next;
+  } #avoid multimapping
+
+  # Parse CIGAR string
+  my @type_counts = split(/\D+/,$cigar);
+  my @types = split(/\d+/,$cigar);
+  shift @types;
+  $i = 0;
+  my @cs;
+  while ($i<scalar(@types)) {
+    $k = 0;
+    while ($k<$type_counts[$i]) {
+      push @cs, $types[$i];
+      $k++;
+    }
+    $i++;
+  }
+
+  # Create hash with M/D and I
+  my @rb = split("",$read);
+  $i = 0;
+  $k = 0;
+  while ($i<scalar(@cs)) {
+    if ($cs[$i] eq "M") {
+      $ref_positions{$read_start+$i}{$rb[$k]}++;
+    }
+    elsif ($cs[$i] eq "D") {
+      $ref_positions{$read_start+$i}{"-"}++;
+      $k--;
+    }
+    elsif ($cs[$i] eq "I") {
+      $ins_positions{$read_start+$i}{$read_name}.=$rb[$k];
+      $read_start--;
+    }
+    else {
+      $read_start--;
+    }
+    $i++;
+    $k++;
+  }
+}
+
+my %insertions;
+my %ins_base;
+# Count the insertions
+foreach my $key (keys %ins_positions) {
+  foreach my $skey (keys %{$ins_positions{$key}}){
+    $ins_base{$key}{$ins_positions{$key}{$skey}}++;
+  }
+  $insertions{$key} = sum values %{$ins_base{$key}};
+}
+
+
+open CORRFASTA, ">$prefix.fasta";
+open CORRLST, ">$prefix.lst";
+
+my $len = scalar(@ref_bases)+1;
+
+my $final_ref = "";
+$i = 1;
+while ($i<$len) {
+
+  my $nocov = 0;
+  unless (defined($ref_positions{$i})) {$ref_positions{$i}{$ref_bases[$i-1]}=1;print CORRLST "[N] at position $i\n";$nocov=1;} # No LR cov
+
+  my @bases = (sort {$ref_positions{$i}{$b} <=> $ref_positions{$i}{$a}} keys %{$ref_positions{$i}})[0..1];
+  my $base = $bases[0];
+  my $sbase = $bases[1];
+  my $total_match = sum values %{$ref_positions{$i}};
+
+  unless (defined($insertions{$i})) {$insertions{$i}=0;} # No insertions at the position
+  my $perc = int(($insertions{$i}/$total_match)*100);
+  #if ($insertions{$i}>0) {print CORRLST "pos $i $total_match:" . Dumper($ins_base{$i});}
+  if ($perc > 50) {
+    my $max_value = (sort {$ins_base{$i}{$b} <=> $ins_base{$i}{$a}} keys %{$ins_base{$i}})[0];
+    $final_ref .= uc($max_value);
+    print CORRLST "[I] at position $i: $max_value\n";
+  }
+
+  # Check that deletion is the most probable
+  if ($base eq "-") {
+    if (int(($ref_positions{$i}{$base}/$total_match)*100) > 50) {
+      $base = "";
+      print CORRLST "[D] at position $i\n";
+    }
+    else {
+      $base = uc($sbase);
+    }
+  }
+
+  # Write the substitution output
+  if ($base ne $ref_bases[$i-1] && $base ne "") {
+    print CORRLST "[S] at position $i: $ref_bases[$i-1] to $base\n";
+  }
+  # Lowercase if no coverage
+  if ($nocov) {$final_ref .= lc($base);}
+  else {$final_ref .= uc($base);}
+  $i++;
+
+
+}
+
+print CORRFASTA ">$header\n$final_ref\n";
diff --git a/depot/AladinFun.py b/depot/AladinFun.py
new file mode 100644
index 0000000..5cd53a6
--- /dev/null
+++ b/depot/AladinFun.py
@@ -0,0 +1,110 @@
+import sys
+import os
+import shutil
+import gzip
+from Bio import SeqIO
+
+# Reporting
+def error(message, *argv):
+    if argv is None:
+        sys.exit(error_d[message])
+    else:
+        sys.exit(error_d[message] % (argv))
+
+error_d = {
+    '0': '[ERROR:0]\t: File %s does not exist.',
+    '1': '[ERROR:1]\t: File %s is empty.',
+    '2': '[ERROR:2]\t: %s not found! Please install and add to the PATH variable.'
+}
+
+status_d = {
+    '0': '[+] Mapping reads with minimap2.',
+    '1': '[+] Extracting reads.',
+    '2': '[+] Running Canu.',
+    '3': '[+] Finding the mitochondrion.',
+    '4': '[+] Canu and nucmer agree (%s, %s) . Using canu gfa to trim.',
+    '5': '[+] Using nucmer coordinates to trim (%s bases).',
+    '6': '[+] Pipeline finished.'
+}
+
+warn_d = {
+    '0': '[-] No circular tag present, or non-informative gfa in Canu assembly. Will attempt to find the circular contig with nucmer.',
+    '1': '[-] Using canu gfa to trim despite %s bases difference between canu and nucmer. Validate your results',
+    '2': '[-] Using canu gfa to trim (%s bases) despite nucmer not finding any case of circularisation. Validate your results',
+    '3': '[-] Using canu gfa to trim despite nucmer finding multiple cases of circularisation. Validate your results',
+    '4': '[-] Nor Canu nor nucmer found any circular contig.',
+    '5': '[-] No circular tag in canu while nucmer found multiple cases of circularisation.'
+}
+
+
+# Check if empty or does not exist
+def fichier_vide(fichier):
+    if os.path.isfile(fichier) is True:
+        if os.path.getsize(fichier) <= 200:
+            error('1', fichier)
+    else:
+        error('0', fichier)
+
+# Check if program is in path
+
+def check_programs(*arg):
+    error_list = []
+    for program in arg:
+        if which(program) is False:
+            error_list.append(program)
+    if error_list:
+        programs = ', '.join(error_list)
+        error('2', programs)
+
+def which(program):
+    if shutil.which(program):
+        return program
+    else:
+        return False
+
+# Directory checking
+
+def get_outdir(out_directory, add_dir=""):
+    """generates output directory in case it does not exist."""
+    if type(out_directory) != str:
+        print("\t[!] {} is NOT a directory! Please specify an output directory".format(out_directory))
+        sys.exit()
+    elif os.path.isfile(out_directory):
+        print("\t[!] {} is a File! Please specify an output directory".format(out_directory))
+        sys.exit()
+    elif not os.path.exists(os.path.join(out_directory, add_dir)):
+        os.mkdir(os.path.join(out_directory, add_dir))
+        return os.path.abspath(os.path.join(out_directory, add_dir))
+    else:
+        return os.path.abspath(os.path.join(out_directory, add_dir))
+
+def check_indir(input_dir):
+    if not os.path.exists(input_dir):
+        print("\t[!] FATAL ERROR: '{}' directory not found".format(input_dir))
+        sys.exit()
+    elif not os.path.isdir(input_dir):
+        print("\t[!] FATAL ERROR: '{}' is not a directory".format(input_dir))
+        sys.exit()
+    else:
+        return input_dir
+
+# Can open gzip files
+
+def open_files(fname):
+    if fname.endswith('.gz'):
+        return gzip.open(fname, 'rt', encoding="latin-1")
+    else:
+        return open(fname, 'rt', encoding="latin-1")
+
+# Check if fasta
+
+def is_fasta(filename):
+    with open_files(filename) as handle:
+        if handle.readline().startswith(">"):
+            return True
+
+def return_format(filename):
+    if is_fasta(filename):
+        return "fasta"
+    else:
+        return "fastq"
diff --git a/detection.py b/detection.py
new file mode 100755
index 0000000..f28b9f8
--- /dev/null
+++ b/detection.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+"""
+detection.py
+    Usage:
+    ./detection.py -g <FILE> -f <FILE> -o <FILE> -i <STR>
+
+    Options:
+        -h, --help                     show options
+        -g, --gfa_file <FILE>          gfa file
+        -f, --fasta_file <FILE>        fasta file
+        -o, --output_file <FILE>       output file
+        -i, --contig_id <STR>          contig id
+        --version                      print version
+"""
+
+import re
+from docopt import docopt
+
+
+def forma_cigar(cigar):
+    cigar = cigar.rstrip()
+    type_counts = re.split("\D+", cigar)
+    types = re.split("\d+", cigar)
+    types.pop(0)
+    length_cigar = 0
+    for c, t in zip(type_counts, types):
+        if t != "I":
+            length_cigar += int(c)
+    return length_cigar
+
+def forma_cigar_minus(cigar):
+    cigar = cigar.rstrip()
+    type_counts = re.split("\D+", cigar)
+    types = re.split("\d+", cigar)
+    types.pop(0)
+    length_cigar = 0
+    for c, t in zip(type_counts, types):
+        if t != "D":
+            length_cigar += int(c)
+    return length_cigar
+
+def return_pos(file_gfa, contig):
+
+    file_opened = open(file_gfa, "r")
+    cigar_plus = ""
+    length_cigar = 0
+
+    for line in file_opened:
+        if line.startswith("L"):
+            elem = line.split("\t")
+            if elem[1] == contig and elem[1] == elem[3] and elem[2] == "+" and elem[4] == "+":
+                cigar_plus = elem[5]
+                length_cigar = forma_cigar(cigar_plus)
+    file_opened.close()
+
+    file_opened = open(file_gfa, "r")
+    if length_cigar == 0:
+        for line in file_opened:
+            if line.startswith("L"):
+                elem = line.split("\t")
+                if elem[1] == contig and elem[1] == elem[3] and elem[2] == "-" and elem[4] == "-":
+                    cigar_minus = elem[5]
+                    length_cigar = forma_cigar_minus(cigar_minus)
+
+    file_opened.close()
+    length_plus = length_cigar
+    return length_plus
+
+############### Main ###############
+
+def main():
+    args = docopt(__doc__, version='1.00')
+    file_gfa = str(args['--gfa_file'])
+    file_fasta = str(args['--fasta_file'])
+    file_output = str(args['--output_file'])
+    contig = str(args['--contig_id'])
+
+    length_plus = return_pos(file_gfa, contig)
+    sequence = ""
+
+    file_seq = open(file_fasta, "r")
+
+    for line in file_seq:
+        line = line.rstrip("\n")
+        if line.startswith(">"):
+            header = line
+        else:
+            sequence += line
+
+    file_seq.close()
+
+    file_new_seq = open(file_output, 'w')
+    file_new_seq.write(header + "\n")
+    file_new_seq.write(sequence[length_plus:] + "\n")
+    file_new_seq.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extraction.py b/extraction.py
new file mode 100755
index 0000000..7830afd
--- /dev/null
+++ b/extraction.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+"""
+extraction.py
+    Usage:
+    ./extraction.py -r <FILE> -i <FILE>
+
+    Options:
+        -h, --help                     show options
+        -r, --reads <FILE>             reads file
+        -i, --ids <FILE>               read ids
+        --version                      print version
+"""
+
+from Bio import SeqIO
+from docopt import docopt
+from depot.AladinFun import open_files, return_format
+
+
+def extract_reads(reads, read_ids, outfile):
+
+    file_in = open_files(reads)
+    file_out = open(outfile, 'w')
+
+    for record in SeqIO.parse(file_in, return_format(reads)):
+        if record.id in read_ids:
+            file_out.write(">" + str(record.id) + "\n")
+            file_out.write(str(record.seq) + "\n")
+
+    file_in.close()
+    file_out.close()
+
+############### Main ###############
+def main():
+    args = docopt(__doc__, version='1.00')
+    reads = str(args['--reads'])
+    ids = str(args['--ids'])
+
+    file_list = open_files(ids)
+
+    read_ids = []
+    for line in file_list:
+        line = line.rstrip("\n")
+        read_ids.append(line)
+
+    file_list.close()
+    outfile = "read_ids.txt.fasta"
+    extract_reads(reads, read_ids, outfile)
+
+if __name__ == "__main__":
+    main()
diff --git a/fragmenter.py b/fragmenter.py
new file mode 100755
index 0000000..22da529
--- /dev/null
+++ b/fragmenter.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+
+"""
+fragmenter.py
+    Usage:
+    ./fragmenter.py -i <FILE> [-l <INT>] [-f <FLOAT>]
+
+    Options:
+        -h, --help                     show options
+        -i, --input_file <FILE>        file to fragment
+        -l, --length <INT>             split reads into chunks of this length [default: 4000]
+        -f, --fraction <FLOAT>         fraction of length at which the end of the sequence gets split into a new sequence [default: 0.1]
+        --version                      print version
+"""
+
+from Bio import SeqIO
+from docopt import docopt
+from depot.AladinFun import open_files, return_format
+
+def fragment_fasta(input_file, length_frag, fraction_select):
+    file_in = open_files(input_file)
+
+    outfile = input_file+".frag.fasta"
+    file_out = open(outfile, 'w')
+
+    for record in SeqIO.parse(file_in, return_format(input_file)):
+        name = record.id
+        seq = str(record.seq)
+        if length_frag < 1:
+            file_out.write(">" + name + "\n")
+            file_out.write(seq + "\n")
+        else:
+            subseqs = [seq[i:i+length_frag] for i in range(0, len(seq), length_frag)]
+            if len(subseqs) > 1 and len(subseqs[-1]) < (fraction_select*length_frag):
+                toadd = subseqs.pop()
+                subseqs[-1] += toadd
+            i = 0
+            for subseq in subseqs:
+                file_out.write(">" + name + "_" + str(i) + "\n")
+                file_out.write(subseq + "\n")
+                i += 1
+
+    file_in.close()
+    file_out.close()
+
+############### Main ###############
+def main():
+    args = docopt(__doc__, version='1.00')
+    input_file = str(args['--input_file'])
+    length_frag = int(args['--length'])
+    fraction_select = float(args['--fraction'])
+
+    fragment_fasta(input_file, length_frag, fraction_select)
+
+if __name__ == "__main__":
+    main()