This repository has been archived by the owner on Jul 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Contig_summary.py
executable file
·124 lines (88 loc) · 3.51 KB
/
Contig_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
import os, sys, string, numpy, scipy.stats
from Bio.Seq import Seq
from optparse import OptionParser
from modules.Si_SeqIO import *
##########################################
# Function to Get command line arguments #
##########################################
def main():
usage = "usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-m", "--min_contig_len", action="store", dest="minlength", help="Minimum contig length to include in stats", default=0, type="int")
parser.add_option("-s", "--split_contigs", action="store_true", dest="split", help="Print stats for each contig as well as each file", default=False)
return parser.parse_args()
def check_input_options(options, args):
if options.minlength<0:
print "Minimum length of contigs to include must be >0"
(options, args) = main()
check_input_options(options, args)
print '\t'.join(["File", "Total length", "No. contigs", "Mean length", "Stdev lengths", "Median length", "Max length", "Min length", "Skewness", "Kurtosis", "N50", "N50n", "Mean GC%", "Stdev GC%", "Median GC%", "Max GC%", "MinGC%", "N count", "gap count", "Length without Ns and gaps"])
for filename in args:
try:
contigs=SeqIO.parse(open(filename), "fasta")
except StandardError:
print "Could not open file", filename
continue
# print filename
lengths=[]
GCs=[]
test=[]
gaps=[]
Ns=[]
lenacgt=[]
for contig in contigs:
seq=str(contig.seq)
length=len(seq)
if length<options.minlength:
continue
lengths.append(length)
lengthnons=len(seq.upper().replace("N",""))
lengthnogaps=len(seq.upper().replace("-",""))
lengthnogapnons=len(seq.upper().replace("-","").replace("N",""))
if lengthnons==0:
GC=0
else:
GC=(float(len(seq.upper().replace("N","").replace("A","").replace("T","")))/lengthnogapnons)*100
Ns.append(length-lengthnons)
gaps.append(length-lengthnogaps)
lenacgt.append(lengthnogapnons)
GCs.append(GC)
if options.split:
try:
print '\t'.join(map(str,[contig.name, length, "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", GC, "-", "-", "-", "-", Ns[-1], gaps[-1], lenacgt[-1]]))
except StandardError:
print filename+"\tfailed"
#test.sort()
#test.reverse()
#print test[0]
if len(lengths)==0:
print '\t'.join([filename, '0', '0', '0', '0', '0', '0', '0', '0', '0', '-', '-', "-", "-", "-", "-", "-", "-", "-", "-"])
continue
# print "Total length =", numpy.sum(lengths)
# print "Number of contigs =", len(lengths)
# print "Mean length =", numpy.mean(lengths)
# print "Standard deviation of lengths =", numpy.std(lengths)
# print "Maximum length =", numpy.max(lengths)
# print "Minimum length =", numpy.min(lengths)
lengths.sort()
lengths.reverse()
fifty=float(numpy.sum(lengths))/2
count=0
suml=0
#print lengths[0]
while suml<fifty:
N50=lengths[count]
suml+=lengths[count]
count+=1
try:
print '\t'.join(map(str,[filename, numpy.sum(lengths), len(lengths), numpy.mean(lengths), numpy.std(lengths), numpy.median(lengths), numpy.max(lengths), numpy.min(lengths), scipy.stats.skew(lengths), scipy.stats.kurtosis(lengths), N50, count, numpy.mean(GCs), numpy.std(GCs), numpy.median(GCs), numpy.max(GCs), numpy.min(GCs), sum(Ns), sum(gaps), sum(lenacgt)]))
except StandardError:
print filename+"\tfailed"
# print "N50 =", N50
# print "N50n =", count
# print "Mean GC% =", numpy.mean(GCs)
# print "GC% standard deviation =", numpy.mean(GCs)
# print "Median GC% =", numpy.mean(GCs)
# print "Maximum GC% =", numpy.max(GCs)
# print "Minimum GC% =", numpy.min(GCs)