-
Notifications
You must be signed in to change notification settings - Fork 0
/
pepnucfunction.py
107 lines (89 loc) · 3.34 KB
/
pepnucfunction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import sys
import progressbar as pb
import os
"""
Developed by Davi J. Marcon Github: Mxrcon
E-mail:davijosuemarcon@gmail.com
Requirements: Biopython and progressbar packages
Python script for generate .pep, .nuc and .function files for pgap input
usage: script.py [1] [2]
where: [1] = input folder containing .gbk files and [2] = output folder where resulting files will be created
"""
class progress_timer:
def __init__(self, n_iter, description=""):
self.n_iter = n_iter
self.iter = 0
self.description = description+":"
self.timer = None
self.initialize()
def initialize(self):
widgets= [self.description,pb.Percentage(),"",pb.Bar(marker=pb.RotatingMarker()),"",pb.ETA()]
self.timer= pb.ProgressBar(widgets=widgets,maxval=self.n_iter).start()
def update(self,q=1):
self.timer.update(self.iter)
self.iter += q
def finish(self):
self.timer.finish()
class sequence:
def __init__(self,seq_id,seq,seq_product,seq_translation,cog):
self.id = seq_id
self.seq = seq
self.product =seq_product
self.translation = seq_translation
self.cog = cog
def pep(sequence_list,output_folder,name):
records = [SeqRecord(Seq(seq.translation,),id=seq.id,description="",) for seq in sequence_list]
SeqIO.write(records,output_folder+"/"+name+".pep","fasta")
def nuc(sequence_list,output_folder,name):
records = [SeqRecord(seq.seq,id=seq.id,description="",) for seq in sequence_list]
SeqIO.write(records,output_folder+"/"+name+".nuc","fasta")
def func(sequence_list,output_folder,name):
function = [seq.id+"\t"+seq.cog+"\t"+seq.product+"\n" for seq in sequence_list]
function = "".join(function)
with open(output_folder+"/"+name+".function",'w') as output:
output.write(function)
output.close()
def open_gbk(seq_file):
sequence_list,id_list = [],[]
for seq_record in SeqIO.parse(seq_file , 'genbank'):
genome = seq_record.seq
for feature in seq_record.features:
if feature.type == "CDS":
gene = feature
feature_id = feature.qualifiers['locus_tag'][0]
feature_product = feature.qualifiers['product'][0]
feature_translation = feature.qualifiers['translation'][0]
if feature.location.strand == 1:
feature_sequence = genome[feature.location.start:feature.location.end]
if feature.location.strand == -1:
feature_sequence = genome[feature.location.start:feature.location.end].reverse_complement()
try:
if feature.qualifiers['db_xref'][0].split(":")[0] == "COG":
feature_cog = feature.qualifiers['db_xref'][0].split(":")[1]
except:
feature_cog = "-"
sequence_list.append(sequence(feature_id,feature_sequence,feature_product,feature_translation,feature_cog))
return sequence_list
def file_handler(input_file,output_folder):
name = input_file.split(".gb")[0].split("/")[1]
sequence_list = open_gbk(input_file)
pep(sequence_list,output_folder,name)
nuc(sequence_list,output_folder,name)
func(sequence_list,output_folder,name)
#Inputs and outputs
input_folder = sys.argv[1]
output_folder= sys.argv[2]
#Output check
if not os.path.exists(output_folder):
os.makedirs(output_folder)
#Running code
pt= progress_timer(description="Running",n_iter=len(os.listdir(input_folder)))
for i in os.listdir(input_folder):
pt.update()
open(input_folder+"/"+i,"r")
file_handler(input_folder+"/"+i, output_folder)
pt.finish()
print("Done!")