-
Notifications
You must be signed in to change notification settings - Fork 1
/
Align.py
254 lines (221 loc) · 10.8 KB
/
Align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
from Bio.Blast.Applications import NcbitblastnCommandline
from Bio.Blast import NCBIXML
from Bio import SeqIO
import os
import csv
import helper
# Name: buildSpeciesDB
# Summary: This builds a local nucleotide blastdb for the input species.
# Parameters: species_name - describes the name of the current species for analysis
# Returns: db_name - the name of the db that can be queried
def buildSpeciesDB(species_name):
# Species the file location of the fasta file to be made into db
file_location = "Species\\"
fasta_file = species_name + ".fsa"
# Writes over the current blast db each time to save space
db_name = "temp_blastdb"
# Creates the db with the right input file and the temp output db names
blastn_db = "makeblastdb -in " + file_location + fasta_file + " -dbtype nucl -input_type fasta -out " + db_name
os.system(blastn_db)
return db_name
# Name: alignSequence
# Summary: This aligns a specied query protein against our newly made blast db.
# Parameters: filename - describes the name of the current file to query, will be one of the
# multiple pathway proteins.
# db_name - the name of the db to query against
# Returns: out_file - the name of the blast result xml file
def alignSequence(species_name, protein_name, db_name):
# Run the blastn command on the local database
out_file_location = "BlastResults\\"
out_file = out_file_location + species_name + protein_name + "_blastresults.xml"
query_file = "Proteins\\" + protein_name + ".fasta"
tblastn_cmd = NcbitblastnCommandline(query=query_file, db=db_name, outfmt=5, out=out_file)
stdout, stderr = tblastn_cmd()
return out_file
# Name: parseResults
# Summary: Parses XML results of each species - protein aligmnemnt
# to create a csv output of significant hits.
# Parameters: xmlFile - the blast results file to consider
# protein_name - the name of the pathway protein query.
# species_name - the name of the species subject.
# csv_out - the csv file to write results out to.
# Returns: NA
def parseResults(xmlFile, species_name, protein_name, csv_out):
result_handle = open(xmlFile)
blast_records = NCBIXML.parse(result_handle)
sequence = SeqIO.read("Proteins\\" + protein_name + ".fasta", "fasta")
with open("CSVResults\\" + csv_out, 'a', newline='') as out_file:
writer =csv.writer(out_file)
for record in blast_records:
alignment_num = 0
for alignment in record.alignments:
alignment_num += 1
hsp_num = 0
for hsp in alignment.hsps:
hsp_num += 1
if hsp.expect < .01:
list = []
list.append(alignment_num)
list.append(hsp_num)
list.append(species_name)
list.append(protein_name)
list.append(alignment.title)
list.append(str(hsp.identities / len(sequence.seq) * 100))
list.append(str(hsp.align_length / len(sequence.seq) * 100))
list.append(hsp.expect)
list.append(hsp.align_length)
list.append(hsp.score)
list.append(hsp.query_start)
list.append(hsp.query_end)
list.append(hsp.sbjct_start)
list.append(hsp.sbjct_end)
list.append(hsp.query)
list.append(hsp.sbjct)
list.append(hsp.identities)
list.append(hsp.gaps)
list.append(hsp.positives)
list.append(hsp.bits)
list.append(xmlFile)
writer.writerow(list)
out_file.close()
# Name: score
# Summary: Parses CSV results to score each bacteria.
# Parameters: csv_in - the csv file to score species from
# Returns: NA
def score(csv_in, csv_out, threshold):
protein_namesCore = helper.protein_namesCore
protein_namesBCT = helper.protein_namesBCT
protein_namesBK = helper.protein_namesBK
# Create a dict entry for each protein in our list.
temp_dict_prot = {}
for protein in protein_namesBK:
temp_dict_prot[protein] = ""
for protein in protein_namesBCT:
temp_dict_prot[protein] = ""
for protein in protein_namesCore:
temp_dict_prot[protein] = ""
temp_dict = {} # will hold a list of scores for species.
# Read in the csv results file
with open(csv_in, 'r') as in_file:
scorer = csv.reader(in_file, delimiter=',')
row_num = 0
# For each row that is not the header, score results.
for row in scorer:
if row_num > 0:
species_name = row[2]
protein_name = row[3]
protein_identity = float(row[5])
# create an entry for that
if not helper.dict_contains(species_name, temp_dict):
temp_dict[species_name] = []
# Save results in a temporary list and create new entry
temp_list = temp_dict[species_name]
temp_list.append([protein_name, protein_identity])
temp_dict[species_name] = temp_list
row_num += 1
in_file.close()
# After results are saved as a species dictionary list of lists of proteins
# write the results out to a file.
with open(csv_out, 'a', newline='') as outfile:
write_out = csv.writer(outfile)
for key in temp_dict.keys():
# If there is a protein that reaches the threshold
if helper.reaches_threshold(temp_dict[key], threshold):
total_scoreCore = 0
total_scoreBCT = 0
total_scoreBK = 0
protein_dict = {
"thiolase": 0,
"beta hydroxybutyryl-CoA dehydrogenase": 0,
"crotonase": 0,
"butyryl-CoA dehydrogenase": 0,
"electron transfer flavoprotein alpha-subunit": 0,
"electron transfer flavoprotein beta-subunit": 0,
"butyryl-COA-transferase": 0,
"phoasephate butyryltransferase": 0,
"butyrate kinase": 0
}
definite_BCT = "no"
definite_BK = "no"
# Score the available proteins by pathway
for protein in protein_namesCore:
if helper.has_protein(temp_dict[key], protein, threshold):
total_scoreCore += 1
protein_dict[protein] = helper.highest_identity(temp_dict[key], protein)
for protein in protein_namesBCT:
if helper.has_protein(temp_dict[key], protein, threshold):
total_scoreBCT += 1
protein_dict[protein] = helper.highest_identity(temp_dict[key], protein)
for protein in protein_namesBK:
if helper.has_protein(temp_dict[key], protein, threshold):
total_scoreBK += 1
protein_dict[protein] = helper.highest_identity(temp_dict[key], protein)
# Create labels for capability.
if (total_scoreBCT + total_scoreCore == 7):
definite_BCT = "yes"
if (total_scoreBK + total_scoreCore == 8):
definite_BK = "yes"
# Format the row to write to the csv.
row = [key, total_scoreCore, total_scoreBCT, total_scoreBK,
protein_dict["thiolase"],
protein_dict["beta hydroxybutyryl-CoA dehydrogenase"],
protein_dict["crotonase"],
protein_dict["butyryl-CoA dehydrogenase"],
protein_dict["electron transfer flavoprotein alpha-subunit"],
protein_dict["electron transfer flavoprotein beta-subunit"],
protein_dict["butyryl-COA-transferase"],
protein_dict["phoasephate butyryltransferase"],
protein_dict["butyrate kinase"],
definite_BCT, definite_BK]
write_out.writerow(row)
outfile.close()
# Name: run
# Summary: Performs alignments and outputs to "Output.csv"
# Parameters: letter - the species name starting letter for this round of runs.
# Returns: NA
def run(letter):
csv_out = letter + ".csv"
with open("Species\\files_info.csv", 'r') as species_in:
read_species = csv.reader(species_in, delimiter=',')
# Writes the header for the appendable output file.
openOutFile(csv_out)
# For all of the separate file names, create a blast db
for row in read_species:
if row[0][0] == letter:
current_species = row[0]
db = buildSpeciesDB(current_species)
# For all of the protein files, run an alignment against the blastdb
# parse the results and create a csv output file.
for protein in helper.protein_names:
results = alignSequence(current_species, protein, db)
parseResults(results, current_species, protein, csv_out)
# Track progress in the console
print(current_species + ": " + protein)
# Name: openOutFile
# Summary: Creates the output file and writes its header.
# Parameters: csv_out - the name of the output file
# Returns: NA
def openOutFile(csv_out):
with open("CSVResults\\" + csv_out, 'w', newline='') as out_file:
writer =csv.writer(out_file)
header = ["alignment#", "hsp#", "species_name", "query_protein",
"hit_title", "%identity", "%query_cover", "evalue",
"align_length", "score", "query_start", "query_end",
"sbjct_start", "sbjct_end", "query", "sbjct", "identities",
"gaps", "positives", "bits", "out_file"]
writer.writerow(header)
# Name: openOutFileScored
# Summary: Creates the output file and writes its header.
# Parameters: csv_out - the name of the output file
# Returns: NA
def openOutFileScored(csv_out):
with open(csv_out, 'w', newline='') as outfile:
write_out = csv.writer(outfile)
header = ["species", "core", "BCT", "BK", "thiolase",
"beta hydroxybutyryl-CoA dehydrogenase",
"crotonase", "butyryl-CoA dehydrogenase",
"electron transfer flavoprotein alpha-subunit",
"electron transfer flavoprotein beta-subunit",
"butyryl-COA-transferase", "phoasephate butyryltransferase",
"butyrate kinase", "definite_BCT", "definite_BK"]
write_out.writerow(header)