-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1181102378_BLAST_project.py
180 lines (156 loc) · 5.43 KB
/
1181102378_BLAST_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# Name: Tan Zi Ching
# ID: 1181102378
# Date: 25/9/2021
# Project
'''
In this project, you are required to develop a program. The program should:
1) Accept a gene sequence.
2) Perform BLAST to search for protein homologs.
3) Retrieve top 10 homologs.
4) Perform Multiple Sequence Alignment on to the sequences.
5) Extract the positon and amino acid of non-conserved regions.
'''
from Bio.Seq import *
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio.Align.Applications import ClustalOmegaCommandline
import subprocess # to run bash script
# blast function
def blast_function(query):
query_file = open(query)
query_record = SeqIO.read(query_file, format='fasta')
results_handle = NCBIWWW.qblast(
'blastp',
'nr',
query_record.seq)
res = results_handle.read()
save_file = open("output/qblast_blastp.xml", 'w')
save_file.write(res)
save_file.close()
results_handle = open("output/qblast_blastp.xml", 'r')
blast_records = NCBIXML.parse(results_handle)
E_VALUE_THRESH = 0.04
count = 0
hitlist = []
for blast_record in blast_records:
for alignment in blast_record.alignments:
count = count + 1
if count > 10: break
for hsp in alignment.hsps:
hitrecord = SeqRecord(Seq(hsp.sbjct),
id=str(count),
name=alignment.hit_id,
description=alignment.title[:100])
hitlist.append(hitrecord)
SeqIO.write(hitlist, "output/hitlist.fasta", "fasta")
return hitlist
# multiple sequence alignment function
def msa_function(ifile):
in_file = ifile
out_file = "output/aligned.fasta"
# get bash command for running clustelo
clustalomega_cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True, force=True)
#print(clustalomega_cline)
# run bash command from python
aligning = subprocess.Popen(str(clustalomega_cline),
stdout=subprocess.PIPE)
output, error = aligning.communicate()
#print(output)
# find positions of nonconserved regions
def ncr_function(ifile):
protfile = ifile
homologues = []
homolog_file = open(protfile)
homolog_record = SeqIO.parse(homolog_file, 'fasta')
for homolog in homolog_record:
#print(homolog.seq)
homologues.append(homolog)
tofile = []
nonconserved = []
c=0
for hom in homologues:
s2 = str(hom.seq)
if(c==0):
q = hom
print("Query :", s2)
x="Q: "+s2
tofile.append(x)
else:
print("Subject:", s2)
x="S: "+s2
tofile.append(x)
for i in range(len(q)):
if(q[i] != s2[i]):
nonconserved.append(i)
c+=1
nonconserved.sort()
#print(nonconserved)
sym = []
print(" ", end='')
sym.append(" ")
for i in range(len(q)):
if i in nonconserved:
print(",", end='')
sym.append(",")
else:
print("*", end='')
sym.append("*")
sym = ''.join(sym)
tofile.append(sym)
with open("output/nonconserved-regions.txt", 'w') as f:
f.writelines('\n'.join(tofile)+'\n')
# get DNA sequence
# user input
#seqinput = input("Please enter a DNA sequence: ")
# read from FASTA
valid = True
file = input("Please enter the FASTA file name (HBA2.fasta): ")
seqfile = open(file, 'r')
next(seqfile)
lines = []
lines = seqfile.readlines()
seqinput = ''.join(lines)
seqinput = seqinput.replace('\n', '')
seqinput = seqinput.upper()
seqinput = seqinput[0:len(seqinput)-len(seqinput)%3]
for base in seqinput: # check for ambiguous bases
if base not in ['A', 'T', 'G', 'C']:
valid = False
dna = SeqIO.read(file, 'fasta')
inputid = dna.id
inputname = dna.name
inputdes = dna.description
if valid == False:
print("Sequence not recognised!")
else:
seq = Seq(seqinput)
prot_seq = seq.translate()
record = SeqRecord(prot_seq,
id = inputid,
name = inputname,
description = inputdes)
filehandle = open('protquery.fasta', 'w')
SeqIO.write(record, filehandle, 'fasta')
filehandle.close()
print("\nRunning BLAST...\n")
hitlist = blast_function("protquery.fasta")
print("Displaying top 10 homologues\n")
for i in range(len(hitlist)):
print(hitlist[i], end='\n')
print("\n\nHit list generated as 'hitlist.fasta'\n")
# create MSA file with query + hitlist
iifile = []
infilehandle = open('infile.fasta', 'w')
iifile.append(record)
for h in SeqIO.parse('output/hitlist.fasta', 'fasta'):
iifile.append(h)
SeqIO.write(iifile, 'infile.fasta', 'fasta')
print("\nRunning Multiple Sequence Alignment...\n")
msa_function("infile.fasta")
print("\nMSA alignment generated as 'aligned.fasta'\n")
print("\nDisplaying nonconserved regions\n")
ncr_function("output/aligned.fasta")
print("\n\nList of nonconserved regions generated as 'nonconserved-regions.fasta'\n")
print("\n\n\n")