-
Notifications
You must be signed in to change notification settings - Fork 0
/
protein_diff.py
156 lines (139 loc) · 6.92 KB
/
protein_diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# %% Import libraries
from numpy import fill_diagonal
from pandas import DataFrame
from itertools import combinations
'''
This is an automatic demonstration of how to construct a distance matrix.
All sequences were retrieved using an NCBI protein search for
'sonic hedgehog'.
NOTE: This relies on dicts preserving insertion order, which is an
implementation detail of CPython 3.6 and a language guarantee from
Python 3.7 onward.
'''
def init_pairwise_dist(seq_1, seq_2):
    '''
    Return the absolute difference in length between two sequences.

    If one sequence is longer or shorter than the other, the surplus
    positions have nothing to be compared against, so their count is used
    as the starting value of the pairwise distance. Returns 0 when both
    sequences have the same length.
    '''
    return abs(len(seq_1) - len(seq_2))
def count_diff(seq_1, seq_2):
    '''
    Count the differences between two amino acid sequences.

    The tally starts at the length difference reported by
    init_pairwise_dist and gains one for every position, paired by zip
    (so only up to the shorter length), at which the two residues differ.
    '''
    length_gap = init_pairwise_dist(seq_1, seq_2)
    mismatches = sum(1 for res_1, res_2 in zip(seq_1, seq_2) if res_1 != res_2)
    return length_gap + mismatches
def gen_dist_matrix(dist_dict):
    '''
    Build an empty square distance matrix for the organisms in dist_dict.

    The keys of dist_dict, in iteration order, are used as both the index
    and the column labels. Every cell starts out missing (NaN) except the
    main diagonal, which is filled with zeros.

    Returns the new DataFrame.
    '''
    labels = list(dist_dict)
    dist_matrix = DataFrame(index=labels, columns=labels)
    # Write the diagonal through the DataFrame itself instead of through
    # dist_matrix.values: that array is not guaranteed to be a writable
    # view of the frame (it is read-only under pandas Copy-on-Write).
    for label in labels:
        dist_matrix.at[label, label] = 0
    return dist_matrix
def update_dist_matrix(dist_matrix, pairwise_dict, start, counter, end):
    '''
    Populate the distance matrix in place from the pairwise differences.

    Think of this as assigning lists, for each organism at right angles,
    where the vertex of the right angle is the zero on the main diagonal:
    the part of row i to the right of the diagonal and the part of column i
    below it receive the same slice of values, keeping the matrix symmetric.

    dist_matrix   -- square DataFrame to fill; it is modified and returned.
    pairwise_dict -- dict whose values are consumed in iteration order; the
                     slicing expects them in the order produced by
                     itertools.combinations over the matrix labels.
    start, end    -- bounds of the first slice of values; for n organisms
                     the first row takes n - 1 values, i.e. 0 and n - 1.
    counter       -- length of the second slice (n - 2); every later slice
                     is one element shorter than the one before it.

    Returns the same dist_matrix object that was passed in.
    '''
    # Materialize the values once instead of rebuilding the list per row.
    pairwise_values = list(pairwise_dict.values())
    n_organisms = dist_matrix.shape[0]
    for i in range(n_organisms - 1):
        j = i + 1
        row_values = pairwise_values[start:end]
        dist_matrix.iloc[i, j:] = row_values
        dist_matrix.iloc[j:, i] = row_values
        # The next slice starts where this one ended and is `counter` long.
        start = end
        end += counter
        counter -= 1
    return dist_matrix
# %% Dictionary to hold all organism amino acid sequences
# Maps an organism's common name to its protein sequence, written as a string
# of one-letter residue codes. Each sequence is split over several adjacent
# string literals, which Python joins into a single string.
# The insertion order of the keys matters: it determines the row/column order
# of the distance matrix and the order of the pairwise comparisons below.
Shh = {
'Red Junglefowl':
'MVEMLLLTRILLVGFICALLVSSGLTCGPGRGIGKRRHPKKLTPLAYKQFIPNVAEKTLGASGRYEGKIT'
'RNSERFKELTPNYNPDIIFKDEENTGADRLMTQRCKDKLNALAISVMNQWPGVKLRVTEGWDEDGHHSEE'
'SLHYEGRAVDITTSDRDRSKYGMLARLAVEAGFDWVYYESKAHIHCSVKAENSVAAKSGGCFPGSATVHL'
'EHGGTKLVKDLSPGDRVLAADADGRLLYSDFLTFLDRMDSSRKLFYVIETRQPRARLLLTAAHLLFVAPQ'
'HNQSEATGSTSGQALFASNVKPGQRVYVLGEGGQQLLPASVHSVSLREEASGAYAPLTAQGTILINRVLA'
'SCYAVIEEHSWAHWAFAPFRLAQGLLAALCPDGAIPTAATTTTGIHWYSRLLYRIGSWVLDGDALHPLGM'
'VAPAS',
'Zebrafish':
'MRLLTRVLLVSLLTLSLVVSGLACGPGRGYGRRRHPKKLTPLAYKQFIPNVAEKTLGASGRYEGKITRNS'
'ERFKELTPNYNPDIIFKDEENTGADRLMTQRCKDKLNSLAISVMNHWPGVKLRVTEGWDEDGHHFEESLH'
'YEGRAVDITTSDRDKSKYGTLSRLAVEAGFDWVYYESKAHIHCSVKAENSVAAKSGGCFPGSALVSLQDG'
'GQKAVKDLNPGDKVLAADSAGNLVFSDFIMFTDRDSTTRRVFYVIETQEPVEKITLTAAHLLFVLDNSTE'
'DLHTMTAAYASSVRAGQKVMVVDDSGQLKSVIVQRIYTEEQRGSFAPVTAHGTIVVDRILASCYAVIEDQ'
'GLAHLAFAPARLYYYVSSFLFPQNSSSRSNATLQQEGVHWYSRLLYQMGTWLLDSNMLHPLGMSVNSS',
'Indonesian Coelacanth':
'MDEMLLLTRIVLVGLICSSLVSSGLTCGPGRGYGRRKYPKKLTPLAYKQFIPNVAEKTLGASGRYEGKIT'
'RNSERFKELTPNYNPDIIFKDEENTGADRLMTQRCKDKLNSLAISVMNQWPGVKLRVTEGWDEDGHHSEE'
'SLHYEGRAVDITTSDRDRSKYGMLARLAVEAGFDWVYYESKAHIHCSVKAENSVAAKSGGCFPGLATVTL'
'EDGGTKFVKDLSPGDRVLAADDQGKLVYSDFLMFLDKEEESQKVFYVIETKEPLKRITLTAAHLLFVAQN'
'SSDNLSPFKATFASEIKPGQIIFVAHGDDTHLMAATVERVVLEEDTGAYAPLTNQGTILINRVWASCYAV'
'IEQHKWAHWAFAPVRMGYVISSLFFPKDILKYNGTFQENGVHWYSKTLYQIGTWVLDNDYIHPLGMPERS'
'S',
'Olive Flounder':
'MLLWTRIVLAGVICLSLVSSGMGCGPGRGYGRRRHPKKLTPLAYKQFIPNVAEKTLGASGRYEGKITRNS'
'ERFKELTPNYNTDIIFKDEENTGADRLMTQRCKDKLNSLAISVMNQWPGVKLRVTEGWDEDGHHFEESLH'
'YEGRAVDITTSDRDKSKYGTLSRLAVEAGFDWVYYESKAHIHCSVKAENSVAAKSGGCFPGSSTVTLQDG'
'TKKPVKALQTGDRVLAADAHGQPVYTDFIMFIDQDSTTRRLFYVIETDSGQKITLTAAHLLFVGHSNSTE'
'RAHRGMSAVFASQVRPGQTVFVLDAERLQPVTVKRIYTQEHEGSFAPVTAQGTVVVDQVLASCYAVIQDH'
'ELAHWALAPVRLAHWVSSLLFSSQPQASAQKDGVHWYSKILYQLGTWLLDSHSIHPLGMSVYPS',
'Carp':
'MRLMTRVLLVSLLSLSLVVSGLACGPGRGYGRRKHPKKLTPLAYKQFIPNVAEKTLGASGRYEGKITRNS'
'ERFKELTPNYNPDIIFKDEENTGADRLMTQRCKDRLNSLAISVMNQWPGVKLRVTEGWDEDGHHFEESLH'
'YEGRAVDITTSDRDKSKYGTLSRLAVEAGFDWVYYGSKAHIHCSVKAENSVAAKSGGCFPGSALVALKDG'
'RQKAVKDLNPGDKVLAADGNGKLVYSDFIMFTDRDSATRRVFYVIETKEPVEKITLTAAHLLFVLDNSTD'
'DLHSMTAAFASSVRAGQKVMVVDDSGPLKSVIVERIYTEEHQGSFAPVTAHGTIVVDRILASCYAVIEDH'
'SLAHLAFAPVRLYYDVSSVLFPKNFISQSNATLQQEGVHWYSKLLFQIGAWLLDSRMLHPLGMSVNSS',
'Little Skate':
'MMLTRIVLVGLVCCSLFSSARACGPGRGYGRRKHPRKLTPLAYKQFIPNVAEKTLGASGRYEGKITRNSE'
'RFKELTPNYNPDIIFKDEENTGADRLMTQRCKDKLNSLAISVMNQWPGVKLRVTEGWDEDGHHSEESLHY'
'EGRAVDITTSDRDRSKYGMLARLAVEAGFDWVNYESKAHIHCSVKAENSVAAKSGGCFPASARVSLENGD'
'TKQVKDLTPGDRVLAADERGNLLYSDFVMFLDRAEEVEKVFYVVETREPRRKLALTAAHLLFVGHATNDG'
'QLGLKATFASKVRSGQLVYITDGDSHRLRPARVDKVYLEEMIGAYAPLTIQGTVVIDQVLTSCYAVIEEH'
'SLAHWAFAPVRMRYTARSLLLPSDPPAVNCTVQAGGVHWYSSALYQIGRWVLNGASIHPLGMALDSS',
'Mouse':
'MLLLLARCFLVILASSLLVCPGLACGPGRGFGKRRHPKKLTPLAYKQFIPNVAEKTLGASGRYEGKITRN'
'SERFKELTPNYNPDIIFKDEENTGADRLMTQRCKDKLNALAISVMNQWPGVKLRVTEGWDEDGHHSEESL'
'HYEGRAVDITTSDRDRSKYGMLARLAVEAGFDWVYYESKAHIHCSVKAENSVAAKSGGCFPGSATVHLEQ'
'GGTKLVKDLRPGDRVLAADDQGRLLYSDFLTFLDRDEGAKKVFYVIETLEPRERLLLTAAHLLFVAPHND'
'SGPTPGPSALFASRVRPGQRVYVVAERGGDRRLLPAAVHSVTLREEEAGAYAPLTAHGTILINRVLASCY'
'AVIEEHSWAHRAFAPFRLAHALLAALAPARTDGGGGGSIPAAQSATEARGAEPTAGIHWYSQLLYHIGTW'
'LLDSETMHPLGMAVKSS',
'Chimpanzee':
'MGEMLLLARCLLLVLVSSLLVCSGLACGPGRGFGKRRHPKKLTPLAYKQFIPNVAEKTLGASGRYEGKIS'
'RNSERFKELTPNYNPDIIFKDEENTGADRLMTQRCKDKLNALAISVMNQWPGVKLRVTEGWDEDGHHSEE'
'SLHYEGRAVDITTSDRDRSKYGMLARLAVEAGFDWVYYESKAHIHCSVKAENSVAAKSGGCFPGSATVHL'
'EQGGTKLVKDLSPGDRVLAADDQGRLLYSDFLTFLDRDDGAKKVFYVIETREPRERLLLTAAHLLFVAPH'
'NDSATGGPEASSGSGPPSGGALGPRALFASRVRPGQRVYVVAERDGDRRLLPAAVHSVTLSEEAAGAYAP'
'LTAQGTILINRVLASCYAVIEEHSWAHRAFAPFRLAHALLAALAPARTDRGGDSGGGDRGGGGGRVALPA'
'PGAADAPGAGATAGIHWYSQLLYQIGTWLLDSEALHPLGMAVKSS',
'Human':
'MLLLARCLLLVLVSSLLVCSGLACGPGRGFGKRRHPKKLTPLAYKQFIPNVAEKTLGASGRYEGKISRNS'
'ERFKELTPNYNPDIIFKDEENTGADRLMTQRCKDKLNALAISVMNQWPGVKLRVTEGWDEDGHHSEESLH'
'YEGRAVDITTSDRDRSKYGMLARLAVEAGFDWVYYESKAHIHCSVKAENSVAAKSGGCFPGSATVHLEQG'
'GTKLVKDLSPGDRVLAADDQGRLLYSDFLTFLDRDDGAKKVFYVIETREPRERLLLTAAHLLFVAPHNDS'
'ATGEPEASSGSGPPSGGALGPRALFASRVRPGQRVYVVAERDGDRRLLPAAVHSVTLSEEAAGAYAPLTA'
'QGTILINRVLASCYAVIEEHSWAHRAFAPFRLAHALLAALAPARTDRGGDSGGGDRGGGGGRVALTAPGA'
'ADAPGAGATAGIHWYSQLLYQIGTWLLDSEALHPLGMAVKSS'
}
# %% Dict comprehension to get all possible pairings and sequence differences
# Keys are (organism_a, organism_b) tuples in combinations() order; each value
# is the count_diff result for that pair of sequences.
Shh_pairwise_diff = {
    (organism_a, organism_b): count_diff(Shh[organism_a], Shh[organism_b])
    for organism_a, organism_b in combinations(Shh, 2)
}
# %% Create the distance matrix
Shh_df = gen_dist_matrix(Shh)
# %% Finally, populate the distance matrix
# For n organisms the first row takes n - 1 values (start=0, end=n - 1) and
# the second row takes n - 2 (counter).
Shh_df = update_dist_matrix(
    Shh_df,
    Shh_pairwise_diff,
    start=0,
    counter=len(Shh) - 2,
    end=len(Shh) - 1,
)
Shh_df