-
Notifications
You must be signed in to change notification settings - Fork 6
/
evoClust_clustix.py
executable file
·158 lines (125 loc) · 4.85 KB
/
evoClust_clustix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
"""CLUSTIX - CLUSTering of matrIX :-)::
./clustix.py -m <matrix> -c <cuttoff>
Required: numpy
Authors: Marcin Magnus <magnus@genesilico.pl> Irina Tuszynska <irena@genesilico.pl>
"""
from __future__ import print_function
import sys
import shutil
try:
from numpy import array, zeros, loadtxt, nditer
except ImportError:
try:
import numpy
print('Something went wrong. Your numpy is in version: %s . Please upgrade! Clustix has been tested with numpy 1.7.1.!' % numpy.__version__)
except:
print('Something went wrong. Please, install numpy')
sys.exit(1)
import os.path
import argparse
MIN_CLUSTER = 3
try:
import lib.timex.timex as timex
dont_use_timex = False
except ImportError:
dont_use_timex = True
def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument('matrix',
help="A txt file with a similarity matrix with column headers, See test_data/matrix.txt for more")
parser.add_argument('-o',
dest="output",
help="See test_data/output.txt for more, don't type extension of the file")
parser.add_argument('-c', type=float,
dest="cut_off",
default=5.0,
help="Cut_off of RMSD for the formation of a cluster")
parser.add_argument("-v", "--verbose",
action="store_true", default=False, dest="verbose", help="be verbose")
return parser
if __name__ == '__main__':
parser = get_parser()
opts = parser.parse_args()
if not opts.matrix:
parser.print_help()
sys.exit(1)
if not dont_use_timex:
t = timex.Timex()
t.start()
# main program
cf = opts.cut_off
mfn = opts.matrix
verbose = opts.verbose
# get struc_names
struc_names = open(mfn).readline().rstrip().strip('#').split()
# load matrix
m = loadtxt(mfn)
if verbose:
print('> matrix:\n', m)
mshape = m.shape
print('There is ', mshape[0], 'structures in your matrix. 1/6 of this is ', mshape[0] * 1 / 6)
# matrix for clustering, contain 0 for values > cf and 1 - for values from matri that are < cf
mclust = zeros(mshape)
index = nditer(m, flags=['multi_index'])
while not index.finished:
if index[0] < cf:
mclust[index.multi_index[0], index.multi_index[1]] = 1
else:
mclust[index.multi_index[0], index.multi_index[1]] = 0
index.iternext()
if verbose:
print('> matrix of neighbors:\n', mclust)
print('cut_off:', opts.cut_off)
matrixfn = os.path.splitext(os.path.basename(opts.matrix))[
0] # get only fn of matrix, remove extension
if not opts.output:
out_name = matrixfn + "_cf%.2f.out" % (opts.cut_off)
else:
out_name = opts.output + "_cf%.2f.out" % (opts.cut_off)
print(out_name)
output = open(out_name, "w")
output.write("CLUSTER_BAKER_cf%i_%s\n" % (int(cf), matrixfn))
# find the biggest cluster
# ... and the find the new biggest cluster
# ... and the find the
# ... till you reach no of elements in cluster less than MIN_CLUSTER
c = 0
while 1:
no_neighbors_under_cf_of_struc = []
for row in mclust:
no_neighbors_under_cf = row.sum()
no_neighbors_under_cf_of_struc.append(no_neighbors_under_cf)
if verbose:
print('> no_neighbors_under_cf_of_struc', no_neighbors_under_cf_of_struc) # print no of neighbors
curr_biggest_cluster = array(no_neighbors_under_cf_of_struc)
curr_biggest_cluster_no_of_struc = curr_biggest_cluster.max()
print('cluster #' + str(c + 1), " curr the biggest cluster size ", int(curr_biggest_cluster_no_of_struc))
if c == 0:
n1c = str(int(curr_biggest_cluster_no_of_struc))
if curr_biggest_cluster_no_of_struc < MIN_CLUSTER:
break
index = no_neighbors_under_cf_of_struc.index(curr_biggest_cluster_no_of_struc)
curr_biggest_cluster_row = mclust[index, :] # row of curr_biggest_cluster
indexes = curr_biggest_cluster_row.nonzero()
if verbose:
print(" indexes of elements in this cluster", str(indexes[0]))
output.write("%3.1f\n" % len(indexes[0]))
struc_names_of_biggest_cluster = []
for i in indexes[0]:
struc_names_of_biggest_cluster.append(struc_names[i])
nam = struc_names[i]
print(nam, end='') # names of files
output.write("%s\n" % nam)
print()
mclust[:, indexes] = 0
if c > 3:
break
c += 1
output.close()
# hack
nout_name = out_name.replace('_cf', '_n1c' + n1c + '_cf')
shutil.move(out_name, nout_name)
print('>> OK! The output is written to the %s file' % nout_name)
if not dont_use_timex:
print(t.end())