-
Notifications
You must be signed in to change notification settings - Fork 0
/
helper.py
220 lines (198 loc) · 8.93 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# Author: Yi Yao Tan
##Functions borrowed from my summer internship:
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list, to_tree, centroid, cut_tree
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA,KernelPCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import Model
from sympy.utilities.iterables import multiset_permutations
from sklearn.metrics import classification_report
def statistics(true_labels, knn_labels, no_clusters = 3):
    """Score cluster assignments against ground-truth labels.

    Cluster ids are arbitrary, so every permutation of ``range(no_clusters)``
    is tried as a "cluster id -> class id" mapping. The mapping that matches
    ``true_labels`` on the most samples is kept (the first one wins on ties)
    and a scikit-learn classification report is built from it.

    :param true_labels: ground-truth class id of each sample
    :param knn_labels: cluster id assigned to each sample, same length as
        ``true_labels``; ids outside ``range(no_clusters)`` are mapped to 0
    :param int no_clusters: number of cluster ids to permute
    :return: ``(df, acc)`` where ``df`` is the classification report as a
        DataFrame without the ``accuracy`` column and ``acc`` is the global
        accuracy taken from the report
    :raises ValueError: if the two label sequences differ in length
    """
    true_labels = np.array(true_labels)
    knn_labels = np.asarray(knn_labels)
    if len(knn_labels) != len(true_labels):
        raise ValueError(
            "true_labels and knn_labels must have the same length, got "
            f"{len(true_labels)} and {len(knn_labels)}"
        )

    # Start below any reachable score so that a mapping is always selected,
    # even when no permutation gets a single sample right.
    final_guess = np.zeros(len(true_labels))
    max_corr = -1
    for permutation in multiset_permutations(range(no_clusters)):
        guess = np.zeros(len(true_labels))
        for cluster_id, class_id in enumerate(permutation):
            guess[knn_labels == cluster_id] = class_id
        num_correct = int(np.count_nonzero(guess == true_labels))
        if num_correct > max_corr:
            final_guess = guess
            max_corr = num_correct

    report = classification_report(true_labels, final_guess, output_dict = True)

    # Give the numeric classes readable names. Only keys that are actually
    # present are renamed, so fewer/more than three classes no longer raises.
    readable_names = {
        '0': 'history_titles',
        '1': 'math_titles',
        '2': 'philosophy_titles',
    }
    for class_key, class_name in readable_names.items():
        if class_key in report:
            report[class_name] = report.pop(class_key)

    acc = report["accuracy"]
    df = pd.DataFrame(report)
    df = df.drop(['accuracy'], axis = 1)
    print(f"global accuracy: {acc}")
    return df, acc
def nonlinear_autoencoder_complex(input_size, code_size: int,loss = "mse"):
    """
    Instantiate and compile an autoencoder; return it together with its encoder.

    Layers: input -> Dense(code_size, relu) -> Dense(input_size // 2, sigmoid)
    -> Dense(input_size, linear). The encoder shares the input and the first
    Dense layer with the autoencoder.

    :param int input_size: number of features of one input sample
    :param int code_size: dimension on which to project the original data
    :param loss: loss passed to ``compile`` (default ``"mse"``)
    :return: autoencoder, encoder
    """
    inputs = Input(shape=(input_size,))
    hidden_layer = Dense(code_size, activation="relu")(inputs)
    # The unit count must be an integer: ``input_size / 2`` is a float, which
    # is not accepted by every Keras version.
    half_width = int(input_size // 2)
    expanded = Dense(half_width, activation="sigmoid")(hidden_layer)
    outputs = Dense(input_size)(expanded)

    autoencoder = Model(inputs=inputs, outputs=outputs)
    autoencoder.summary()
    autoencoder.compile(optimizer="Adam", loss=loss)

    encoder = Model(inputs=inputs, outputs=hidden_layer)
    return autoencoder, encoder
def nonlinear_autoencoder(input_size, code_size: int, loss = "mse"):
    """
    Instantiate and compile an autoencoder; return it together with its encoder.

    Layers: input -> Dense(code_size, relu) -> Dense(input_size // 2, relu)
    -> Dense(input_size, linear). The encoder shares the input and the first
    Dense layer with the autoencoder.

    :param int input_size: number of features of one input sample
    :param int code_size: dimension on which to project the original data
    :param loss: loss passed to ``compile`` (default ``"mse"``)
    :return: autoencoder, encoder
    """
    inputs = Input(shape=(input_size,))
    hidden_layer = Dense(code_size, activation="relu")(inputs)
    # The unit count must be an integer: ``input_size / 2`` is a float, which
    # is not accepted by every Keras version.
    half_width = int(input_size // 2)
    expanded = Dense(half_width, activation="relu")(hidden_layer)
    outputs = Dense(input_size)(expanded)

    autoencoder = Model(inputs=inputs, outputs=outputs)
    autoencoder.summary()
    # Honour the caller's ``loss`` instead of always compiling with "mse".
    autoencoder.compile(optimizer="Adam", loss=loss)

    encoder = Model(inputs=inputs, outputs=hidden_layer)
    return autoencoder, encoder
def random_knn(X, labels, num, knn, metric = "cosine"):
    """Print the nearest neighbours of a few randomly drawn samples.

    A ``NearestNeighbors`` index is fitted on ``X``; ``num`` distinct sample
    positions are drawn without replacement and, for each one, the labels of
    its ``knn`` nearest rows are printed.

    :param X: sample matrix, indexable by row position
    :param labels: label of each row; must accept an index array
        (presumably a NumPy array - verify against callers)
    :param int num: number of samples to draw
    :param int knn: number of neighbours to look up per sample
    :param metric: distance metric handed to ``NearestNeighbors``
    """
    index = NearestNeighbors(n_neighbors=knn, metric=metric)
    index.fit(X)

    sampled_positions = np.random.choice(
        range(len(labels)), size=num, replace=False
    )
    for position in sampled_positions:
        print(f"Sample: {labels[position]}")
        # kneighbors expects a 2-D batch, hence the single-row wrapper.
        query = np.array([X[position]])
        neighbour_rows = index.kneighbors(query, return_distance=False)
        closest = neighbour_rows[0]
        print(f"{knn} nearest are: {labels[closest]}")
def compute_kmeans(X, titles_list, title = "",no_clusters=3):
    """Cluster ``X`` with K-means and plot its first two columns.

    Points are coloured by cluster; every tenth entry of ``titles_list`` is
    annotated on the plot. The figure is written to ``Diagrams/{title}.png``
    and then shown.

    :param X: 2-D array of samples; columns 0 and 1 are used as plot axes
    :param titles_list: annotation text for each sample, in row order
    :param str title: plot title, also used as the output file name
    :param int no_clusters: number of K-means clusters (3 by default)
    :return: cluster id predicted for each row of ``X``
    """
    kmeans = KMeans(n_clusters=no_clusters)
    kmeans.fit(X)
    classes = kmeans.predict(X)

    xs, ys = X[:, 0], X[:, 1]
    plt.figure(figsize=(40, 40), dpi=600, facecolor='white')
    plt.scatter(xs, ys, c=classes)
    plt.title(title)
    for i, sample in enumerate(titles_list):
        if i % 10 == 0:
            plt.annotate(sample, (xs[i], ys[i]))

    # Save before showing: once show() returns the figure may already be
    # released, and savefig() would then write an empty canvas.
    plt.savefig(f'Diagrams/{title}.png')
    plt.show()
    return classes
def plot_tsne(matrix, titles_list,label, title="", metric = "cosine", perplexity = 60):
    """Reduce ``matrix`` to 2 dimensions with t-SNE and plot the result.

    One scatter series is drawn per distinct value in ``label``; every second
    entry of ``titles_list`` is annotated. The figure is saved to
    ``Diagrams/{title}.png``.

    :param matrix: samples to embed, one row per sample
    :param titles_list: annotation text for each sample, in row order
    :param label: class of each sample (any array-like, same length as rows)
    :param str title: plot title, also used as the output file name
    :param metric: distance metric passed to ``TSNE``
    :param perplexity: perplexity passed to ``TSNE``
    :return: the embedded samples as returned by ``TSNE.fit_transform``
    """
    embedder = TSNE(
        n_components=2,
        init='pca',
        method='exact',
        perplexity=perplexity,
        metric=metric,
    )
    reduced_matrix = embedder.fit_transform(matrix)
    xs, ys = reduced_matrix[:, 0], reduced_matrix[:, 1]

    # A plain list compared with ``==`` yields a single bool, not a mask, so
    # convert once to get element-wise comparison for any array-like input.
    label = np.asarray(label)

    plt.figure(figsize=(20, 20), dpi=600, facecolor='white')
    for class_id in np.unique(label):
        mask = label == class_id
        plt.scatter(xs[mask], ys[mask], label=str(class_id))
    for i, text in enumerate(titles_list):
        if i % 2 == 0:
            plt.annotate(text, (xs[i], ys[i]))
    # The scatter series carry labels; draw the legend so they are visible.
    plt.legend()
    plt.title(title)
    plt.savefig(f'Diagrams/{title}.png')
    return reduced_matrix
def plot_kpca(matrix,titles_list, label, kernel="linear",title=""):
    """Reduce ``matrix`` to 2 dimensions with kernel PCA and plot the result.

    The top panel is a scatter plot with one series per distinct value in
    ``label`` and every tenth entry of ``titles_list`` annotated. The bottom
    panel shows the eigenvalues of the fitted components. The figure is saved
    to ``Diagrams/{title}.png``.

    kernel choices: {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'}

    :param matrix: samples to project, one row per sample
    :param titles_list: annotation text for each sample, in row order
    :param label: class of each sample (any array-like, same length as rows)
    :param str kernel: kernel passed to ``KernelPCA``
    :param str title: plot title, also used as the output file name
    :return: the projected samples as returned by ``KernelPCA.fit_transform``
    """
    pca = KernelPCA(n_components=2, kernel=kernel)
    reduced_matrix = pca.fit_transform(matrix)
    print(f"shape reduced matrix: {np.shape(reduced_matrix)}")
    xs, ys = reduced_matrix[:, 0], reduced_matrix[:, 1]

    # A plain list compared with ``==`` yields a single bool, not a mask, so
    # convert once to get element-wise comparison for any array-like input.
    label = np.asarray(label)

    fig, (scatter_ax, eigen_ax) = plt.subplots(
        2, 1, figsize=(10, 10), dpi=1200, facecolor='white'
    )
    for class_id in np.unique(label):
        mask = label == class_id
        scatter_ax.scatter(xs[mask], ys[mask], label=str(class_id))
    for i, text in enumerate(titles_list):
        if i % 10 == 0:
            scatter_ax.annotate(text, (xs[i], ys[i]))
    # The scatter series carry labels; draw the legend so they are visible.
    scatter_ax.legend()
    scatter_ax.set_title(title)

    # The attribute holding the eigenvalues was renamed across scikit-learn
    # releases; try the current name first and skip the plot if neither exists.
    eigenvalues = getattr(pca, "eigenvalues_", None)
    if eigenvalues is None:
        eigenvalues = getattr(pca, "lambdas_", None)
    if eigenvalues is not None:
        eigen_ax.plot(np.arange(1, len(eigenvalues) + 1), eigenvalues, 'o-')
    eigen_ax.set_title(f"Eigenvalues of {title}")

    fig.savefig(f'Diagrams/{title}.png')
    return reduced_matrix
def plot_dendrogram(matrix,titles_list = None, hierarchy_method = "complete",dist_metric = "cos", title= ""):
    """Plot the hierarchical-clustering dendrogram of ``matrix``.

    The linkage is computed with SciPy, drawn with ``distance_sort`` enabled,
    saved to ``Diagrams/{title}.png`` and then shown.

    :param matrix: observations handed to ``scipy.cluster.hierarchy.linkage``
    :param titles_list: leaf labels on the dendrogram, or ``None`` to let
        SciPy use its default labels
    :param str hierarchy_method: linkage method (``method`` of ``linkage``)
    :param str dist_metric: distance metric (``metric`` of ``linkage``)
    :param str title: plot title, also used as the output file name
    """
    merges = linkage(matrix, method=hierarchy_method, metric=dist_metric)
    plt.figure(figsize=(96, 36), dpi=400, facecolor='white')
    plt.title(title)
    dendrogram(merges, labels=titles_list, distance_sort=True)
    # Save before showing: once show() returns the figure may already be
    # released, and savefig() would then write an empty canvas.
    plt.savefig(f'Diagrams/{title}.png')
    plt.show()
def plot_pca(matrix, label, titles_list = None,title=""):
    """Reduce ``matrix`` to 2 dimensions with PCA and plot it with a scree plot.

    The top panel is a scatter plot with one series per distinct value in
    ``label``; when ``titles_list`` is given, every tenth entry (positions
    6, 16, 26, ...) is annotated. The bottom panel shows the explained
    variance ratio of the two components. The figure is saved to
    ``Diagrams/{title}.png``.

    :param matrix: samples to project, one row per sample
    :param label: class of each sample (any array-like, same length as rows)
    :param titles_list: annotation text for each sample, or ``None`` for no
        annotations
    :param str title: plot title, also used as the output file name
    :return: the projected samples, one row per sample and two columns
    """
    # Use a plain ndarray: ``np.matrix`` is deprecated and recent
    # scikit-learn releases refuse it as estimator input.
    matrix = np.asarray(matrix)
    pca = PCA(n_components=2)
    reduced_matrix = pca.fit_transform(matrix)
    print(f"shape reduced matrix: {np.shape(reduced_matrix)}")
    print(f"pca.explained_variance_ratio_: {pca.explained_variance_ratio_}")
    xs, ys = reduced_matrix[:, 0], reduced_matrix[:, 1]

    # A plain list compared with ``==`` yields a single bool, not a mask, so
    # convert once to get element-wise comparison for any array-like input.
    label = np.asarray(label)

    fig, (scatter_ax, scree_ax) = plt.subplots(
        2, 1, figsize=(10, 10), dpi=1200, facecolor='white'
    )
    for class_id in np.unique(label):
        mask = label == class_id
        scatter_ax.scatter(xs[mask], ys[mask], label=str(class_id))
    # ``titles_list`` defaults to None, so annotations are optional.
    if titles_list is not None:
        for i, text in enumerate(titles_list):
            if (i + 4) % 10 == 0:
                scatter_ax.annotate(text, (xs[i], ys[i]))
    # The scatter series carry labels; draw the legend so they are visible.
    scatter_ax.legend()
    scatter_ax.set_title(title)

    components = np.arange(pca.n_components_) + 1
    scree_ax.plot(components, pca.explained_variance_ratio_, 'o-')
    scree_ax.set_title(f"Scree plot of {title}")

    fig.savefig(f'Diagrams/{title}.png')
    return reduced_matrix
def heat_map(leaves_list,titles_list,matrix):
    """Draw a heat map of ``matrix`` with rows reordered by ``leaves_list``.

    Rows follow the order given by ``leaves_list``; columns stay in the
    original ``titles_list`` order. The image is drawn on a new pyplot figure
    and is neither shown nor saved here.

    :param leaves_list: row positions in the order they should appear
    :param titles_list: name of each row/column in original order
    :param matrix: values to display, indexable by row position
    """
    row_names = []
    for position in leaves_list:
        row_names.append(titles_list[position])

    row_values = []
    for position in leaves_list:
        row_values.append(matrix[position])

    heat_frame = pd.DataFrame(data=row_values, index=row_names, columns=titles_list)
    print(f"starting heat function 2")

    # NOTE(review): 1000 x 1000 inches is an extremely large canvas and may
    # be more than a backend can render - confirm this size is intended.
    plt.figure(figsize=(1000, 1000))
    column_positions = range(len(titles_list))
    row_positions = range(len(row_names))
    plt.xticks(column_positions, titles_list, rotation=90)
    plt.yticks(row_positions, row_names)
    plt.imshow(heat_frame, cmap='hot', interpolation="nearest")