-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpearson_and_spearman_clustering.py
126 lines (102 loc) · 5.12 KB
/
pearson_and_spearman_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import numpy as np
from scipy.stats import stats
def get_pearson_matrix(hm):
"""
Get the Pearson Correlation Coefficient matrix of the drug profiles in the given heatmap.
:param hm: Heatmap (DataFrame) of drugs, to find the name of the drugs.
:return pearson_matrix: Pearson Correlation Coefficient matrix of drugs.
"""
pearson_matrix = np.corrcoef(hm)
return pearson_matrix
def get_spearman_matrix(hm):
"""
Get the Spearman's Rank Correlation Coefficient matrix of the drug profiles in the given heatmap.
:param hm: Heatmap (DataFrame) of drugs, to find the name of the drugs
:return spearman_matrix: Spearman's Rank Correlation Coefficient matrix of drugs
"""
spearman_matrix, _ = stats.spearmanr(hm, axis=1)
return spearman_matrix
def compare_COMPARE_Analysis(pearson_matrix, drug_list, r_threshold_list):
"""
Run Pearson Algorithm for a range of r_cutoff values.
:param pearson_matrix: Pearson Correlation Coefficient Matrix.
:param drug_list: List of drugs in the data.
:param r_threshold_list: List of r_cutoff values wanted.
:return clustered_drugs_list: List of clusters of drugs computed from each r_cutoff value.
"""
clustered_drugs_list = []
for r_threshold in r_threshold_list:
initial_cluster, _ = get_similarity_labeling_matrix(pearson_matrix, drug_list, r_threshold)
clustered_drugs = group_similar_drugs(initial_cluster, minimum_cluster_length=2, has_outcast_cluster=False)
clustered_drugs_list.append(clustered_drugs)
return clustered_drugs_list
def get_similarity_labeling_matrix(corr_m, drug_list, r_cutoff=0.6):
"""
Put two drugs in the same list if they have a similarity above the threshold provided.
:param corr_m: Correlation Matrix.
:param drug_list: List of drugs.
:param r_cutoff: The threshold to consider when concluding whether two drugs are similar enough.
:return: List of initial clusters and Correlation Matrix with values less than r_cutoff rounded
to zero.
"""
initial_cluster = []
modified_similarity_matrix = []
for row_i in corr_m:
row_i_similardrugs_set = []
row_i_similarscore_set = []
for drug_j_index in range(0, len(row_i)):
drug_j_similarity_with_drug_i = row_i[drug_j_index]
if drug_j_similarity_with_drug_i >= r_cutoff:
row_i_similardrugs_set.append(drug_list[drug_j_index])
row_i_similarscore_set.append(drug_j_similarity_with_drug_i)
else:
row_i_similarscore_set.append(0)
initial_cluster.append(row_i_similardrugs_set)
modified_similarity_matrix.append(row_i_similarscore_set)
return (initial_cluster, np.array(modified_similarity_matrix))
def group_similar_drugs(initial_cluster, minimum_cluster_length=2, has_outcast_cluster=False):
"""
Remove same-but-different-ordered clusters and also clusters with length less than the minimum
required. If outcast argument is True, then instead of removing the individual clusters, merge
them into one big outcast cluster. Hence, outcast cluster consists of drugs that are not similar
enough to any other drug in the data. Return the list of clusters.
:param initial_cluster: Rough clusters computed from get_similarity_labeling_matrix.
:param minimum_cluster_length: Minimum number of drugs to have in a cluster.
:param has_outcast_cluster: If true, merge clusters that do not meet the minimum_cluster_length
criteria, useful when computing NMI.
:return: List of actual clusters.
"""
clustered_drugs = []
outcast_cluster = []
for label in initial_cluster:
sorted_label = sorted(label) # to check uniqueness in the list
if len(label) >= minimum_cluster_length and sorted_label not in clustered_drugs:
clustered_drugs.append(sorted_label)
elif 0 < len(label) < minimum_cluster_length and has_outcast_cluster:
outcast_cluster.extend(label)
if has_outcast_cluster and len(outcast_cluster) > 0:
clustered_drugs.append(outcast_cluster)
return clustered_drugs
def get_super_clustered_drugs(clustered_drugs):
"""
Compute and return superclusters. Supercluster are the merge of clusters who share at least
one drug in common
:param clustered_drugs: The cluster of drugs that are already computed.
:return: Superclusters of drugs.
"""
super_clustered_drugs = []
for cluster1 in clustered_drugs:
super_cluster = cluster1
i = 0
while i < len(clustered_drugs):
cluster2 = clustered_drugs[i]
i += 1
union_set = set(super_cluster + cluster2)
union_list = list(union_set)
# if they have shared elements but one set is not the other set's subset
if len(union_list) < len(super_cluster) + len(cluster2) and \
not (set(cluster2).issubset(set(super_cluster))):
super_cluster = union_list
i = 0
super_clustered_drugs.append(super_cluster)
return group_similar_drugs(super_clustered_drugs, minimum_cluster_length=3, has_outcast_cluster=False)