-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathanalysis.py
182 lines (147 loc) · 6.54 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from umap import UMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Load csv data assumed to be in ./csv
from inspect import currentframe, getframeinfo
from pathlib import Path

# Resolve this script's own directory so the csv folder is found regardless
# of the process's current working directory.
filename = getframeinfo(currentframe()).filename
parent = Path(filename).resolve().parent
csv_dir = str(parent.joinpath('csv').absolute())

# Number of GRN metric files expected on disk, named grn_<i>.csv.
# NOTE(review): assumed from the naming pattern below -- confirm against ./csv.
num_grns = 5

# Read each metric csv file into a dictionary, keyed by integer GRN id.
# (The original keyed by file path, contradicting its own comment; only
# data.values() is consumed downstream, so keying by id is safe.)
data = {}
for grn in range(num_grns):
    csv_path = Path(csv_dir) / f'grn_{grn}.csv'
    data[grn] = pd.read_csv(csv_path, sep=',')

# Concatenate the per-GRN frames into a single dataframe for analysis.
pdata = pd.concat(data.values())
# Extract and prepare morphometry data for analysis.
# Every column except the GRN filename and the free-text description is an
# SNT morphometric measurement.
snt_keys = [k for k in pdata.keys() if k not in ('filenameGRN', 'Description')]
# Sorted list of the distinct GRN type names present in the data.
grns = sorted(set(pdata['filenameGRN']))
num_grns = len(grns)
# Map each GRN filename to a compact integer uid.
grn_dict = {name: uid for uid, name in enumerate(grns)}
# Matrix of SNT measurements, one row per observed cell.
snt_mat = pdata[snt_keys].to_numpy()
# Parallel vector of integer GRN ids, one entry per row of snt_mat.
grn_mat = np.array([grn_dict[g] for g in pdata['filenameGRN']])
# Keep only observations (rows) with no NaN measurement.
keep = ~np.isnan(snt_mat).any(axis=1)
grn_mat = grn_mat[keep]
snt_mat = snt_mat[keep]
# Create the histplot
def plot_histplot(pdata):
    """Plot one histogram facet per metric, colored by GRN type.

    Saves the figure next to this script as histplot.png.
    """
    # Drop the free-text description; filenameGRN stays as the id variable.
    metrics_df = pdata.drop(columns=['Description'])
    # Unpivot to long format: one row per (GRN type, metric, value) triple.
    long_df = pd.melt(metrics_df, "filenameGRN", var_name="var")
    grid = sns.FacetGrid(long_df, hue="filenameGRN", col="var", col_wrap=5,
                         sharex=False, legend_out=True)
    grid.map(plt.hist, "value", alpha=.4)
    grid.add_legend()
    plt.savefig(str(parent.absolute()) + '/histplot.png')

plot_histplot(pdata)
# Create the heatmap
def plot_heatmap(pdata):
    """Plot a heatmap of the mean min-max-scaled metrics per GRN type.

    Saves the figure next to this script as heatmap.png.
    """
    # Drop columns containing NaNs so the scaler only sees complete metrics.
    snt_df = pdata.dropna(axis="columns")
    filename_grn = snt_df["filenameGRN"]
    # BUG FIX: the original dropped columns from `pdata`, silently discarding
    # the dropna() result computed above; drop from the NaN-free frame.
    # errors="ignore" in case dropna already removed one of these columns.
    snt_df = snt_df.drop(columns=["Description", "filenameGRN"], errors="ignore")
    # Scale each metric to [0, 1] across all observations so metrics with
    # different units share one color scale.
    snt_df = pd.DataFrame(MinMaxScaler().fit_transform(snt_df),
                          index=snt_df.index, columns=snt_df.columns)
    snt_df["filenameGRN"] = filename_grn
    # Aggregate each GRN type using the mean of each scaled metric.
    snt_df = snt_df.groupby(["filenameGRN"]).mean()
    plt.figure(figsize=(8, 15))
    sns.heatmap(snt_df.T)
    plt.savefig(str(parent.absolute()) + '/heatmap.png', bbox_inches='tight')

plot_heatmap(pdata)
def ks2samp_full(grn_a_id, grn_b_id, mat):
    """Combined 2-sample Kolmogorov-Smirnov test between two GRN types.

    Runs a K-S test per metric column (columns 1..end of `mat`) between the
    observations belonging to `grn_a_id` and `grn_b_id` (matched against the
    module-level `grn_mat` id vector), then combines the per-metric p-values
    with Fisher's method. Returns the combined p-value.
    """
    # Vectorized row masks for the two GRN types of interest
    # (replaces the original True/False list comprehensions).
    grn_a_mask = grn_mat == grn_a_id
    grn_b_mask = grn_mat == grn_b_id
    # Ignore column 0 since it holds the GRN integer ids
    snt_mat_a = mat[grn_a_mask, 1:]
    snt_mat_b = mat[grn_b_mask, 1:]
    # BUG FIX: the original called warnings.filterwarnings("ignore"), which
    # silences warnings process-wide and leaks past this call; suppress them
    # only for the duration of the tests instead.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pvalues = [stats.ks_2samp(snt_mat_a[:, pid], snt_mat_b[:, pid]).pvalue
                   for pid in range(snt_mat_a.shape[1])]
        # Combine per-metric p-values to conclude about a single hypothesis.
        combined_pvalue = stats.combine_pvalues(pvalues, method='fisher')
    return combined_pvalue[1]
def generate_report(mat, method):
    """Print the pairwise K-S significance matrix for all GRN types.

    For every ordered pair of GRN ids, prints a significance marker
    (N/A on the diagonal, ***/**/*/NS by p-value) followed by the pair
    and its combined p-value.
    """
    print('p-values of K-S 2-sample test on {} features combined with Fisher:'.format(method))
    for a in range(num_grns):
        print('- - - - - - - - -')
        for b in range(num_grns):
            pvalue = ks2samp_full(a, b, mat)
            # Pick the significance marker for this pair.
            if a == b:
                marker = 'N/A'
            elif pvalue < 0.0001:
                marker = '***'
            elif pvalue < 0.001:
                marker = '**'
            elif pvalue < 0.05:
                marker = '*'
            else:
                marker = 'NS'
            print(marker + '\t', end='')
            print('\t'.join(str(el) for el in (a, b, pvalue)))
    print(str(len(snt_keys)) + " metrics used")
def visualize_embedding(mat, title):
    """Scatter-plot the first three and first two components of an embedding.

    mat : (n_samples, >=3) array of embedded coordinates. Points are colored
    by the module-level `grn_mat` integer ids and the 2-D plot is labeled
    with the module-level `grns` names.
    """
    # 3 components.
    # BUG FIX: Axes3D(fig) stopped attaching the axes to the figure
    # (deprecated in matplotlib 3.4, auto-add removed in 3.7), leaving the
    # 3-D scatter blank; create the axes via add_subplot instead.
    fig = plt.figure(figsize=(15, 15))
    ax = fig.add_subplot(projection='3d')
    ax.scatter(mat[:, 0], mat[:, 1], mat[:, 2],
               s=50, c=grn_mat, cmap=plt.get_cmap("viridis"))
    # 2 components
    fig, ax = plt.subplots(figsize=(10, 10))
    scatter = ax.scatter(mat[:, 0], mat[:, 1],
                         s=50, c=grn_mat, cmap=plt.get_cmap("viridis"))
    plt.legend(handles=scatter.legend_elements()[0], labels=grns)
    plt.title(title)
    plt.show()
# Standardize features by removing the mean and scaling to unit variance
norm_snt_mat = StandardScaler().fit_transform(snt_mat)

# SNT metrics analysis: attach each observation's integer GRN id as column 0
# of the standardized measurement matrix (column_stack promotes the 1-D id
# vector to a column, equivalent to hstack with np.newaxis).
labeled_snt = np.column_stack((grn_mat, norm_snt_mat))
# Print the values of the 2-sample K-S test
generate_report(labeled_snt, "SNT")
# t-SNE plot: embed the standardized metrics into 3 components and show them.
tsne_embedding = TSNE(n_components=3, random_state=13,
                      perplexity=30).fit_transform(norm_snt_mat)
visualize_embedding(tsne_embedding, "t-SNE")

# UMAP analysis: embed, rerun the K-S report on the embedded features, plot.
umap_embedding = UMAP(n_components=3, random_state=13).fit_transform(norm_snt_mat)
generate_report(np.column_stack((grn_mat, umap_embedding)), "UMAP")
visualize_embedding(umap_embedding, "UMAP")
# PCA analysis: project the standardized metrics onto 3 principal components.
pca = PCA(n_components=3)
pca.fit(norm_snt_mat)
print("Explained variance ratio: ", pca.explained_variance_ratio_)
print("Singular values: ", pca.singular_values_)
pca_embedding = pca.transform(norm_snt_mat)
generate_report(np.column_stack((grn_mat, pca_embedding)), "PCA")
visualize_embedding(pca_embedding, "PCA")