-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmulti_neural_classifier_fn2.py
207 lines (163 loc) · 8.24 KB
/
multi_neural_classifier_fn2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import os
import time
from multi_neural_classifier_UMAP_results import neural_results_on_UMAP
""" Contains function for neural network. Uses multi-NN to predict whether a co-stim cell is
- M1-dominant,
- M2-dominant,
- Mixed,
- Unclassified.
Outputs:
Full output of classification report
Full output of predicted labels for each cell
Plots predicted labels onto UMAP space """
def M1M2_neuralnet2(*, geneset=None, n_genes=None, fig_dir=None):
"""
This analysis will use M1 and M2 data to predict the classes in M1+M2.
Uses multiple neural nets to predict (independent of each other):
- M1, and nothing else
- M2, and nothing else
- A mix of M1 and M2, or
- No classification, termed unclassified.
Use top n_genes genes from the M1M2 volcano analysis (or the downregulated genes), specified by parameter
:param geneset: {unique_core, down_regulated, all}. Specifies which geneset to use, unique_core signature genes, or those genes
downregulated by co-stim. All specifies using all the genes in the dataset to build the classifier (not supported
in this function yet, will implement soon).
:param n_genes: specifies number of genes to use for neural network.
:param fig_dir: folder to save results
:return:
"""
# Set settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(scanpy=True, dpi=80, dpi_save=600, color_map='viridis', vector_friendly=False)
sc.settings.autoshow = False
fig_dir_o = 'figures/'
results_path = 'write/'
results_file = 'bmdm_object.h5ad'
data_dir = fig_dir_o + 'gene_programs/'
# ######################################################################################################################
# ######################################################################################################################
# Set n_genes if empty
if not n_genes:
n_genes = 'all'
# Read genesets based on parameter
if geneset == 'down_regulated':
geneset_file = data_dir + '/down_in_mixed_unique.pickle'
print("***\nRunning Neural network analysis with {} {} genes\n***".format(n_genes, geneset))
elif geneset == 'all_genes':
print("***\nRunning Neural network analysis with {}\n***".format(geneset))
if not os.path.isdir(fig_dir):
os.makedirs(fig_dir)
sc.settings.figdir = fig_dir
print("*** Starting neural network analysis ***")
adata = sc.read(results_path + results_file)
samples = ['M0', 'M1', 'M2', 'M1+M2']
# Build data sets to train classifier
M0_mask = np.array(adata.obs['sample'] == 'M0')
N0 = np.sum(M0_mask)
M1_mask = np.array(adata.obs['sample'] == 'M1')
N1 = np.sum(M1_mask)
M2_mask = np.array(adata.obs['sample'] == 'M2')
N2 = np.sum(M2_mask)
M12_mask = np.array(adata.obs['sample'] == 'M1+M2')
N12 = np.sum(M12_mask)
single_mask = M1_mask | M2_mask
cell_state = np.array(adata[single_mask].obs['sample'])
# Reduce to desired geneset
if geneset is not 'all_genes':
with open(geneset_file, 'rb') as f:
unique_genes = pickle.load(f)
# Print number of M1 and M2 genes
print("Number of M1 genes: {}\n"
"Number of M2 genes: {}".format(unique_genes['M1'].shape[0], unique_genes['M2'].shape[0]))
# Reduce to desired genelist
if n_genes == 'all': # This is scenario when you want all the genes in a specific unique or downreg geneset
genelist = np.concatenate((unique_genes['M1']['names'],
unique_genes['M2']['names']))
else:
genelist = np.concatenate((unique_genes['M1']['names'][0:n_genes],
unique_genes['M2']['names'][0:n_genes]))
adata2 = adata.raw[:, genelist].copy()
else:
# Use all genes
adata2 = adata.raw.copy()
print("Total number of genes: {}".format(adata2.shape[1]))
M1orM2 = pd.DataFrame(data=adata2[single_mask, :].X.todense(), columns=adata2.var_names)
M12 = pd.DataFrame(data=adata2[M12_mask, :].X.todense(), columns=adata2.var_names)
M12_obs_names = adata[M12_mask, :].obs_names
del adata, adata2
#Make Multilabel format
y = MultiLabelBinarizer().fit_transform(cell_state)
y = y[:,:-1]
#Split into test-train
X_train, X_test, y_train, y_test = train_test_split(M1orM2, y, test_size=0.10)
#Scale the data
scaler = StandardScaler()
# scaler.fit(X_train)
scaler.fit(M1orM2)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
M12 = scaler.transform(M12)
#Build classifier with tnree hidden layers, and multi-label one vs all approach
size_layer = round(X_train.shape[1]*1/2)
clf = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes=(size_layer, size_layer, size_layer),
activation='tanh', max_iter=500, verbose=False)) # three hidden layer
#Train classifier
print('---- Started training neural network ---')
tic = time.time()
clf.fit(X_train, y_train)
toc = time.time()
print('---- Elapsed time = {} minutes ----'.format((toc-tic)/60))
# Check accuracy
predicted = clf.predict(X_test)
print(classification_report(y_test, predicted))
report = pd.DataFrame(classification_report(y_test, predicted, output_dict=True)).T
report.to_excel(fig_dir + f"{geneset}_{size_layer}_neurons_multiclass_classification_report_full.xlsx")
print('---- Accuracy = {:.2%} ----'.format(accuracy_score(y_test, predicted)))
with open(fig_dir + f"{geneset}_{size_layer}_neurons_multiclass_classification_report.txt", "w") as text_file:
text_file.write('---- Accuracy = {:.2%} ----'.format(accuracy_score(y_test, predicted)))
# Predict M1 + M2 states
print('---- Started predicting M1+M2 with neural network ---')
M12_prediction = clf.predict(M12)
# Quantify predictions
print('---- Quantifying and saving results ---')
c_M1 = np.all((np.sum(M12_prediction, axis=1) == 1, M12_prediction[:,0] == 1), axis=0)
c_M2 = np.all((np.sum(M12_prediction, axis=1) == 1, M12_prediction[:,1] == 1), axis=0)
c_mixed = np.all((np.sum(M12_prediction, axis=1) == 2, M12_prediction[:,0] == 1, M12_prediction[:,1] == 1), axis=0)
c_unclass = np.sum(M12_prediction, axis=1) == 0
n_M1 = np.sum(c_M1)
n_M2 = np.sum(c_M2)
n_mixed = np.sum(c_mixed)
n_unclass = np.sum(c_unclass)
# Make label vector
# predicted_label = np.zeros(M12.shape[0], dtype='object')
predicted_label = pd.Series(index=M12_obs_names, dtype='object')
predicted_label[c_M1] = "M1_class"
predicted_label[c_M2] = "M2_class"
predicted_label[c_mixed] = "Mixed_class"
predicted_label[c_unclass] = "Unclassified_class"
print('{:.2%} classified as M1\n'
'{:.2%} classified as M2\n'
'{:.2%} classified as M1+M2\n'
'{:.2%} unclassified'.format(n_M1/N12, n_M2/N12, n_mixed/N12, n_unclass/N12))
print('{} classified as M1\n'
'{} classified as M2\n'
'{} classified as M1+M2\n'
'{} unclassified'.format(n_M1, n_M2, n_mixed, n_unclass))
d = {'Num': [n_M1, n_M2, n_mixed, n_unclass], 'Percent': [n_M1, n_M2, n_mixed, n_unclass]/N12}
out = pd.DataFrame(data=d, index=["M1-dominant", "M2-dominant", "Mixed", "Unclassified"])
out.to_excel(fig_dir + f"Fig3c_{geneset}_{size_layer}_neurons_multiclass_classifier_results.xlsx")
####################################################################################################################
# Plot on umap space
print('---- Overlaying classifications on UMAP space ---')
neural_results_on_UMAP(predicted_label=predicted_label, dataset='M1M2only', geneset_name=geneset, fig_dir_net=fig_dir)
####################################################################################################################