-
Notifications
You must be signed in to change notification settings - Fork 0
/
Minimal_Panel_clean.py
182 lines (168 loc) · 9.06 KB
/
Minimal_Panel_clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python
# coding: utf-8
import math  # Mathematical operations (check if nan)
import os  # To access directories

import matplotlib.pyplot as plt  # To plot data
import numpy as np  # To perform matrix operations
import pandas as pd  # To read data files
# NOTE(review): original imported nonexistent `RF`; the code below uses RFE.
from sklearn.feature_selection import RFE, SequentialFeatureSelector  # Feature elimination tools
from sklearn.linear_model import LogisticRegression  # Linear classifier
# Obtain metrics to test classifier
from sklearn.metrics import f1_score,confusion_matrix,roc_curve,ConfusionMatrixDisplay,auc,balanced_accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV  # Split data to train-test, perform Grid search
from sklearn.preprocessing import StandardScaler  # Normalize data
# Build the omic-profile combinations to benchmark.
# Each X_* matrix is assumed to be (n_patients, n_features) with rows aligned
# across cytokine/protein/metabolite blocks — TODO confirm against the loader
# (the X_acute_*/X_long_* matrices and patient_labels are defined elsewhere).
X_acute_cp = np.hstack((X_acute_cyt, X_acute_pro))    # cytokines + proteins, acute
X_long_cp = np.hstack((X_long_cyt, X_long_pro))       # cytokines + proteins, long
X_acute_cm = np.hstack((X_acute_cyt, X_acute_meta))   # cytokines + metabolites, acute
X_long_cm = np.hstack((X_long_cyt, X_long_meta))      # cytokines + metabolites, long
X_acute_pm = np.hstack((X_acute_pro, X_acute_meta))   # proteins + metabolites, acute
X_long_pm = np.hstack((X_long_pro, X_long_meta))      # proteins + metabolites, long
X_acute = np.hstack((X_acute_cyt, X_acute_pro, X_acute_meta))  # all three, acute
X_long = np.hstack((X_long_cyt, X_long_pro, X_long_meta))      # all three, long

# Color vector and label vector for the classifier (0 = event-free, 1 = event).
label_dict_comb = {0: 'green', 1: 'red'}
class_dict_comb = {'Event-Free': 0, 'With Event': 1}
cvec = [label_dict_comb[label] for label in patient_labels]
# FIX: original line ended with a stray '.' (syntax error): `org_label=patient_labels.`
org_label = patient_labels

# Data matrices to evaluate; order MUST stay aligned with NAMES below.
DATA = [
    X_acute, X_acute_cyt, X_acute_pro, X_acute_meta,
    X_acute_cp, X_acute_cm, X_acute_pm,
    X_long, X_long_cyt, X_long_pro, X_long_meta,
    X_long_cp, X_long_cm, X_long_pm,
    X_long - X_acute,  # 'Delta': change from acute to long timepoint
]
NAMES = ['Acute', 'Acute_Cyt', 'Acute_Pro', 'Acute_Meta', 'Acute_CP', 'Acute_CM', 'Acute_PM',
         'Long', 'Long_Cyt', 'Long_Pro', 'Long_Meta', 'Long_CP', 'Long_CM', 'Long_PM', 'Delta']
#Function to perform 5 fold cross validation
def cross_validation(model, _X, _y, _cv=5):
    """Run ``_cv``-fold cross-validation on ``model`` and summarize the scores.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to evaluate.
    _X : array-like
        Feature matrix.
    _y : array-like
        Binary labels (precision/recall/f1 assume a positive class of 1 —
        TODO confirm against the label encoding used upstream).
    _cv : int, default=5
        Number of folds.

    Returns
    -------
    dict
        Per-fold arrays and means for balanced accuracy, precision, recall and
        F1 on both training and validation splits. Accuracy means are scaled
        to percent (*100); the other means are left as fractions, matching the
        original reporting convention.
    """
    _scoring = ['balanced_accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(estimator=model,
                             X=_X,
                             y=_y,
                             cv=_cv,
                             scoring=_scoring,
                             return_train_score=True)
    # FIX: original had stray '>' characters before two of the expressions
    # below ("Mean Validation Accuracy"/"Mean Validation Precision"),
    # which were syntax errors introduced by the page scrape.
    return {"Training Accuracy scores": results['train_balanced_accuracy'],
            "Mean Training Accuracy": results['train_balanced_accuracy'].mean() * 100,
            "Training Precision scores": results['train_precision'],
            "Mean Training Precision": results['train_precision'].mean(),
            "Training Recall scores": results['train_recall'],
            "Mean Training Recall": results['train_recall'].mean(),
            "Training F1 scores": results['train_f1'],
            "Mean Training F1 Score": results['train_f1'].mean(),
            "Validation Accuracy scores": results['test_balanced_accuracy'],
            "Mean Validation Accuracy": results['test_balanced_accuracy'].mean() * 100,
            "Validation Precision scores": results['test_precision'],
            "Mean Validation Precision": results['test_precision'].mean(),
            "Validation Recall scores": results['test_recall'],
            "Mean Validation Recall": results['test_recall'].mean(),
            "Validation F1 scores": results['test_f1'],
            "Mean Validation F1 Score": results['test_f1'].mean()
            }
#Function to plot 5 fold cross validation results
def plot_result(x_label, y_label, plot_title, train_data, val_data, name):
    """Plot a grouped bar chart of per-fold training vs. validation scores.

    Parameters
    ----------
    x_label : str
        Name of the algorithm used for training, e.g. 'Decision Tree'.
    y_label : str
        Name of the metric being visualized, e.g. 'Accuracy'.
    plot_title : str
        Title of the plot, e.g. 'Accuracy Plot'.
    train_data : list or array
        Per-fold training precision, accuracy, or F1 scores.
    val_data : list or array
        Per-fold validation precision, accuracy, or F1 scores.
    name : str
        Basename used for the saved PNG file.

    Returns
    -------
    None. Shows the chart and saves it under
    'MinimalPanel_New/RFE/FU_2023/Minimal/IMAGES/Cross_Val/' (the directory
    must already exist — savefig does not create it).
    """
    plt.figure(figsize=(12, 6))
    # FIX: original had a scrape artifact '>' inside the last label ("5th >Fold").
    labels = ["1st Fold", "2nd Fold", "3rd Fold", "4th Fold", "5th Fold"]
    # Truncate labels in case fewer than 5 folds were run.
    X_axis = np.arange(len(labels[:len(train_data)]))
    plt.ylim(0.0, 1)
    plt.bar(X_axis - 0.2, train_data, 0.4, color='blue', label='Training')
    plt.bar(X_axis + 0.2, val_data, 0.4, color='red', label='Validation')
    plt.title(plot_title, fontsize=30)
    plt.xticks(X_axis, labels[:len(train_data)])
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.savefig('MinimalPanel_New/RFE/FU_2023/Minimal/IMAGES/Cross_Val/' + str(name) + '_CV.png')
    plt.show()
FPR = []
TPR = []
FPR_ALL = []
TPR_ALL = []
results_df = pd.DataFrame()

# For every omic combination: select features with RFE, hold out a test split,
# grid-search a logistic regression, cross-validate it, then report test
# metrics, confusion matrix, ROC curve and model weights.
for i in range(len(DATA)):
    X = DATA[i]
    results_dict = {}

    # --- Feature elimination -------------------------------------------------
    # FIX: original ran RFE with undefined lowercase `estimator` and *after*
    # referencing X_train before it existed; RFE now runs first with a fresh
    # base model, then the hold-out split, then the grid search.
    n_features = 20  # number of features to be selected
    base_model = LogisticRegression(penalty='l2', max_iter=5000)
    selector = RFE(base_model, step=1, n_features_to_select=n_features)
    selector.fit(X, org_label)
    mask = selector.support_          # boolean mask of retained columns
    X_low = selector.transform(X)     # data reduced to the selected features

    # Hold out 10% of samples for final testing (sample_num presumably holds
    # per-patient IDs aligned with X — TODO confirm against the loader).
    X_train, X_test, y_train, y_test, train_id, test_id = train_test_split(
        X_low, org_label, sample_num, test_size=0.1, random_state=42)

    # --- Grid search over class weights and regularization strength ----------
    parameters = {'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 2}, {0: 1, 1: 5}, {0: 1, 1: 10},
                                   {0: 2, 1: 1}, {0: 10, 1: 1}, 'balanced'],
                  'C': [1E-5, 1E-3, 0.1, 1, 10, 100, 1000]}
    _scoring = ['balanced_accuracy', 'precision', 'recall', 'f1']
    clf = GridSearchCV(LogisticRegression(penalty='l2', max_iter=5000),
                       parameters, scoring=_scoring, refit='balanced_accuracy')
    clf.fit(X_train, y_train)
    results_dict = dict(clf.best_params_)
    # Flatten a dict-valued class_weight into "w0,w1" so it serializes cleanly.
    if clf.best_params_['class_weight'] != 'balanced':
        cw = clf.best_params_['class_weight']
        results_dict['class_weight'] = str(cw[0]) + ',' + str(cw[1])
    Estimator = clf.best_estimator_

    # --- 5-fold cross-validation on the training split -----------------------
    # FIX: original called cross_validation with undefined `Clf`.
    Clf_result = cross_validation(Estimator, X_train, y_train, 5)
    plot_result('Linear Classifier ' + str(NAMES[i]), "Accuracy",
                "Balanced Accuracy scores in 5 Folds",
                Clf_result["Training Accuracy scores"],
                Clf_result["Validation Accuracy scores"], NAMES[i])

    # --- Final fit and held-out evaluation -----------------------------------
    Estimator.fit(X_train, y_train)                  # Fit the classifier
    y_pred = Estimator.predict(X_test)               # Predict labels for unseen data
    y_score = Estimator.decision_function(X_test)    # Signed distance to the boundary
    y_train_pred = Estimator.predict(X_train)
    results_dict['Test Accuracy'] = balanced_accuracy_score(y_test, y_pred)
    results_dict['Test F1 Score'] = f1_score(y_test, y_pred)
    results_dict['Train Accuracy'] = balanced_accuracy_score(y_train, y_train_pred)
    results_dict['Train F1 Score'] = f1_score(y_train, y_train_pred)

    # Confusion matrix (FIX: original used undefined `Clf` and was missing the
    # closing parenthesis on the ConfusionMatrixDisplay call).
    cm = confusion_matrix(y_test, y_pred, labels=Estimator.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=Estimator.classes_)
    disp.plot()
    plt.savefig('MinimalPanel_New/RFE/FU_2023/IMAGES/Confusion_Matrix/' + str(NAMES[i]) + '_CM.png')

    print('Saving results for ', NAMES[i])
    results_df = pd.concat([results_df, pd.DataFrame(results_dict, index=[0])], ignore_index=True)
    results_df.to_csv('MinimalPanel_New/RFE/FU_2023/Summary_new.csv')  # Change to the location to store results in

    # ROC curve on the held-out test set.
    # FIX: original paired full-length org_label with test-only y_score
    # (length mismatch); the scores belong to y_test.
    fpr_all, tpr_all, _ = roc_curve(y_test, y_score)
    area = auc(fpr_all, tpr_all)
    plt.figure()
    lw = 3
    label = 'Linear ' + NAMES[i] + ' (area= %0.2f)'
    plt.plot(fpr_all, tpr_all, color='blue', lw=lw, label=label % area,)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves All Data')
    plt.legend(loc='lower right')
    plt.savefig('MinimalPanel_New/RFE/FU_2023/IMAGES/ROC/' + str(NAMES[i]) + '_ROC.png')

    # Persist the selected feature names with their learned weights plus bias.
    # mols_list is presumably a per-dataset list of feature names — TODO confirm.
    Weights_dict = {}
    Weights_dict['Features'] = np.array(mols_list[i])[mask]
    Weights_dict['Weights'] = Estimator.coef_[0]
    Weights_df = pd.DataFrame(Weights_dict)
    pd.concat([Weights_df,
               pd.DataFrame({'Features': 'Bias', 'Weights': Estimator.intercept_[0]}, index=[0])],
              ignore_index=True).to_csv('MinimalPanel_New/RFE/FU_2023/Minimal/Weights/' + NAMES[i] + '.csv')  # Change to desired location