-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript_PLSDA_RF_SVM.py
120 lines (102 loc) · 3.9 KB
/
script_PLSDA_RF_SVM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
This script runs the baseline methods used for comparison in the article:
"Learning a confidence score and the latent space of a new Supervised Autoencoder
for diagnosis and prognosis in clinical metabolomic studies".
"""
#%%
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
import functions.functions_compare as fc
def append_mean_std_rows(scores):
    """Return *scores* with per-column "Mean" and "Std" rows appended.

    Both statistics are computed from the rows of *scores* as passed in, so
    the "Std" row is not influenced by the appended "Mean" row.

    Args:
        scores: DataFrame of numeric scores; statistics are taken down each
            column.

    Returns:
        A new DataFrame holding the original rows followed by a "Mean" row
        and a "Std" row.
    """
    mean = scores.mean(axis=0).to_frame("Mean")
    std = scores.std(axis=0).to_frame("Std")
    # Join on the transposed frame so the statistics become extra rows.
    return scores.T.join(mean).join(std).T


def append_mean_std_columns(frame):
    """Return *frame* with row-wise "Mean" and "Std" columns appended.

    The first column is excluded from the statistics; every other column is
    averaged across each row.

    Args:
        frame: DataFrame whose columns from position 1 onwards are numeric.

    Returns:
        A new DataFrame holding the original columns followed by "Mean" and
        "Std".
    """
    values = frame.iloc[:, 1:]
    stats = pd.DataFrame(
        {"Mean": values.mean(axis=1), "Std": values.std(axis=1)}
    )
    return frame.join(stats)


if __name__ == "__main__":
    # ------ Parameters ------
    # Alternative datasets:
    # filename = "LUNG.csv"
    # filename = "BRAIN_MID.csv"
    # filename = "GC_Brest_D_MB.csv"
    filename = "Th12F_meanFill.csv"

    outputPath = "results_compare/" + filename.split(".")[0] + "/"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(outputPath, exist_ok=True)

    SEEDS = [5, 6, 7]
    algo_list = [
        # "svm",
        "Lasso",
        "plsda",
        "RF",
        "logreg",
    ]  # ML algorithms to compare to
    # Note: svm takes a long time because of the parameter grid search

    doScale = True  # standardize each column of X (scale(..., axis=0))
    doTopgenes = True  # feature ranking

    # ------ Load data ------
    X, Yr, nbr_clusters, feature_names = fc.readData(filename)

    # ------ Data preprocessing ------
    X = np.log(abs(X + 1))
    X = X - np.mean(X, axis=0)
    if doScale:
        X = scale(X, axis=0)

    # ------ Main ------
    print("Started training")
    acc_per_seed = []
    auc_per_seed = []
    df_featureList_final = None  # one accumulated DataFrame per algorithm
    for seed in SEEDS:
        print(f"------ Seed {seed} ------")
        results = fc.basic_run_other(
            X,
            Yr,
            nbr_clusters,
            algo_list,
            genenames=feature_names,
            clusternames=None,
            nfold=4,
            rng=seed,
            doTopGenes=doTopgenes,  # honour the flag instead of forcing True
        )
        # NOTE(review): the return shape of basic_run_other when
        # doTopGenes=False is not visible from this script; only read the
        # fourth value (the feature rankings) when ranking was requested.
        accTestCompare, df_timeElapsed, aucTestCompare = results[:3]

        # Rewritten on every seed: the file ends up holding the last seed's
        # timings only.
        df_timeElapsed.to_csv(outputPath + "timeElapsed.csv")

        # NOTE(review): the first 4 rows are presumably the per-fold scores
        # matching nfold=4 above -- confirm against fc.basic_run_other before
        # changing either value.
        acc_per_seed.append(accTestCompare.iloc[:4, :])
        auc_per_seed.append(aucTestCompare.iloc[:4, :])

        if doTopgenes:
            df_featureList = results[3]
            if df_featureList_final is None:
                # First seed: start accumulating from this seed's rankings.
                df_featureList_final = df_featureList
            else:
                # Later seeds: add this seed's weights as a suffixed column.
                for met, accumulated in enumerate(df_featureList_final):
                    df_featureList_final[met] = accumulated.join(
                        df_featureList[met]["weights"],
                        rsuffix=" {}".format(seed),
                    )

    if doTopgenes:
        for met, accumulated in enumerate(df_featureList_final):
            df_featureList_final[met] = append_mean_std_columns(accumulated)

    accTestCompare_final = append_mean_std_rows(pd.concat(acc_per_seed))
    accTestCompare_final.to_csv(outputPath + "accCompare.csv")

    aucTestCompare_final = append_mean_std_rows(pd.concat(auc_per_seed))
    aucTestCompare_final.to_csv(outputPath + "aucCompare.csv")

    if doTopgenes:
        for idx, algo in enumerate(algo_list):
            df_featureList_final[idx].to_csv(f"{outputPath}topgenes_{algo}.csv")