-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_model_development_all_except_death.txt
92 lines (69 loc) · 3.11 KB
/
2_model_development_all_except_death.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
Created on Sat Aug 08 12:20:45 2020
@authors: Labdaps team (http://www.fsp.usp.br/labdaps/)
"""
# 1) Read the cleaned and preprocessed CSV.
# pandas is imported here because no import binding `pd` is visible in this
# file; without it the first statement fails with NameError when run standalone.
import pandas as pd

# The file is semicolon-delimited and its first column is used as the row index.
dataset = pd.read_csv("file.csv", delimiter=";", index_col=0)
# Work on an explicit copy: re-wrapping with pd.DataFrame(dataset) does not copy
# by default, so the column edits below could otherwise reach `dataset` too.
df_bp = dataset.copy()
df_bp.shape  # notebook-style inspection; has no effect when run as a script

# 1.1) Check outcome proportion
df_bp['severe_outcome_combination'].value_counts()

# 1.2) Set class: the combined severe outcome, cast to int, becomes the
# 'class' column (appended last) and the original column is removed.
df_bp['class'] = df_bp['severe_outcome_combination'].astype('int')
df_bp = df_bp.drop(columns=['severe_outcome_combination'])
df_bp.head()
# 2) Remove the outcomes that are not the training target to avoid data leakage,
#    keeping a handle on the frame that still holds every outcome column
#    ("todos desfechos" = "all outcomes") for later testing on those outcomes.
df_bp_todos_desfechos = df_bp
# drop() returns a new frame, so df_bp_todos_desfechos still contains
# 'mv', 'death' and 'icu' after this rebinding.
df_bp = df_bp.drop(columns=['mv', 'death', 'icu'])
# 2.1) Prepare experiment
# Numeric variables are imputed by their median and random under-sampling is
# requested. Some feature types are declared explicitly because, per the
# original note, the tooling (referred to there as MLFlow) did not recognize
# all of them on its own.
# Disabled alternatives kept for reference:
#   remove_multicollinearity=True, multicollinearity_threshold=0.9,
#   numeric_imputation='ignore'
setup_options = dict(
    target='class',
    categorical_features=['sex', 'race'],
    numeric_features=['basophils', 'braden'],
    normalize=True,
    numeric_imputation='median',
    resample=True,
    resample_method='random_under',
)
exp = setup(df_bp, **setup_options)

# setup() is unpacked as a 9-item result; the last item is not used here.
(X, y,
 X_train, X_test,
 y_train, y_test,
 seed, prep_pipe, _) = exp
X_test.shape  # notebook-style inspection of the hold-out size
# 2.1.1) Filter the original dataframe with the X_test index.
# PS: these outcomes are not used in training because that would cause data
# leakage. They are attached after training and testing to check how the
# multipurpose algorithm performs on each single outcome.
outcomes_on_test = df_bp_todos_desfechos.loc[X_test.index]
outcomes_on_test  # notebook-style inspection of the hold-out rows
y_icu = outcomes_on_test["icu"]
y_mv = outcomes_on_test["mv"]
y_death = outcomes_on_test["death"]
# 2.2) Restrict the comparison to the 5 pre-selected models:
# CatBoost, XGBoost, Extra Trees, Random Forest and MLP.
# Every other model id is blacklisted.
excluded_models = [
    "lr", "knn", "nb", "dt", "svm", "rbfsvm", "gpc",
    "ridge", "qda", "ada", "gbc", "lda", "lightgbm",
]
compare_models(blacklist=excluded_models, turbo=False)

# 2.2.1) Tune models: every candidate is tuned for AUC with 20 iterations.
tuning_options = {'optimize': 'AUC', 'n_iter': 20}
cattuned = tune_model('catboost', **tuning_options)
mlptuned = tune_model('mlp', **tuning_options)
ettuned = tune_model('et', **tuning_options)
rftuned = tune_model('rf', **tuning_options)
xgbtuned = tune_model('xgboost', **tuning_options)
# 2.3) Select the best algorithm based on AUROC.
# Per the original note, predict_model scores X_test when called without data.
# `preds` is overwritten on every pass, so only the last candidate's
# predictions remain bound after the loop.
# NOTE(review): mlptuned is tuned earlier in this script but is not scored
# here - confirm this omission is intentional.
for tuned_candidate in (ettuned, cattuned, rftuned, xgbtuned):
    preds = predict_model(tuned_candidate)

# 2.4) Interpret the tuned random forest.
interpret_model(rftuned)

# 2.5) Finalize the tuned random forest.
final_model = finalize_model(rftuned)
# 3) Use the model to predict the unseen outcomes.
# The tuned (not the finalized) random forest is scored again so that `preds`
# holds its predictions.
preds = predict_model(rftuned)
preds  # notebook-style inspection of the prediction table

# With the scores calculated by the trained algorithm, how would its
# classifications fare on each of the other outcomes?
for unseen_outcome in (y_death, y_icu, y_mv):
    binary_classification_metrics(unseen_outcome, preds['Label'], preds['Score'])

# 4) Generation of ROC curves omitted