"""
Created on Sat Aug 08 12:20:45 2020
@authors: Labdaps team (http://www.fsp.usp.br/labdaps/)
"""
# Imports: pandas for data handling; the modelling functions used below
# (setup, compare_models, tune_model, predict_model, interpret_model,
# finalize_model) follow the PyCaret classification API. The 'resample'
# options in setup() suggest the authors' modified PyCaret fork, so a
# standard PyCaret installation may not accept every parameter as written.
import pandas as pd
from pycaret.classification import *

# 1) Read cleaned and preprocessed CSV
df_bp = pd.read_csv("file.csv", delimiter=";", index_col=0)
df_bp.shape
#1.1) Check outcome proportion
df_bp['icu'].value_counts()
# 1.2) Set the target column ('class') from the ICU outcome
df_bp['class'] = df_bp['icu'].astype('int')
df_bp = df_bp.drop(['icu'],axis=1)
df_bp.head()
# 2) Drop the other outcome columns ('mv' and 'death') to avoid data leakage
df_bp = df_bp.drop(['mv', 'death'], axis=1)
# 2.1) Prepare the experiment
# Median imputation for numeric variables and random under-sampling of the majority class.
# Some feature types are declared explicitly because automatic type inference
# does not recognize all of them at once.
exp = setup(df_bp, target='class',
            categorical_features=['sex', 'race'],
            numeric_features=['basophils', 'braden'],
            normalize=True,
            numeric_imputation='median',   # alternative tried: numeric_imputation='ignore'
            # remove_multicollinearity=True,
            # multicollinearity_threshold=0.9,
            resample=True,
            resample_method='random_under')
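# --- Illustrative sketch (not part of the original pipeline) -----------------
# The 'resample'/'resample_method' arguments above are not available in standard
# PyCaret. With a standard installation, an equivalent random under-sampling of
# the majority class could be done beforehand with imbalanced-learn; the names
# below (rus, X_bal, y_bal, df_bp_balanced) are hypothetical.
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)        # drop majority-class rows at random
X_bal, y_bal = rus.fit_resample(df_bp.drop('class', axis=1), df_bp['class'])
df_bp_balanced = X_bal.copy()
df_bp_balanced['class'] = y_bal.values           # reattach the target column
# df_bp_balanced could then be passed to a standard PyCaret setup() call
# ------------------------------------------------------------------------------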
X, y, X_train, X_test, y_train, y_test, seed, prep_pipe, _ = exp
X_test.shape
# 2.2) Compare only the 5 pre-selected models:
# CatBoost, XGBoost, Extra Trees, Random Forest and MLP (all other estimators are blacklisted)
compare_models(blacklist = ["lr","knn","nb","dt","svm","rbfsvm","gpc","ridge","qda","ada","gbc","lda","lightgbm"] , turbo = False)
# 2.2.1) Tune models
cattuned = tune_model('catboost', optimize='AUC', n_iter=20)
mlptuned = tune_model('mlp', optimize='AUC', n_iter=20)
ettuned = tune_model('et', optimize='AUC', n_iter=20)
rftuned = tune_model('rf', optimize='AUC', n_iter=20)
xgbtuned = tune_model('xgboost', optimize='AUC', n_iter=20)
# 2.3) Select the best algorithm based on AUROC
# predict_model() scores each tuned model on the hold-out test set (X_test)
preds = predict_model(ettuned)
preds = predict_model(cattuned)
preds = predict_model(rftuned)
preds = predict_model(xgbtuned)
# 2.4) Interpret the selected model (SHAP-based feature importance)
interpret_model(rftuned)
# 2.5) Finalize model
final_model = finalize_model(rftuned)
# 3) Use the selected model to predict the outcome
#    (without a 'data' argument, predict_model scores the hold-out test set)
preds = predict_model(rftuned)
preds
# Check performance on the test set (AUROC, sensitivity/recall, specificity, precision/PPV)
# binary_classification_metrics is not part of PyCaret; it is presumably a helper defined elsewhere by the authors
binary_classification_metrics(y_test, preds['Label'], preds['Score'])
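# --- Illustrative sketch (assumption) -----------------------------------------
# A minimal version of such a metrics helper, assuming 'Label' holds the
# predicted class and 'Score' the predicted probability of the positive class
# (the function name below is hypothetical, not the authors' implementation):
from sklearn.metrics import roc_auc_score, confusion_matrix

def sketch_binary_classification_metrics(y_true, y_pred, y_score):
    """Print AUROC, sensitivity, specificity and precision (PPV)."""
    tn, fp, fn, tp = confusion_matrix(y_true, pd.Series(y_pred).astype(int)).ravel()
    print("AUROC       :", round(roc_auc_score(y_true, y_score), 3))
    print("Sensitivity :", round(tp / (tp + fn), 3))   # recall of the positive class
    print("Specificity :", round(tn / (tn + fp), 3))
    print("Precision   :", round(tp / (tp + fp), 3))   # positive predictive value
# ------------------------------------------------------------------------------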
# 4) Generation of ROC curves omitted
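# --- Illustrative sketch (not the authors' original ROC code) -----------------
# A minimal ROC curve for the selected model, assuming 'Score' holds the
# positive-class probability as in the metrics call above:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(y_test, preds['Score'])
plt.plot(fpr, tpr, label="Random Forest (AUC = %.3f)" % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle="--")   # chance line
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.show()
# ------------------------------------------------------------------------------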