-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain_test.py
80 lines (54 loc) · 2.16 KB
/
train_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 16 18:15:08 2019
@author: shiming
"""
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
import xgboost as xgb
from joblib import dump # model persistence
# Path to the Excel data workbook (one dataset per sheet).
# TODO: set this before running; with an empty path pd.ExcelFile below fails.
file = r''
# Companion workbook "<name>_Index.xlsx". Its sheets are read by position to
# match the data sheets. Each row describes one experiment and each column
# carries a role code for the data column of the same name:
# 1 or 3 -> input variable, 2 -> outcome variable.
index_file = file.replace('.xlsx', '_Index.xlsx')
xls = pd.ExcelFile(file)
sheets = xls.sheet_names
index_xls = pd.ExcelFile(index_file)
n_splits = 20

################
# config xgboost
# Values come from the configuration that was previously commented out. The
# `silent` flag is dropped because current XGBoost no longer accepts it.
# eval_metric is given to the constructor because recent XGBoost releases no
# longer accept it as a fit() argument.
model_params = {
    'scale_pos_weight': 1,
    'learning_rate': 0.01,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 100,
    'reg_alpha': 0.3,
    'max_depth': 4,
    'gamma': 10,
    'eval_metric': ["auc", "error"],
}
################

## over sheets
for sheet_idx, sheet in enumerate(sheets):
    df = xls.parse(sheet)
    df_idx = index_xls.parse(sheet_idx)
    # TODO: preprocessing of the sheet data goes here.
    n_exp = df_idx.shape[0]
    var = df_idx.keys()

    ## over experiments
    for idx_exp in range(n_exp):
        # Role codes of every column for this experiment.
        # NOTE(review): the original always read row 0 for the input columns
        # while reading row idx_exp for the outcomes; using the experiment's
        # own row for both is assumed to be the intent - confirm.
        roles = df_idx.iloc[idx_exp]
        # .values turns the label-indexed mask into a plain boolean array so
        # the Index is masked positionally.
        input_var = var[((roles == 1) | (roles == 3)).values]

        ## over outcomes
        for outcome_var in var[(roles == 2).values]:
            # TODO: do feature selection if necessary.
            this_X = df[input_var]
            this_y = df[outcome_var]

            ## over splits
            shufflesplit = StratifiedShuffleSplit(
                n_splits=n_splits, random_state=42, test_size=0.2)
            # split() yields positional row indices, hence .iloc below.
            for split_idx, (train_idx, test_idx) in enumerate(
                    shufflesplit.split(this_X, this_y)):
                X_train, X_test = this_X.iloc[train_idx], this_X.iloc[test_idx]
                y_train, y_test = this_y.iloc[train_idx], this_y.iloc[test_idx]
                eval_set = [(X_train, y_train), (X_test, y_test)]

                # Do model training and testing. A fresh estimator per split
                # keeps each persisted model independent of the others.
                this_model = xgb.XGBClassifier(**model_params)
                this_model.fit(X_train, y_train, eval_set=eval_set)

                # One file per (sheet, experiment, outcome, split) so later
                # fits do not overwrite earlier ones.
                dump(this_model, '{}_exp{}_{}_split{}.joblib'.format(
                    sheet, idx_exp, outcome_var, split_idx))
            # TODO: do model-based feature selection if necessary.
            # TODO: collect all models' performance; persist results.
            # TODO: do model selection.