# main_with_model_loading.py
import argparse
import pandas as pd
from os.path import isfile
import auxiliary.modelPlots as plottery
import auxiliary.outlierDetection as outliers
from input_output.TitanicLoader import TitanicLoader
from preprocessing.TitanicPreprocessor import TitanicPreprocessor
from featureEngineering.TitanicFeatures import TitanicFeatures
from input_output.TitanicSaver import TitanicSaver
from validation.testPerformance import testAccuracy
from ensembles.votingEnsemble import VotingEnsemble
from models.RandomForestModel import RF
from models.SVMModel import SVM
from models.KNNModel import KNN
from models.MLPModel import MLP
from models.GRBTModel import GRBT
from models.GPModel import GP
from models.XGBoostModel import XGBoost
from models.BayesModel import Bayes
from models.AdaBoostModel import AdaBoostModel as AdaBoost
from models.ExtraTreesModel import ExtraTreesModel as ET
from models.LogitRegModel import LogitRegModel as Logit
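# All model classes above are expected to share the interface the pipeline
# uses below: a .name attribute, feature_selection(x, y), train(x, y, params),
# test(x, labels), and a .predictions attribute holding the test predictions.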
class Pipeline(object):
    def __init__(self, loader=TitanicLoader, preprocessor=TitanicPreprocessor, features=TitanicFeatures,
                 models=[RF], saver=TitanicSaver, training_mode=False):
        # Placeholder parser: no command-line flags are defined yet.
        parser = argparse.ArgumentParser()
        self.args = parser.parse_args()
        self.params = None
        self.model_params = None
        self.training_data_file = "Data/train.csv"
        self.test_data_file = "Data/test.csv"
        self.loader = loader()
        self.preprocessor = preprocessor()
        self.features = features()
        self.models = [m(self.params) for m in models]
        self.saver = saver()
        self.training_mode = training_mode
    def run(self):
        # Load the data. test_labels are the PassengerIds, which we need to
        # keep for the submission file.
        x_train, y_train, x_test, test_labels = self.loader.load_split(
            training_data_file=self.training_data_file, test_data_file=self.test_data_file)

        # Detect and drop outliers from the training set.
        out = outliers.detect_outliers(x_train, 2, ["Age", "SibSp", "Parch", "Fare"])
        print("Dropping", len(out), "outliers")
        x_train = x_train.drop(out, axis=0).reset_index(drop=True)
        y_train = y_train.drop(out, axis=0).reset_index(drop=True)
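        # detect_outliers is assumed to implement something like Tukey's IQR
        # rule: flag a row when more than n (here 2) of the listed numeric
        # features fall outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. That is an
        # assumption about auxiliary/outlierDetection.py, not a guarantee.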
        # Process train and test together so both end up with the same feature
        # columns (one-hot encoding, for example, must see every category).
        combined = pd.concat([x_train, x_test], ignore_index=True)
        preprocessed = self.preprocessor.preprocess_datasets([combined])
        engineered = self.features.engineer_features_multiple_ds(preprocessed)[0]

        # Sanity check: no rows were lost or added along the way.
        assert len(engineered) == len(x_train) + len(x_test)

        # Split the combined set back into train and test.
        x_train = engineered[0:len(x_train)]
        x_test = engineered[len(x_train):]
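        # Example of why the sets are combined first: if "Embarked" contained
        # only {"S", "C"} in the test split, one-hot encoding test on its own
        # would yield fewer dummy columns than train, and the two feature
        # matrices would no longer line up.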
        # Train (or load) all the models.
        for i, model in enumerate(self.models):
            print("\nUsing", model.name)
            # When not training, load the model from 'saved_models' if available.
            if not self.training_mode and isfile('saved_models/' + model.name + '.pkl'):
                model = self.loader.load_pkl('saved_models/' + model.name + '.pkl')
                # Store the loaded model back so the ensemble below uses it too.
                self.models[i] = model
            # Otherwise train from scratch (in training mode, or when no saved
            # model is available during testing).
            else:
                if not self.training_mode:
                    print("Saved model not found, training from scratch.")
                # Select the optimal features for this model, then train on them.
                model.feature_selection(x_train, y_train)
                model.train(x_train, y_train, self.model_params)
            # Generate predictions for the test set and write them to a CSV file.
            print("Predicting test set..")
            model.test(x_test, test_labels)
            self.saver.save_predictions(model.predictions, 'predictions/' + model.name + '.csv')
            print("Accuracy on test set is:", testAccuracy(model.name))
        # In training mode, save the (improved) models and their scores to disk.
        if self.training_mode:
            test_accuracies = [testAccuracy(model.name) for model in self.models]
            self.saver.save_models(self.models, test_accuracies)

        # Build an ensemble from all the trained models and test its predictions.
        # NOTE: assumes the models were trained with grid search.
        ve = VotingEnsemble(params=[], models=self.models)
        ve.feature_selection(x_train, y_train)
        ve.test(x_test, test_labels)
        self.saver.save_predictions(ve.predictions, 'predictions/' + ve.name + '.csv')
        print("Accuracy on test set is:", testAccuracy(ve.name))
        # Show accuracies and correlation of the models (optional):
        # plottery.compareModelAcc(self.models)
        # plottery.plotModelCorrelation(self.models)
if __name__ == '__main__':
    Pipeline(loader=TitanicLoader, preprocessor=TitanicPreprocessor, features=TitanicFeatures,
             models=[RF, KNN, ET, SVM, Logit, Bayes], saver=TitanicSaver,
             training_mode=True).run()
# Combinations that work well: RF, KNN, ET, SVM, Logit, Bayes
#                          and: RF, KNN, ET, SVM, AdaBoost, Bayes
# All models: Bayes, GP, GRBT, RF, AdaBoost, Logit, SVM, XGBoost, MLP, KNN, ET
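# To reuse the models saved by a training run (loaded from 'saved_models/'
# instead of retrained), a hypothetical inference invocation would be:
#   Pipeline(models=[RF, KNN, ET, SVM, Logit, Bayes], training_mode=False).run()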