-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodels.py
297 lines (238 loc) · 9.56 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# -----------------------------------------------------------
# Modeling related
#
# Author: Nicole Sung
# Created: 9/14/2023
# Modified: 10/4/2023
#
# -----------------------------------------------------------
# Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
# Regression
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import math
# General
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
import warnings, time
# Define a dictionary of regressor names and their corresponding models
# Registry of display name -> pre-constructed regressor instance, iterated by
# evaluate_models() when model_selection == 'regression'.
REGRESSOR_FUNCTIONS = {
    # Mean-predicting baseline; any real model should beat this.
    "Naive": DummyRegressor(strategy="mean"),
    "Baseline": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Elastic Net Regression": ElasticNet(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    # NOTE(review): RandomForestRegressor, SVR and MLPRegressor are imported
    # above but not registered here — confirm whether that is intentional.
}
# Define a dictionary of classifier names and their corresponding models
# Registry of display name -> pre-constructed classifier instance, used by
# CustomClassifierPipeline and by evaluate_models() for classification runs.
CLASSIFIER_MODELS = {
    # Majority-class baseline; any real classifier should beat this.
    'Naive': DummyClassifier(strategy='most_frequent'),
    'DecisionTree': DecisionTreeClassifier(),
    # NOTE(review): default-strategy DummyClassifier largely duplicates the
    # 'Naive' entry above — confirm both are wanted.
    'Dummy': DummyClassifier(),
    'SVC': SVC(),
    'RandomForest': RandomForestClassifier(),
    'GaussianNB': GaussianNB(),
    'MLP': MLPClassifier()
}
class CustomClassifierPipeline(BaseEstimator, TransformerMixin):
    """Hold a fixed train/test split and run every classifier registered in
    CLASSIFIER_MODELS against it, collecting each model's test-set predictions.
    """

    def __init__(self, X_train, y_train, X_test, y_test):
        # The split is stored exactly as given; no re-shuffling happens here.
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # name -> predicted labels, filled in by evaluate_classifiers().
        self.results = {}

    def evaluate_classifiers(self):
        """Fit each registered classifier and record its predictions for X_test."""
        for clf_name, clf in CLASSIFIER_MODELS.items():
            # fit() returns the estimator itself, so fit-and-predict chains.
            self.results[clf_name] = clf.fit(self.X_train, self.y_train).predict(self.X_test)

    def get_results(self):
        """Return the mapping of classifier name -> predicted labels."""
        return self.results
def evaluate_models_pipe(X, y, model_selection, test_size=0.3, random_state=1, suppress_warnings=True):
    """Split X/y once, run every classifier in CLASSIFIER_MODELS on the split,
    and return a dict mapping classifier name to its test-set predictions.

    NOTE(review): `model_selection` is accepted but never used here — confirm
    whether regression support was intended for this pipeline entry point.
    """
    if suppress_warnings:
        warnings.filterwarnings('ignore')
    split = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_tr, X_te, y_tr, y_te = split
    pipeline = CustomClassifierPipeline(X_tr, y_tr, X_te, y_te)
    pipeline.evaluate_classifiers()
    return pipeline.get_results()
def model_analysis(X_train, X_test, y_train, y_test, model_type, model, additional_info=False):
    """Fit `model` on the training split, time the fit, predict on X_test,
    and compute the metric dict appropriate to `model_type`.

    Returns:
        (fitted model, test-set predictions, metrics dict) — the metrics dict
        comes from compute_classification_matrix() when model_type is
        "classification", otherwise from compute_regression_matrix().
    """
    # Time only the fit, not the prediction.
    fit_start = time.time()
    model.fit(X_train, y_train)
    elapsed_time = time.time() - fit_start
    y_pred = model.predict(X_test)
    print(f"{model.__class__.__name__} Model Fitting Time: {elapsed_time:.3f} seconds")
    # Any model_type other than "classification" falls through to regression.
    if model_type == "classification":
        matrix = compute_classification_matrix(y_test, y_pred, model)
    else:
        matrix = compute_regression_matrix(y_test, y_pred, model, additional_info)
    return model, y_pred, matrix
# Evaluates a list of models and returns the best classifier based on accuracy.
def evaluate_models(X, y, model_selection, test_size=0.3, random_state=1, suppress_warnings=True):
    """Evaluate every registered model of the requested kind and return the best.

    Splits X/y once, runs each model through model_analysis(), and keeps the
    model with the highest accuracy (classification) or lowest MSE (regression).

    Args:
        X, y: feature matrix and target vector.
        model_selection: 'classification' or 'regression' — selects the registry.
        test_size, random_state: forwarded to train_test_split.
        suppress_warnings: silence sklearn convergence chatter when True.

    Returns:
        The fitted best model (None if no candidate improved on the initial
        threshold, e.g. every classifier scored 0 accuracy).
    """
    if suppress_warnings:
        warnings.filterwarnings('ignore')
    best_model = None
    # Accuracy is maximized from 0; MSE is minimized from +inf.
    best_metric = 0 if model_selection == 'classification' else float('inf')
    best_y_pred = None
    best_name = ""
    # BUG FIX: the classifier registry is named CLASSIFIER_MODELS; the previous
    # reference to the undefined name CLASSIFIER_FUNCTIONS raised NameError on
    # every classification run.
    MODEL_FUNCTIONS = CLASSIFIER_MODELS if model_selection == 'classification' else REGRESSOR_FUNCTIONS
    print(f"\nPhase 4: {model_selection.capitalize()} Modeling")
    print("="*60)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    for name, model_func in MODEL_FUNCTIONS.items():
        model, y_pred, metric = model_analysis(X_train, X_test, y_train, y_test, model_selection, model_func)
        # Pick the comparison direction once, then update all best-* together.
        if model_selection == 'classification':
            current = metric['accuracy']
            improved = current > best_metric
        else:
            current = metric['mse']
            improved = current < best_metric
        if improved:
            best_metric = current
            best_model = model
            best_y_pred = y_pred
            best_name = name
    print("="*60)
    print(f"Best Model: {best_name} with Metric: {best_metric:.3f}")
    return best_model
# Compute the accuracy of predictions.
def compute_classification_matrix(y_true, y_pred, model):
    """Compute and print classification accuracy.

    Returns:
        dict with a single key 'accuracy' (fraction in [0, 1]).
    """
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{model.__class__.__name__} Accuracy: {accuracy*100:.3f}%")
    print("-"*60)
    return {'accuracy': accuracy}
def compute_regression_matrix(y_true, y_pred, model, additional_info):
    """Compute and print regression error metrics.

    Always reports MSE; when `additional_info` is truthy also reports the sum
    of squared errors (SSE), total sum of squares (SST), model sum of squares
    (SSM = SST - SSE), and R^2.

    Returns:
        dict with key 'mse', plus 'SSE'/'SST'/'SSM'/'r2' when additional_info.
    """
    model_name = model.__class__.__name__
    mse = mean_squared_error(y_true, y_pred)
    metrics = {'mse': mse}
    print(f"{model_name} MSE: {mse:.3f}")
    if additional_info:
        # Assumes y_true/y_pred are numpy arrays (elementwise subtraction).
        residuals = y_true - y_pred
        SSE = np.sum(residuals**2)
        SST = np.sum((y_true - np.mean(y_true))**2)
        SSM = SST - SSE
        r2 = r2_score(y_true, y_pred)
        metrics.update(SSE=SSE, SST=SST, SSM=SSM, r2=r2)
        print(f"{model_name} SSE: {SSE:.3f}")
        print(f"{model_name} SST: {SST:.3f}")
        print(f"{model_name} SSM: {SSM:.3f}")
        print(f"{model_name} R^2: {r2:.3f}")
    print("-"*60)
    return metrics
def predict_confidence(clf, X):
    """Predict labels together with a per-sample confidence score.

    Confidence comes from predict_proba when available (probability of the
    second class — assumes binary classification), else from
    decision_function (signed distance from the decision boundary).

    Returns:
        (predicted labels, confidence scores)

    Raises:
        ValueError: if the classifier exposes neither confidence method.
    """
    pred_labels = clf.predict(X)
    if hasattr(clf, "predict_proba"):
        pred_confidence = clf.predict_proba(X)[:, 1]
    elif hasattr(clf, "decision_function"):
        pred_confidence = clf.decision_function(X)
    else:
        raise ValueError("The classifier does not have a method to provide confidence values.")
    return pred_labels, pred_confidence
def decision_tree_classifier(X_train, y_train, X_test, y_test):
    """Fit a default DecisionTreeClassifier and return its predictions for X_test."""
    # fit() returns the fitted estimator, so fit-and-predict chains.
    return DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)
def naive_classifier(X_train, y_train, X_test, y_test):
    """Fit a majority-class DummyClassifier baseline and return its predictions for X_test."""
    return DummyClassifier(strategy='most_frequent').fit(X_train, y_train).predict(X_test)
def support_vector_classifier(X_train, y_train, X_test, y_test):
    """Fit a default SVC and return its predictions for X_test."""
    return SVC().fit(X_train, y_train).predict(X_test)
def random_forest_classifier(X_train, y_train, X_test, y_test):
    """Fit a default RandomForestClassifier and return its predictions for X_test."""
    return RandomForestClassifier().fit(X_train, y_train).predict(X_test)
def naive_bayes_classifier(X_train, y_train, X_test, y_test):
    """Fit a GaussianNB classifier and return its predictions for X_test."""
    return GaussianNB().fit(X_train, y_train).predict(X_test)
def mlp_classifier(X_train, y_train, X_test, y_test):
    """Fit a default MLPClassifier and return its predictions for X_test."""
    return MLPClassifier().fit(X_train, y_train).predict(X_test)
def naive_regression(X_train, y_train, X_test, y_test):
    """Fit a mean-predicting DummyRegressor baseline and return its predictions for X_test."""
    return DummyRegressor(strategy="mean").fit(X_train, y_train).predict(X_test)
def linear_regression(X_train, y_train, X_test, y_test):
    """Fit an ordinary least-squares LinearRegression and return its predictions for X_test."""
    return LinearRegression().fit(X_train, y_train).predict(X_test)
def ridge_regression(X_train, y_train, X_test, y_test):
    """Fit a default Ridge regressor and return its predictions for X_test."""
    return Ridge().fit(X_train, y_train).predict(X_test)
def lasso_regression(X_train, y_train, X_test, y_test):
    """Fit a default Lasso regressor and return its predictions for X_test."""
    return Lasso().fit(X_train, y_train).predict(X_test)
def elastic_net_regression(X_train, y_train, X_test, y_test):
    """Fit a default ElasticNet regressor and return its predictions for X_test."""
    return ElasticNet().fit(X_train, y_train).predict(X_test)
def decision_tree_regressor(X_train, y_train, X_test, y_test):
    """Fit a default DecisionTreeRegressor and return its predictions for X_test."""
    return DecisionTreeRegressor().fit(X_train, y_train).predict(X_test)
def random_forest_regressor(X_train, y_train, X_test, y_test):
    """Fit a default RandomForestRegressor and return its predictions for X_test."""
    return RandomForestRegressor().fit(X_train, y_train).predict(X_test)
def gradient_boosting_regressor(X_train, y_train, X_test, y_test):
    """Fit a default GradientBoostingRegressor and return its predictions for X_test."""
    return GradientBoostingRegressor().fit(X_train, y_train).predict(X_test)
def support_vector_regressor(X_train, y_train, X_test, y_test):
    """Fit a default SVR and return its predictions for X_test."""
    return SVR().fit(X_train, y_train).predict(X_test)
def mlp_regressor(X_train, y_train, X_test, y_test):
    """Fit a default MLPRegressor and return its predictions for X_test."""
    return MLPRegressor().fit(X_train, y_train).predict(X_test)