-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadaboost_tuned.py
109 lines (85 loc) · 3.82 KB
/
adaboost_tuned.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import numpy as np
import pandas as pd
import joblib
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
# Displays the performance metrics of the model
def display_metrics(model, test_features, test_labels):
    """Print accuracy, precision, recall and F1 score of a binary classifier.

    The model's predictions for ``test_features`` are compared against
    ``test_labels``, treating ``1`` as the positive class and ``0`` as the
    negative class.

    Args:
        model: Any object exposing ``predict(test_features)`` that returns
            one prediction per test sample.
        test_features: Feature data passed unchanged to ``model.predict``.
        test_labels: Ground-truth labels aligned with the predictions
            (list, numpy array or pandas Series).

    Returns:
        None. The four metrics are written to standard output.
    """
    # Work on positional arrays: a pandas Series with a non-default index
    # would otherwise be looked up by index label instead of by position.
    actual = np.asarray(test_labels)
    predicted = np.asarray(model.predict(test_features))

    # Confusion-matrix counts. Samples whose label or prediction is neither
    # 0 nor 1 are left out of every count. Converting to plain ints keeps
    # the printed values formatted as ordinary Python floats.
    tn = int(np.sum((predicted == 0) & (actual == 0)))
    fn = int(np.sum((predicted == 0) & (actual == 1)))
    fp = int(np.sum((predicted == 1) & (actual == 0)))
    tp = int(np.sum((predicted == 1) & (actual == 1)))

    def ratio(numerator, denominator):
        # An undefined metric (zero denominator) is reported as 0.0 rather
        # than aborting the whole report with ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    # Computes the accuracy, precision, recall, and f1 score
    accuracy = ratio(tp + tn, tp + fp + fn + tn)
    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    f1_score = ratio(2 * (recall * precision), recall + precision)

    # Displays the performance metrics
    print('Accuracy: {}'.format(accuracy))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1 Score: {}'.format(f1_score))
# Return the best adaboost model using grid search
def grid_search(train_features, train_labels, param_grid=None, cv=5, n_jobs=3):
    """Grid-search an AdaBoost classifier and return the best estimator found.

    Args:
        train_features: Training feature data passed to ``GridSearchCV.fit``.
        train_labels: Training labels passed to ``GridSearchCV.fit``.
        param_grid: Optional mapping of AdaBoost parameter names to the
            candidate values to try. When omitted, the search covers
            ``n_estimators`` in {400, 500, 600} and ``learning_rate`` in
            {1, 1.2, 1.4, 1.6, 1.8, 2}.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (five folds by default).
        n_jobs: Number of parallel jobs forwarded to ``GridSearchCV``.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    # Parameters used for the grid search model. The default is built per
    # call so that no mutable default argument is shared between calls.
    if param_grid is None:
        param_grid = {
            'n_estimators': [400, 500, 600],
            'learning_rate': [1, 1.2, 1.4, 1.6, 1.8, 2],
        }

    # Perform cross validation on the adaboost classifier using the grid
    # search model
    ada_grid = GridSearchCV(
        estimator=ensemble.AdaBoostClassifier(),
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
    )

    # Fit the adaboost model
    ada_grid.fit(train_features, train_labels)

    # Display the parameters of the best model from the grid search
    print(ada_grid.best_params_)

    # Sets the adaboost model as the best model from the grid search
    return ada_grid.best_estimator_
if __name__ == '__main__':
    # Loads the Chicago crime data set
    crimes_filepath = 'D:/Users/Eric/Google Drive/Colab Notebooks/total_df.csv'
    crimes_df = pd.read_csv(crimes_filepath)

    # Drops the index column
    crimes_df.drop(['Unnamed: 0'], axis=1, inplace=True)

    # Sort the crimes by year
    crimes_df.sort_values(by=['Year'], inplace=True)

    # Uses the 'Arrest' feature as the target output for training
    crime_features = np.array(crimes_df.drop(['Arrest'], axis=1))
    crime_labels = np.array(crimes_df['Arrest'])

    # Sets the percentage of data set as the test set
    test_split = 0.25

    # Generate a random uniform mask to separate the training set and the
    # test set. Sampling without replacement picks exactly test_len distinct
    # rows in one call, so no retry loop is needed to avoid repeated indices.
    n_samples = len(crime_labels)
    test_len = int(test_split * n_samples)
    test_mask = np.zeros(n_samples, dtype=bool)
    test_mask[np.random.choice(n_samples, size=test_len, replace=False)] = True

    # Gets the test set using the test mask
    X_test = crime_features[test_mask]
    y_test = crime_labels[test_mask]

    # Gets the training set using the inverse of the test mask
    X_train = crime_features[~test_mask]
    y_train = crime_labels[~test_mask]

    # Create the adaboost classifier
    # NOTE(review): these values lie inside the grid explored by
    # grid_search(); presumably they come from an earlier search run.
    ada_model = ensemble.AdaBoostClassifier(learning_rate=1.8, n_estimators=600)

    # Train the adaboost model. No n_jobs is set on the model:
    # AdaBoostClassifier does not take that parameter, so assigning it as an
    # attribute would not parallelise anything.
    ada_model.fit(X_train, y_train)

    # Display the performance metrics of the adaboost model
    display_metrics(ada_model, X_test, y_test)

    # Save the adaboost model
    ada_file = 'adaboost.pkl'
    joblib.dump(ada_model, ada_file)