-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknn_tuned.py
129 lines (100 loc) · 4.42 KB
/
knn_tuned.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import numpy as np
import pandas as pd
import joblib
from sklearn import neighbors
from sklearn.model_selection import RandomizedSearchCV
# Displays the performance metrics of the model
def display_metrics(model, test_features, test_labels):
    """Print accuracy, precision, recall and F1 score of a binary classifier.

    Predictions are obtained from ``model.predict(test_features)`` and compared
    position by position with ``test_labels``. Class ``1`` is the positive
    class and class ``0`` the negative class; samples whose prediction or
    label is neither value are left out of every count.

    A metric whose denominator is zero (for example precision when the model
    never predicts class ``1``, or every metric on an empty test set) is
    reported as ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        model: Object exposing ``predict(test_features)`` that returns one
            prediction per sample.
        test_features: Feature data handed unchanged to ``model.predict``.
        test_labels: Ground-truth labels, one per sample.

    Returns:
        None. The four metrics are written to standard output.
    """
    # Convert to arrays so the comparison is positional and vectorised
    predicted = np.asarray(model.predict(test_features))
    actual = np.asarray(test_labels)

    predicted_pos = predicted == 1
    predicted_neg = predicted == 0
    actual_pos = actual == 1
    actual_neg = actual == 0

    # Confusion-matrix cells as plain ints so the printed floats are
    # ordinary Python floats
    tp = int(np.count_nonzero(predicted_pos & actual_pos))
    tn = int(np.count_nonzero(predicted_neg & actual_neg))
    fp = int(np.count_nonzero(predicted_pos & actual_neg))
    fn = int(np.count_nonzero(predicted_neg & actual_pos))

    # Computes the accuracy, precision, recall, and f1 score,
    # falling back to 0.0 whenever a denominator is zero
    counted = tp + fp + fn + tn
    accuracy = (tp + tn) / counted if counted else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    pr_sum = recall + precision
    f1_score = 2 * (recall * precision) / pr_sum if pr_sum else 0.0

    # Displays the performance metrics
    print('Accuracy: {}'.format(accuracy))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1 Score: {}'.format(f1_score))
# Return the best k-NN model using random search
def random_search(train_features, train_labels):
    """Tune a k-NN classifier with a randomized search and return the winner.

    A ``RandomizedSearchCV`` samples 100 parameter combinations from a fixed
    search space, scores each with five-fold cross validation, prints the
    best parameter set and returns the corresponding fitted estimator.

    Args:
        train_features: Training feature data passed to ``fit``.
        train_labels: Training labels passed to ``fit``.

    Returns:
        The ``best_estimator_`` of the fitted randomized search.
    """
    search_space = {
        # Number of neighbors to use
        'n_neighbors': [3, 5, 7, 9, 11, 13],
        # Weight function used for prediction
        'weights': ['uniform', 'distance'],
        # Algorithm to use
        'algorithm': ['ball_tree', 'kd_tree'],
        # Leaf size used for ball tree or kd tree
        'leaf_size': [20, 30, 40],
        # Power parameter used for Minkowski metric
        'p': [1, 2],
    }

    # Five-fold cross validated random search over the k-NN classifier
    searcher = RandomizedSearchCV(
        estimator=neighbors.KNeighborsClassifier(),
        param_distributions=search_space,
        n_iter=100,
        cv=5,
        random_state=42,
        n_jobs=3,
    )
    searcher.fit(train_features, train_labels)

    # Report the winning parameter combination
    print(searcher.best_params_)

    # Hand back the best model found by the search
    return searcher.best_estimator_
if __name__ == '__main__':
    # Loads the Chicago crime data set
    crimes_filepath = 'D:/Users/Eric/Google Drive/Colab Notebooks/total_df.csv'
    crimes_df = pd.read_csv(crimes_filepath)

    # Drops the index column
    crimes_df.drop(['Unnamed: 0'], axis=1, inplace=True)

    # Sort the crimes by year
    crimes_df.sort_values(by=['Year'], inplace=True)

    # Uses the 'Arrest' feature as the target output for training
    crime_features = np.array(crimes_df.drop(['Arrest'], axis=1))
    crime_labels = np.array(crimes_df['Arrest'])

    # Sets the percentage of data set as the test set
    test_split = 0.25

    # Generate a random uniform mask to separate the training set and the
    # test set. A single permutation yields test_len distinct indices in one
    # vectorised step, replacing a per-sample rejection-sampling loop.
    sample_count = len(crime_labels)
    test_len = int(test_split * sample_count)
    test_mask = np.zeros(sample_count, dtype=bool)
    test_mask[np.random.permutation(sample_count)[:test_len]] = True

    # Gets the test set using the test mask
    X_test = crime_features[test_mask]
    y_test = crime_labels[test_mask]

    # Gets the training set using the inverse of the test mask
    X_train = crime_features[~test_mask]
    y_train = crime_labels[~test_mask]

    # Create the k-NN classifier with fixed hyper-parameters and three
    # parallel jobs for neighbor queries
    knn_model = neighbors.KNeighborsClassifier(n_neighbors=11, weights='distance',
                                               algorithm='ball_tree', leaf_size=20,
                                               p=1, n_jobs=3)

    # Train the k-NN model
    knn_model.fit(X_train, y_train)

    # Display the performance metrics of the k-NN model
    display_metrics(knn_model, X_test, y_test)

    # Save the k-NN model
    knn_file = 'k-NN.pkl'
    joblib.dump(knn_model, knn_file)