-
Notifications
You must be signed in to change notification settings - Fork 0
/
knn-sklearn-iris.py
183 lines (172 loc) · 8.07 KB
/
knn-sklearn-iris.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import random
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
class knn_homebrew_sklearn:
def __init__(self):
'''
Load the Iris flower data set by Ronald Fisher
https://scikit-learn.org/stable/datasets/toy_dataset.html#iris-dataset
'''
self.iris = datasets.load_iris()
self.predictors = self.iris.data[:, 0:2]
self.outcomes = self.iris.target
def __find_knn(self, p, points, k = 5):
'''
Find the k nearest neighbours of point p and return
their indices
'''
distances = np.zeros(points.shape[0])
for i in range(len(distances)):
distances[i] = np.linalg.norm(p - points[i])
ind = np.argsort(distances)
return ind[:k]
def __majority(self, elements):
'''
Return the most common element in elements
'''
counts = {}
for element in elements:
if element in counts:
counts[element] += 1
else:
counts[element] = 1
majority_elements = []
max_count = max(counts.values())
for element, count in counts.items():
if count == max_count:
majority_elements.append(element)
return random.choice(majority_elements)
def _knn_predict(self, p, points, outcomes, k = 5):
'''
Call __find_knn() to find the k nearest neighbours of p
and then call __majority() to determine the dominant class
of the k nearest neighbours before returning the result
'''
ind = self.__find_knn(p, points, k)
return self.__majority(outcomes[ind])
def _make_prediction_grid(self, predictors, outcomes, h, k, mode):
'''
Make a prediction grid using either the homebrew or
the scikit-learn knn method
'''
(x_min, x_max, y_min, y_max) = (predictors[:, 0].min() * 0.9,
predictors[:, 0].max() * 1.1,
predictors[:, 1].min() * 0.9,
predictors[:, 1].max() * 1.1)
xs = np.arange(x_min, x_max, h)
ys = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(xs, ys)
if mode == 'sk':
prediction_grid = np.zeros(xx.shape, dtype=int)
knn = KNeighborsClassifier(n_neighbors = k)
knn.fit(predictors, outcomes)
for i, x in enumerate(xs):
for j, y in enumerate(ys):
p = np.array([x, y])
prediction_grid[j, i] = knn.predict([[x, y]])
return (xx, yy, prediction_grid)
elif mode == 'homebrew':
prediction_grid = np.zeros(xx.shape, dtype = int)
for i, x in enumerate(xs):
for j, y in enumerate(ys):
p = np.array([x, y])
prediction_grid[j, i] = self._knn_predict(p,
predictors, outcomes, k)
return (xx, yy, prediction_grid)
def predict_by_k(self):
'''
Determine the accuracy of knn classification on the
Iris flower dataset with different value of k using
both the homebrew and scikit-learn knn methods
'''
self.xr = np.array(range(self.iris.data.shape[0] - 1))
self.yr_homebrew = np.zeros(self.xr.shape, dtype = float)
self.yr_sk = np.zeros(self.xr.shape, dtype = float)
for n in self.xr:
self.yr_homebrew[n] = 100 * np.mean(np.array(
[self._knn_predict(p, self.predictors, self.outcomes,
n + 1) for p in self.predictors]) == self.outcomes).item()
knn = KNeighborsClassifier(n_neighbors = n + 1)
knn.fit(self.predictors, self.outcomes)
self.yr_sk[n] = 100 * np.mean(knn.predict(self.predictors)
== self.outcomes).item()
def make_prediction_grid_with_highest_k(self):
'''
Use the value of k with the highest accuracy to make prediction grid
'''
(self.xx, self.yy, self.prediction_grid) = self._make_prediction_grid(
self.predictors, self.outcomes,
0.1, self.yr_homebrew.argmax()
+ 1, 'homebrew')
self.knn = KNeighborsClassifier(n_neighbors = self.yr_sk.argmax() + 1)
self.knn.fit(self.predictors, self.outcomes)
(self.sk_xx, self.sk_yy, self.sk_prediction_grid) = self._make_prediction_grid(
self.predictors, self.outcomes,
0.1, self.yr_sk.argmax() + 1, 'sk')
def plot_and_save_pdf(self):
'''
Plot the prediction grid with the Iris flower data set
using both the homebrew and scikit-learn knn methods for comparison.
Plot also the accurracy of both the homebrew and scikit-learn knn
methods as a function of k.
Save the plot as knn_iris_plot.pdf in the working directory.
'''
plt.figure(figsize = (12, 12))
grid_cm = ListedColormap (['hotpink','lightskyblue', 'yellowgreen'])
plt.subplot(221)
plt.title('Homebrew K-Nearest Neighbors with K = ' +
str(self.yr_homebrew.argmax() + 1) + '\n' +
'(Accuracy = ' + str(round(100 * np.mean(np.array(
[self._knn_predict(p, self.predictors, self.outcomes,
self.yr_homebrew.argmax() + 1) for p in self.predictors])
== self.outcomes).item(), 2)) + ')')
plt.pcolormesh(self.xx, self.yy, self.prediction_grid, cmap = grid_cm, alpha = 0.5)
plt.plot(self.predictors[self.outcomes == 0][:, 0],
self.predictors[self.outcomes == 0][:, 1], 'ro', label='Iris Setosa')
plt.plot(self.predictors[self.outcomes == 1][:, 0],
self.predictors[self.outcomes == 1][:, 1], 'bo', label='Iris Virginica')
plt.plot(self.predictors[self.outcomes == 2][:, 0],
self.predictors[self.outcomes == 2][:, 1], 'go', label='Iris Versicolor')
plt.legend()
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.subplot(222)
plt.title('Scikit-learn K-Nearest Neighbors with K = ' +
str(self.yr_sk.argmax() + 1) + '\n' + '(Accuracy = ' +
str(round(100 * np.mean(self.knn.predict(self.predictors)
== self.outcomes).item(), 2)) + ')')
plt.pcolormesh(self.sk_xx, self.sk_yy, self.sk_prediction_grid, cmap = grid_cm, alpha = 0.5)
plt.plot(self.predictors[self.outcomes == 0][:, 0],
self.predictors[self.outcomes == 0][:, 1], 'ro', label='Iris Setosa')
plt.plot(self.predictors[self.outcomes == 1][:, 0],
self.predictors[self.outcomes == 1][:, 1], 'bo', label='Iris Virginica')
plt.plot(self.predictors[self.outcomes == 2][:, 0],
self.predictors[self.outcomes == 2][:, 1], 'go', label='Iris Versicolor')
plt.legend()
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.subplot(223)
plt.title('Homebrew K-Nearest Neighbors Accuracy (%)')
plt.plot(self.xr + 1, self.yr_homebrew, 'b-', label='Accuracy (%)')
plt.xlabel('K')
plt.ylabel('Accuracy (%)')
plt.subplot(224)
plt.title('Scikit-learn K-Nearest Neighbors Accuracy')
plt.plot(self.xr + 1, self.yr_sk, 'b-', label='Accuracy (%)')
plt.xlabel('K')
plt.ylabel('Accuracy (%)')
plt.savefig('knn-sklearn-iris-plot.pdf')
plt.show()
if __name__ == '__main__':
'''
Create the knn object and plot the prediction grid to compare
the homebrew and scikit-learn knn methods on the Iris flower data set.
Save the plot in the working directory.
'''
iris_knn = knn_homebrew_sklearn()
iris_knn.predict_by_k()
iris_knn.make_prediction_grid_with_highest_k()
iris_knn.plot_and_save_pdf()