-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcifar10_pca_knn_analysis.py
166 lines (132 loc) · 5.33 KB
/
cifar10_pca_knn_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import pickle
import math
import numpy as np
from PIL import Image
import datetime
def unpickle(file):
    """
    Opens the pickle file at path 'file' in binary mode and returns the object stored in it.
    Loading uses encoding='bytes', so byte strings pickled by Python 2 stay as bytes objects.
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')
def merge_and_extract(file_prefix, batches, test=False):
    """
    Loads 'batches' batch files of the cifar-10 data set and merges them.

    When 'test' is False the files read are 'file_prefix' + "1" ... 'file_prefix' + str(batches);
    when 'test' is True the path 'file_prefix' itself is read on every iteration.
    Returns 'ready_images', an array with one flat (1024 values) gray scale image per row,
    and 'set_labels', the list of labels in the same order.
    """
    print("extracting data and merging batches...")
    set_labels, set_images = [], []
    for batch in range(1, batches + 1):
        batch_path = file_prefix if test else file_prefix + str(batch)
        batch_dict = unpickle(batch_path)
        set_labels.extend(batch_dict[b'labels'])
        set_images.extend(batch_dict[b'data'])
    # One row per image; each row receives the flattened gray scale pixels.
    ready_images = np.zeros((len(set_images), 1024))
    for row, color_image in enumerate(set_images):
        ready_images[row] = turn_image_to_flat_gray_scale(color_image)
    return ready_images, set_labels
def turn_image_to_flat_gray_scale(color_image):
    """
    Converts 'color_image' (3 * 32 * 32 values, channel-first) to gray scale
    and returns it as a flat array of shape (1, 1024).
    """
    # Reorder from (channel, row, column) to (row, column, channel) as PIL expects.
    pixels = np.transpose(np.reshape(np.array(color_image), (3, 32, 32)), (1, 2, 0))
    gray_image = Image.fromarray(pixels.astype('uint8')).convert("L")
    return np.array(gray_image).reshape((1, 1024))
def euclidean_distance(instance1, instance2, length):
    """
    Returns the euclidean distance between 'instance1' and 'instance2',
    taking only their first 'length' components into account.
    """
    squared_sum = sum((instance1[i] - instance2[i]) ** 2 for i in range(length))
    return math.sqrt(squared_sum)
def k_nearest_neighbors(training_set, test_instance, training_labels, k):
    """
    kNN algorithm. Returns the labels of the 'k' training images closest (euclidean distance)
    to the given test image 'test_instance'.

    'training_set' is a 2-D array holding one training image per column (features by images),
    'test_instance' is a sequence with one value per feature, and 'training_labels' holds the
    label of each training column. The returned list is ordered nearest first; neighbors at
    equal distance keep their column order. If 'k' exceeds the number of training images,
    every training label is returned once.
    """
    training = np.asarray(training_set, dtype=float)
    # Column vector so it broadcasts against every training column at once.
    query = np.asarray(test_instance, dtype=float).reshape(-1, 1)
    # All features take part in the distance (the last one must not be dropped).
    distances = np.linalg.norm(training - query, axis=0)
    # A stable sort never selects the same neighbor twice and keeps ties in column order.
    nearest = np.argsort(distances, kind='stable')[:k]
    return [training_labels[i] for i in nearest]
def best_neighbor_match_check(k_neighbors_labels):
    """
    Returns the value with the most repetitions in 'k_neighbors_labels'.

    When several values share the highest count, the smallest of them is returned.
    The given list is left unmodified. Raises IndexError if the list is empty.
    """
    if not k_neighbors_labels:
        raise IndexError("k_neighbors_labels is empty")
    repeats = {}
    for label in k_neighbors_labels:
        repeats[label] = repeats.get(label, 0) + 1
    longest_repeats = max(repeats.values())
    # Ties are resolved towards the smallest label.
    return min(label for label, count in repeats.items() if count == longest_repeats)
def test_run(training_matrix, test_matrix, training_labels, test_labels, k_list, s=0):
    """
    Applies kNN algorithm to 'training_matrix' and to each image in 'test_matrix'
    and prints the error rate for every k in 'k_list'.

    Both matrices hold one image per column. 'training_labels' and 'test_labels' hold the
    label of each column. 's' is the number of PCA components used to build the matrices;
    the default 0 means no PCA was applied and 's' is then left out of the printed report.

    'k_nearest_neighbors' is called once per test image with the highest k value only.
    Every smaller k is then evaluated on the first k of those neighbors by calling
    'best_neighbor_match_check', instead of repeating the full kNN search.
    """
    test_count = test_matrix.shape[1]
    largest_k = max(k_list)
    nearest_list = [
        k_nearest_neighbors(training_matrix, test_matrix[:, img].tolist(), training_labels, largest_k)
        for img in range(test_count)
    ]
    for k in k_list:
        count = 0
        for img in range(test_count):
            # The slice is a copy, so the stored neighbor list stays intact for the next k.
            nearest_one = best_neighbor_match_check(nearest_list[img][:k])
            if nearest_one != test_labels[img]:
                count += 1
        error_rate = count / test_count
        # Report s only for PCA runs; s == 0 marks the plain kNN run.
        if s != 0:
            print("For s =", s, " and k = ", k, " the error rate is: ", error_rate)
        else:
            print("For k = ", k, " the error rate is: ", error_rate)
def pca(matrix):
    """
    Runs a reduced SVD on the transpose of 'matrix' and returns the array of
    left singular vectors. No mean-centering is applied here.
    The full array is returned; the caller picks how many components ('s') to use
    before applying kNN algorithm.
    """
    print("Applying PCA...")
    left_vectors, _, _ = np.linalg.svd(np.transpose(matrix), full_matrices=False)
    return left_vectors
def main():
    """
    Program to identify class (label) of test images, given a training set of images
    utilizing Principal Component Analysis (PCA) and k nearest neighbors algorithm (kNN).

    Loads the five cifar-10 training batches and the test batch, runs PCA + kNN for several
    numbers of components 's', and finally runs kNN on the unreduced gray scale images.
    Error rates are printed by 'test_run'.
    """
    training_set = "./cifar-10-python/cifar-10-batches-py/data_batch_"
    training_images, training_labels = merge_and_extract(training_set, 5, False)
    test_set = "./cifar-10-python/cifar-10-batches-py/test_batch"
    test_images, test_labels = merge_and_extract(test_set, 1, True)
    pca_result = np.array(pca(training_images))
    print("Starting runs!")
    k_list = [5, 10, 20, 40, 80, 160]
    for s in (1, 5, 10, 20, 40, 80):
        # The singular vectors are the COLUMNS of the SVD result, so take the first s columns
        # and transpose them into an (s, features) projection matrix.
        components = pca_result[:, :s].T
        training_matrix = np.matmul(components, training_images.T)
        test_matrix = np.matmul(components, test_images.T)
        now = datetime.datetime.now()
        print(now.strftime("%Y-%m-%d %H:%M:%S"), "Running PCA + kNN test with s =", s, "and k = ", k_list, "...")
        test_run(training_matrix, test_matrix, training_labels, test_labels, k_list, s)
    now = datetime.datetime.now()
    print(now.strftime("%Y-%m-%d %H:%M:%S"), "Running kNN test with k = ", k_list, "...")
    # 'test_run' expects one image per column, while the extracted arrays hold one per row.
    test_run(training_images.T, test_images.T, training_labels, test_labels, k_list)
    print("Test complete!")


if __name__ == "__main__":
    main()