-
Notifications
You must be signed in to change notification settings - Fork 0
/
knn_.R
84 lines (64 loc) · 2.76 KB
/
knn_.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# K-nearest neighbours: Graded Task
library(datasets)
library(dplyr)
library(ggfortify)
library(rdist)
######## LOAD TEST DATA SET AND PREPARE IT FOR ANALYSIS #######
# Use the Iris data for testing: it has four feature dimensions and also
# comes labeled with the species.
data("iris")
iris_raw <- iris[, c("Sepal.Length", "Sepal.Width",
                     "Petal.Length", "Petal.Width", "Species")]
# Encode the categorical species column as numeric codes (factor level
# order) and rebuild the data set as: four feature columns + `label`.
label <- as.numeric(iris_raw$Species)
categories <- iris_raw$Species
iris <- cbind(iris_raw, label)
# Remove the categorical column so every remaining column is numeric
iris$Species <- NULL
# Number of feature columns: everything except the trailing `label` column.
# (Counting the entries of one column would give the number of rows instead.)
amount_features <- ncol(iris) - 1L
# The labeled (training) data is the first 100 rows of a random permutation
# of the data set (roughly two thirds); the remaining 50 rows are the
# unlabeled (target) data. Subsetting by a permutation keeps the original
# row ids as row names. No seed is set, so the split differs between runs.
random_permutation_iris <- iris[sample(nrow(iris)), ]
training_set <- random_permutation_iris[1:100, ]
target_set <- random_permutation_iris[101:150, ]
# label_permuted_iris <- random_permutation_iris$label
# labels_train <- label_permuted_iris[1:100]
# labels_target <- label_permuted_iris[101:150]
############## VARIABLES ##############
# Number of neighbours that vote on each prediction
K <- 5
############## k-NEAREST NEIGHBORS FUNCTION ##############
#' Classify observations with k-nearest neighbours (majority vote)
#'
#' @param train_set Labelled data: feature columns plus a label column at
#'   position `label_col`.
#' @param k Number of nearest training rows that vote on each prediction.
#'   Must be a whole number between 1 and the number of training rows.
#' @param target_set Observations to classify, laid out like `train_set`.
#'   Its label column is dropped and never used for the prediction.
#' @param metric Name of the distance metric, forwarded to `cdist()`.
#' @param label_col Index of the label column in both sets (default 5).
#' @return A one-column matrix holding one predicted label per row of
#'   `target_set`, in the same row order.
knnClassify <- function(train_set, k,
                        target_set, metric = "euclidean", label_col = 5) {
  # Split features from labels; drop = FALSE keeps single rows as matrices
  train_features <- as.matrix(train_set[, -label_col, drop = FALSE])
  train_labels <- train_set[, label_col]
  target_features <- as.matrix(target_set[, -label_col, drop = FALSE])

  n_train <- nrow(train_features)
  n_target <- nrow(target_features)

  stopifnot(
    is.numeric(k), length(k) == 1L, !is.na(k), k >= 1, k == round(k)
  )
  if (k > n_train) {
    stop("`k` (", k, ") exceeds the number of training rows (", n_train, ")",
         call. = FALSE)
  }
  if (n_target == 0L) {
    return(matrix(train_labels[0L], ncol = 1L))
  }

  # One call gives the full n_target x n_train matrix of pairwise distances
  distances <- cdist(target_features, train_features, metric = metric)

  # For every target row, find the training row whose label wins the vote
  winner_rows <- vapply(seq_len(n_target), function(i) {
    # Indexes of the k closest training rows, nearest first
    nearest <- order(distances[i, ])[seq_len(k)]
    votes <- train_labels[nearest]
    # unique() keeps nearest-first order, and which.max() returns the first
    # maximum, so a tied vote goes to the label seen closest to the target
    candidates <- unique(votes)
    counts <- tabulate(match(votes, candidates), nbins = length(candidates))
    nearest[match(candidates[which.max(counts)], votes)]
  }, integer(1))

  # Index the original label vector so the label type is preserved
  matrix(train_labels[winner_rows], ncol = 1L)
}
# Classify the held-out rows with the k-NN function defined above
labels <- knnClassify(training_set, K, target_set)
###### TEST WITH PCA ########
# Project the feature columns (everything but column 5, the label) onto
# their principal components so the classes can be plotted in 2-D.
pca_iris <- prcomp(iris[, -5], scale. = TRUE)
pca_iris_pcs <- pca_iris$x
# training_set and target_set are shuffled subsets of `iris`, so rows 1:100
# of the PCA scores are NOT the training rows. Their row names still carry
# the original row ids; use them to look up each observation's own scores.
train_rows <- match(rownames(training_set), rownames(iris))
target_rows <- match(rownames(target_set), rownames(iris))
plot(pca_iris_pcs[, 1], pca_iris_pcs[, 2], pch = 20, col = label,
     main = "Real Labels", xlab = "PC1", ylab = "PC2")
# Training points coloured by their known label. The axis limits span all
# observations so the target points added below are not clipped.
plot(pca_iris_pcs[train_rows, 1], pca_iris_pcs[train_rows, 2],
     pch = 20, col = training_set[, 5],
     xlim = range(pca_iris_pcs[, 1]), ylim = range(pca_iris_pcs[, 2]),
     main = "Predicted Labels", xlab = "PC1", ylab = "PC2")
# Target points coloured by the label predicted for them
points(pca_iris_pcs[target_rows, 1], pca_iris_pcs[target_rows, 2], pch = 20,
       col = labels)