From c27906f15dd90e57db98506d5a443304a908c5dd Mon Sep 17 00:00:00 2001
From: lazavgeridis
Date: Fri, 19 Mar 2021 15:25:05 +0200
Subject: [PATCH] update README v4, hypercube range search

---
 README.md                             | 48 ++++++++++++++++++++++-----
 include/modules/hypercube/hypercube.h | 45 +++----------------------
 2 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index b83f4f8..f225db3 100644
--- a/README.md
+++ b/README.md
@@ -6,20 +6,23 @@ problem: **LSH** and **Hypercube Randomized Projection**
 2. implemented the improved version of the well-known clustering algorithm k-Means, which is
 called **k-Medians++**
 
-The dataset used in the 2 tasks above was [MNIST](http://yann.lecun.com/exdb/mnist/)
+The dataset used in the 2 tasks above was [MNIST](http://yann.lecun.com/exdb/mnist/). Each
+handwritten digit image has a resolution of 28x28 pixels. Consequently, we store each image
+as a "flattened" vector of size 784 (28 x 28 = 784). To calculate the distance between 2
+points in our datasets we used the Manhattan distance (l1 norm).
 
 # Nearest Neighbour Search
 
-Both methods mentioned above work in a similar manner. The following sequence of steps happens
-before the actual search process takes place:
+Both methods mentioned above (LSH and Hypercube) work in a similar manner. The following
+sequence of steps happens before the actual search process takes place:
 1. the program reads in the input dataset (or training set), which in our case consists of 60,000
 images of handwritten digits (0 - 9)
 2. then, the program builds the actual data structures that will be used in the search process;
 all input dataset points are stored in these data structures
 3. next, the program reads in the query set (or test set)
 4. the search starts: for each point in the query set, find:
-    1. its _N_ nearest neighbours approximately (Approximate k-NN)
-    2. its _N_ nearest neighbours using brute-force search (Exact k-NN)
-    3. its nearest neighbours approximately that lie inside a circle of radius _R_
+   1. its _N_ nearest neighbours approximately (Approximate k-NN)
+   2. its _N_ nearest neighbours using brute-force search (Exact k-NN)
+   3. its nearest neighbours approximately that lie inside a circle of radius _R_
 
 Where these 2 methods differ is in how each one builds its data structures and how it
 stores (hashes) the input dataset.
 In general, the whole purpose of these 2 methods is to deliver an efficient -but approximate-
@@ -28,6 +31,14 @@ high accuracy results
 
 # Clustering
 
+Using the same dataset file as input, the goal of this program is to "group" the input
+datapoints into clusters as accurately as possible. Ideally, the clusters produced by the
+program should only contain images of the same handwritten digit (the default number of
+clusters is 10). The (iterative) algorithm selects its initial centroids using an improved
+initialization technique called [initialization++](https://en.wikipedia.org/wiki/K-means%2B%2B#Improved_initialization_algorithm),
+assigns each point to its closest centroid using one of the Lloyd's, LSH or Hypercube
+assignment methods, and uses the median update rule to recompute the centroids. The
+algorithm stops when the observed change in cluster assignments is relatively small; to
+measure this, the k-medians objective function (l1 norm) is calculated after each iteration.
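+
+For illustration, here is a simplified, self-contained sketch of this seeding step (the
+actual implementation lives in src/cluster; the l1_distance helper below is only a stand-in
+for the project's own Manhattan distance routine):
+```
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+/* l1 (manhattan) distance between two flattened images */
+static uint32_t l1_distance(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b)
+{
+    uint32_t dist = 0;
+    for (size_t i = 0; i != a.size(); ++i)
+        dist += (a[i] > b[i]) ? (a[i] - b[i]) : (b[i] - a[i]);
+    return dist;
+}
+
+/* initialization++: pick the first centroid uniformly at random, then draw each
+   next centroid with probability proportional to a point's distance from its
+   nearest already-chosen centroid */
+std::vector<std::vector<uint8_t>>
+init_plus_plus(const std::vector<std::vector<uint8_t>> &points, size_t k)
+{
+    std::vector<std::vector<uint8_t>> centroids;
+    centroids.push_back(points[std::rand() % points.size()]);
+
+    while (centroids.size() < k) {
+        std::vector<double> min_dist(points.size());
+        double total = 0.0;
+        for (size_t i = 0; i != points.size(); ++i) {
+            double best = l1_distance(points[i], centroids[0]);
+            for (size_t c = 1; c != centroids.size(); ++c)
+                best = std::min(best, (double) l1_distance(points[i], centroids[c]));
+            min_dist[i] = best;
+            total += best;
+        }
+        /* sample an index with probability min_dist[i] / total; points that are
+           already centroids have weight 0 and can never be drawn again */
+        double coin = total * (std::rand() / (double) RAND_MAX);
+        size_t chosen = 0;
+        for (double acc = min_dist[0]; acc < coin && chosen + 1 < points.size(); )
+            acc += min_dist[++chosen];
+        centroids.push_back(points[chosen]);
+    }
+    return centroids;
+}
+```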
 
 # Execution
 
@@ -68,7 +79,28 @@ image_number_A
 image_number_B
 .
 .
 .
 image_number_Z
+```
+
+Finally, for **clustering**, run the following commands:
+```
+$ cd src/cluster
+$ make
+$ ./cluster -i ../../datasets/train-images-idx3-ubyte -c ../../include/cluster/cluster.conf
+  -o <output file> --complete -m <method>
+```
 
-# Vector-Clustering-Algorithms
-In this application we are called to implement clustering algorithms for vectors using Locality Sensitive Hashing. Also, the goal is to reduce the dimensionality with random projection on Hypercube.
+The formatted output will be written to the output file specified by the user.
+This is what it looks like:
+```
+Algorithm: Lloyds OR Range Search LSH OR Range Search Hypercube
+CLUSTER-1 {size: <int>, centroid: array with the centroid's components}
+...
+CLUSTER-K {size: <int>, centroid: array with the centroid's components}
+clustering_time: <double> // in seconds
+Silhouette: [s1,...,si,...,sK, stotal]
+// si = average s(p) of points in cluster i, stotal = average s(p) of points in dataset
+// optionally, with command line parameter --complete:
+CLUSTER-1 {centroid, image_numberA, ..., image_numberX}
+...
+CLUSTER-K {centroid, image_numberR, ..., image_numberZ}
+```
diff --git a/include/modules/hypercube/hypercube.h b/include/modules/hypercube/hypercube.h
index 2dc5aa7..d7b23c1 100644
--- a/include/modules/hypercube/hypercube.h
+++ b/include/modules/hypercube/hypercube.h
@@ -223,7 +223,7 @@ class Hypercube {
 
     }
 
-    std::vector<size_t> range_search(const std::vector<uint8_t> &query, const std::vector<std::vector<uint8_t>> &train_samples)
+    std::vector<size_t> range_search(const std::vector<uint8_t> &query, const std::vector<std::vector<uint8_t>> &train_samples, float r = 0.0)
     {
         /* vector to store query's nearest neighbors; only store the training index this time */
         std::vector<size_t> candidates;
         uint32_t dist = 0;
         uint32_t cnt = projection_dimension;
         uint16_t M = max_candidates;
         uint16_t probes = max_probes;
@@ -234,45 +234,8 @@ class Hypercube {
         uint8_t bits = 1;
         size_t bucket_count = 0;
 
-        /* project query to a cube vertex / hash table bucket */
-        const std::string key = cube_projection_test(query);
-        std::string key1 = key;
-
-        while (M > 0) {
-            if (probes > 0) {
-                bucket_count = hash_table[key1].size();
-                const std::vector<size_t> &indexes = hash_table[key1];
-                for (size_t i = 0; (i != bucket_count) && (M > 0); ++i, --M) {
-                    dist = manhattan_distance_rd(query, train_samples[indexes[i]]);
-                    if (dist < C * R) {    // average distance is 20 000 - 35 000
-                        candidates.push_back(indexes[i]);
-                    }
-                }
-
-                /* generate a "nearby" vertex using hamming distance (hamming distance = 1, then hamming distance = 2, etc) */
-                key1 = gen_nearby_vertex(key, cnt, bits);
-                /* can't generate hamming distance = x > cube dimension */
-                if (bits > projection_dimension) break;
-                --probes;
-            }
-            else
-                break;
-        }
-
-        return candidates;
-    }
-
-
-    std::vector<size_t> range_search(const std::vector<uint8_t> &query, const std::vector<std::vector<uint8_t>> &train_samples, double r)
-    {
-        /* vector to store query's nearest neighbors; only store the training index this time */
-        std::vector<size_t> candidates;
-        uint32_t dist = 0;
-        uint32_t cnt = projection_dimension;
-        uint16_t M = max_candidates;
-        uint16_t probes = max_probes;
-        uint8_t bits = 1;
-        size_t bucket_count = 0;
+        if (r == 0.0)
+            r = R;
 
         /* project query to a cube vertex / hash table bucket */
         const std::string key = cube_projection_test(query);
         std::string key1 = key;
@@ -284,7 +247,7 @@ class Hypercube {
                 const std::vector<size_t> &indexes = hash_table[key1];
                 for (size_t i = 0; (i != bucket_count) && (M > 0); ++i, --M) {
                     dist = manhattan_distance_rd(query, train_samples[indexes[i]]);
-                    if (dist < C * r) {    // average distance is 20.000 - 35.000
+                    if (dist < C * r) {    // average distance is 20 000 - 35 000
                         candidates.push_back(indexes[i]);
                     }
                 }
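
For reference, folding the radius into a single defaulted parameter is what lets the second
range_search overload above be deleted: the sentinel r = 0.0 now means "fall back to the
configured radius R". A minimal sketch of the resulting call sites (the include path, the
find_in_range wrapper and the 25000.0f radius are illustrative; constructing the Hypercube
object is not shown in this patch, so the object is taken as a parameter here):
```
#include <cstdint>
#include <vector>
#include "modules/hypercube/hypercube.h"   // assumed include path, relative to include/

std::vector<size_t> find_in_range(Hypercube &cube,
                                  const std::vector<uint8_t> &query,
                                  const std::vector<std::vector<uint8_t>> &train_samples)
{
    /* r omitted: the merged overload falls back to the default radius R */
    std::vector<size_t> with_default = cube.range_search(query, train_samples);

    /* explicit radius overrides the default */
    std::vector<size_t> wider = cube.range_search(query, train_samples, 25000.0f);

    return wider;
}
```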