Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 50 additions & 47 deletions include/dkm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#ifndef DKM_KMEANS_H
#define DKM_KMEANS_H

#include "dkm_matrix.hpp"

#include <algorithm>
#include <array>
#include <cassert>
Expand All @@ -28,33 +30,33 @@ namespace details {
/*
Calculate the square of the distance between two points.
*/
template <typename T, size_t N>
T distance_squared(const std::array<T, N>& point_a, const std::array<T, N>& point_b) {
template <typename T>
T distance_squared(const std::vector<T>& point_a, const std::vector<T>& point_b) {
T d_squared = T();
for (typename std::array<T, N>::size_type i = 0; i < N; ++i) {
for (typename std::vector<T>::size_type i = 0; i < point_a.size(); ++i) {
auto delta = point_a[i] - point_b[i];
d_squared += delta * delta;
}
return d_squared;
}

template <typename T, size_t N>
T distance(const std::array<T, N>& point_a, const std::array<T, N>& point_b) {
template <typename T>
T distance(const std::vector<T>& point_a, const std::vector<T>& point_b) {
return std::sqrt(distance_squared(point_a, point_b));
}

/*
Calculate the smallest distance between each of the data points and any of the input means.
*/
template <typename T, size_t N>
template <typename T>
std::vector<T> closest_distance(
const std::vector<std::array<T, N>>& means, const std::vector<std::array<T, N>>& data) {
const std::vector<std::vector<T>>& means, const dkm::as_matrix<T>& data) {
std::vector<T> distances;
distances.reserve(data.size());
for (auto& d : data) {
T closest = distance_squared(d, means[0]);
distances.reserve(data.n_rows);
for (size_t i = 0; i < data.n_rows; i++) {
T closest = distance_squared(data.row(i), means[0]);
for (auto& m : means) {
T distance = distance_squared(d, m);
T distance = distance_squared(data.row(i), m);
if (distance < closest)
closest = distance;
}
Expand All @@ -67,20 +69,20 @@ std::vector<T> closest_distance(
This is an alternate initialization method based on the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B)
initialization algorithm.
*/
template <typename T, size_t N>
std::vector<std::array<T, N>> random_plusplus(const std::vector<std::array<T, N>>& data, uint32_t k, uint64_t seed) {
template <typename T>
std::vector<std::vector<T>> random_plusplus(const dkm::as_matrix<T>& data, uint32_t k, uint64_t seed) {
assert(k > 0);
assert(data.size() > 0);
using input_size_t = typename std::array<T, N>::size_type;
std::vector<std::array<T, N>> means;
assert(data.n_rows > 0);
using input_size_t = typename std::vector<T>::size_type;
std::vector<std::vector<T>> means;
// Using a very simple PRBS generator, parameters selected according to
// https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use
std::linear_congruential_engine<uint64_t, 6364136223846793005, 1442695040888963407, UINT64_MAX> rand_engine(seed);

// Select first mean at random from the set
{
std::uniform_int_distribution<input_size_t> uniform_generator(0, data.size() - 1);
means.push_back(data[uniform_generator(rand_engine)]);
std::uniform_int_distribution<input_size_t> uniform_generator(0, data.n_rows - 1);
means.push_back(data.row(uniform_generator(rand_engine)));
}

for (uint32_t count = 1; count < k; ++count) {
Expand All @@ -94,19 +96,19 @@ std::vector<std::array<T, N>> random_plusplus(const std::vector<std::array<T, N>
input_size_t i = 0;
std::discrete_distribution<input_size_t> generator(distances.size(), 0.0, 0.0, [&distances, &i](double) { return distances[i++]; });
#endif
means.push_back(data[generator(rand_engine)]);
means.push_back(data.row(generator(rand_engine)));
}
return means;
}

/*
Calculate the index of the mean a particular data point is closest to (euclidean distance)
*/
template <typename T, size_t N>
uint32_t closest_mean(const std::array<T, N>& point, const std::vector<std::array<T, N>>& means) {
template <typename T>
uint32_t closest_mean(const std::vector<T>& point, const std::vector<std::vector<T>>& means) {
assert(!means.empty());
T smallest_distance = distance_squared(point, means[0]);
typename std::array<T, N>::size_type index = 0;
typename std::vector<T>::size_type index = 0;
T distance;
for (size_t i = 1; i < means.size(); ++i) {
distance = distance_squared(point, means[i]);
Expand All @@ -121,31 +123,32 @@ uint32_t closest_mean(const std::array<T, N>& point, const std::vector<std::arra
/*
Calculate the index of the mean each data point is closest to (euclidean distance).
*/
template <typename T, size_t N>
template <typename T>
std::vector<uint32_t> calculate_clusters(
const std::vector<std::array<T, N>>& data, const std::vector<std::array<T, N>>& means) {
const dkm::as_matrix<T>& data, const std::vector<std::vector<T>>& means) {
std::vector<uint32_t> clusters;
for (auto& point : data) {
clusters.push_back(closest_mean(point, means));
for (size_t i = 0; i < data.n_rows; i++) {
clusters.push_back(closest_mean(data.row(i), means));
}
return clusters;
}

/*
Calculate means based on data points and their cluster assignments.
*/
template <typename T, size_t N>
std::vector<std::array<T, N>> calculate_means(const std::vector<std::array<T, N>>& data,
template <typename T>
std::vector<std::vector<T>> calculate_means(const dkm::as_matrix<T>& data,
const std::vector<uint32_t>& clusters,
const std::vector<std::array<T, N>>& old_means,
const std::vector<std::vector<T>>& old_means,
uint32_t k) {
std::vector<std::array<T, N>> means(k);
std::vector<std::vector<T>> means(k);
std::vector<T> count(k, T());
for (size_t i = 0; i < std::min(clusters.size(), data.size()); ++i) {
for (size_t i = 0; i < std::min(clusters.size(), data.n_rows); ++i) {
auto& mean = means[clusters[i]];
mean = (mean.empty())? std::vector<T>(data.n_cols): mean;
count[clusters[i]] += 1;
for (size_t j = 0; j < std::min(data[i].size(), mean.size()); ++j) {
mean[j] += data[i][j];
for (size_t j = 0; j < std::min(data.n_cols, mean.size()); ++j) {
mean[j] += data(i, j);
}
}
for (size_t i = 0; i < k; ++i) {
Expand All @@ -160,9 +163,9 @@ std::vector<std::array<T, N>> calculate_means(const std::vector<std::array<T, N>
return means;
}

template <typename T, size_t N>
template <typename T>
std::vector<T> deltas(
const std::vector<std::array<T, N>>& old_means, const std::vector<std::array<T, N>>& means)
const std::vector<std::vector<T>>& old_means, const std::vector<std::vector<T>>& means)
{
std::vector<T> distances;
distances.reserve(means.size());
Expand Down Expand Up @@ -267,19 +270,19 @@ with the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B)
used for initializing the means.

*/
template <typename T, size_t N>
std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>> kmeans_lloyd(
const std::vector<std::array<T, N>>& data, const clustering_parameters<T>& parameters) {
template <typename T>
std::tuple<std::vector<std::vector<T>>, std::vector<uint32_t>> kmeans_lloyd(
const dkm::as_matrix<T>& data, const clustering_parameters<T>& parameters) {
static_assert(std::is_arithmetic<T>::value && std::is_signed<T>::value,
"kmeans_lloyd requires the template parameter T to be a signed arithmetic type (e.g. float, double, int)");
assert(parameters.get_k() > 0); // k must be greater than zero
assert(data.size() >= parameters.get_k()); // there must be at least k data points
assert(data.n_rows >= parameters.get_k()); // there must be at least k data points
std::random_device rand_device;
uint64_t seed = parameters.has_random_seed() ? parameters.get_random_seed() : rand_device();
std::vector<std::array<T, N>> means = details::random_plusplus(data, parameters.get_k(), seed);
std::vector<std::vector<T>> means = details::random_plusplus(data, parameters.get_k(), seed);

std::vector<std::array<T, N>> old_means;
std::vector<std::array<T, N>> old_old_means;
std::vector<std::vector<T>> old_means;
std::vector<std::vector<T>> old_old_means;
std::vector<uint32_t> clusters;
// Calculate new means until convergence is reached or we hit the maximum iteration count
uint64_t count = 0;
Expand All @@ -293,18 +296,18 @@ std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>> kmeans_lloyd(
&& !(parameters.has_max_iteration() && count == parameters.get_max_iteration())
&& !(parameters.has_min_delta() && details::deltas_below_limit(details::deltas(old_means, means), parameters.get_min_delta())));

return std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>>(means, clusters);
return std::tuple<std::vector<std::vector<T>>, std::vector<uint32_t>>(means, clusters);
}

/*
This overload exists to support legacy code which uses this signature of the kmeans_lloyd function.
Any code still using this signature should move to the version of this function that uses a
`clustering_parameters` struct for configuration.
*/
template <typename T, size_t N>
std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>> kmeans_lloyd(
const std::vector<std::array<T, N>>& data, uint32_t k,
uint64_t max_iter = 0, T min_delta = -1.0) {
template <typename T>
std::tuple<std::vector<std::vector<T>>, std::vector<uint32_t>> kmeans_lloyd(
const dkm::as_matrix<T>& data, uint32_t k,
uint64_t max_iter = 0, T min_delta = -1.0) {
clustering_parameters<T> parameters(k);
if (max_iter != 0) {
parameters.set_max_iteration(max_iter);
Expand Down
63 changes: 63 additions & 0 deletions include/dkm_matrix.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#pragma once

#ifndef DKM_MATRIX_HPP
#define DKM_MATRIX_HPP

#include <vector>
#include <cassert>
#include <algorithm>

namespace dkm
{
// This class is only an interface! Not designed to be used outside library internals.

template<typename T>
class as_matrix
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be fine to just name this matrix.

Copy link
Author

@Eleobert Eleobert Mar 1, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name comes from the fact that we are dealing with the data "as a matrix". But as_matrix is in fact not even a matrix (in common sense), it is only an interface between the algorithm and the data. I think the name as_matrix expresses better the idea.

{
const T *data = nullptr;

// column major indexer
auto cm_indexer(size_t i, size_t j) const -> const T&
{
assert(i >= 0 && j >= 0 && i < n_rows && j < n_cols);
return data[j * n_rows + i];
}

// row major indexer
auto rm_indexer(size_t i, size_t j) const -> const T&
{
assert(i >= 0 && j >= 0 && i < n_rows && j < n_cols);
return data[i * n_cols + j];
}

const T& (as_matrix<T>::*indexer) (size_t, size_t) const = nullptr;

public:
const size_t n_rows, n_cols;

as_matrix(const T *data, size_t n_rows, size_t n_cols, bool col_major = true)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be great to see a constructor for conversion from the existing vector<array<T, N>> type as well to ease migration for existing users.

Copy link
Author

@Eleobert Eleobert Mar 1, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can instead overload the main function and mark the old version as [[deprecated]]. What do you think?

Copy link
Owner

@genbattle genbattle Mar 3, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This approach is also acceptable, as long as [[deprecated]] will be ignored by older compilers.

: data(data), n_rows(n_rows), n_cols(n_cols),
indexer((col_major)? &as_matrix<T>::cm_indexer: &as_matrix<T>::rm_indexer)
{}

auto row(size_t i) const -> std::vector<T>;
auto operator()(size_t i, size_t j) const -> const T&
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer this was just a named function like get.

{
return ((this)->*(this->indexer))(i, j);
}
};


template<typename T>
auto as_matrix<T>::row(size_t i) const -> std::vector<T>
{
auto res = std::vector<T>(n_cols);
for(size_t j = 0; j < n_cols; j++)
{
res[j] = (*this)(i, j);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of the reasons this library is so performant is that it avoids allocations and copies wherever possible. This is a full copy of each and every row in the input data, multiple times. It will have a significant effect on performance.

The way to do this performantly and safely would be to return a std::span which points to the internal data. Given this library currently only requires C++11, the solution is probably returning a pointer to the row or a custom struct which emulates the behavior of std::span.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had been implementing a new version that takes this into account. The idea is whenever the matrix is column major we first copy (because we cannot change the incoming data) and transpose it.

The copy would work like this:

auto [owner, data] = copy(matrix);

From here we can safely return the span. See that the matrix data structure never owns the data, so we need to return an owner (arguably unique_ptr) from the copy function.

I plan to implement this when my project reaches the optimization stage. Here is an implementation I was already working on https://pastebin.com/DYQs2EBb

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can transposition not just produce a new as_matrix with the correct indexer if we need to switch to column instead of row major.

There's no reason to have any solution here that includes any form of copying.

Copy link
Author

@Eleobert Eleobert Mar 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The transposition is performed only once right after the function is called. I didn't measure the performance yet but I think that the overhead is insignificant.

The idea with transposition is that with column major data the next element of a given row is at a distance of n_rows. It is more performant and easier to work with if we keep the elements at a distance of 1.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My argument was mainly that in this case the representation of the data doesn't need to change at all, we just need need a different view over the data, which seems to be the whole point of as_matrix (an abstracted view over the data).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand your point, my only concern is that in the case of row major data to get a row we can easily return std::span(ptr, ncols). But in case of column major it is not possible without adding more complexity. To avoid copying the alternative I can think of would have to be a sort of row_vec whose the indexing would work as following:

auto row_vec::operator()(size_t i)
{
    return data[i * n_rows + this->row_number]
}

From my experience the transposing the data + changing from column major to row major is simpler and faster, specially when the input data is large enough. But I am not completely sure if this is true in fact. What is your opinion on this?

}
return res;
}

}
#endif