Feat: implement Random Projections #332

Merged (8 commits) on Mar 30, 2024. Changes from all commits.
9 changes: 8 additions & 1 deletion algorithms/linfa-reduction/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "linfa-reduction"
version = "0.7.0"
-authors = ["Lorenz Schmidt <bytesnake@mailbox.org>"]
+authors = [
+    "Lorenz Schmidt <bytesnake@mailbox.org>",
+    "Gabriel Bathie <gbathie.dev@gmail.com>",
+]
description = "A collection of dimensionality reduction techniques"
edition = "2018"
license = "MIT OR Apache-2.0"
@@ -41,6 +44,8 @@ rand = { version = "0.8", features = ["small_rng"] }

linfa = { version = "0.7.0", path = "../.." }
linfa-kernel = { version = "0.7.0", path = "../linfa-kernel" }
sprs = "0.11.1"
rand_xoshiro = "0.6.0"

[dev-dependencies]
ndarray-npy = { version = "0.8", default-features = false }
@@ -49,3 +54,5 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [
"generate",
] }
approx = { version = "0.4" }
mnist = { version = "0.6.0", features = ["download"] }
linfa-trees = { version = "0.7.0", path = "../linfa-trees"}
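The two new dependencies above support the feature this PR adds: `sprs` provides sparse matrices for the sparse projection variant, and `rand_xoshiro` a cheap seedable RNG. As a quick orientation before the per-file diffs, here is a minimal usage sketch assembled from the examples and trait impls shown below. It is not code from the PR, and the synthetic data is invented for illustration:

```rust
use linfa::prelude::*;
use linfa::DatasetBase;
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Array2;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Synthetic data: 50 samples with 20 features each.
    let records = Array2::from_shape_fn((50, 20), |(i, j)| (i + j) as f32 / 10.0);
    let dataset = DatasetBase::from(records);

    // Fix the output dimension explicitly instead of relying on the
    // default `eps = 0.1` sizing.
    let proj = GaussianRandomProjection::<f32>::params()
        .target_dim(5)
        .fit(&dataset)?;
    let reduced = proj.transform(dataset.records());
    assert_eq!(reduced.dim(), (50, 5));
    Ok(())
}
```

The `SparseRandomProjection` type exercised in the second example follows the same `params`/`target_dim`/`fit`/`transform` pattern.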
4 changes: 4 additions & 0 deletions algorithms/linfa-reduction/README.md
@@ -11,6 +11,8 @@
`linfa-reduction` currently provides an implementation of the following dimensionality reduction methods:
- Diffusion Mapping
- Principal Component Analysis (PCA)
- Gaussian random projections
- Sparse random projections

## Examples

@@ -19,6 +21,8 @@ There are usage examples in the `examples/` directory. To run one, use:
```bash
$ cargo run --release --example diffusion_map
$ cargo run --release --example pca
$ cargo run --release --example gaussian_projection
$ cargo run --release --example sparse_projection
```

## BLAS/LAPACK backend
76 changes: 76 additions & 0 deletions algorithms/linfa-reduction/examples/gaussian_projection.rs
@@ -0,0 +1,76 @@
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a decision tree on the MNIST dataset, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
// Parameters
let train_sz = 10_000usize;
let test_sz = 1_000usize;
let reduced_dim = 100;
let rng = Xoshiro256Plus::seed_from_u64(42);

let NormalizedMnist {
trn_img,
trn_lbl,
tst_img,
tst_lbl,
..
} = MnistBuilder::new()
.label_format_digit()
.training_set_length(train_sz as u32)
.test_set_length(test_sz as u32)
.download_and_extract()
.finalize()
.normalize();

let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
let train_labels: Array1<usize> =
Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
let train_dataset = Dataset::new(train_data, train_labels);

let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

let params = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(10));

println!("Training non-reduced model...");
let start = Instant::now();
let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

let end = start.elapsed();
let pred_y = model.predict(&test_data);
let cm = pred_y.confusion_matrix(&test_labels)?;
println!("Non-reduced model accuracy: {}%", 100.0 * cm.accuracy());
println!("Training time: {:.2}s\n", end.as_secs_f32());

println!("Training reduced model...");
let start = Instant::now();
// Compute the random projection and train the model on the reduced dataset.
let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
.target_dim(reduced_dim)
.fit(&train_dataset)?;
let reduced_train_ds = proj.transform(&train_dataset);
let reduced_test_data = proj.transform(&test_data);
let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

let end = start.elapsed();
let pred_reduced = model_reduced.predict(&reduced_test_data);
let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
println!(
"Reduced model accuracy: {}%",
100.0 * cm_reduced.accuracy()
);
println!("Reduction + training time: {:.2}s", end.as_secs_f32());

Ok(())
}
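The projection matrix behind `GaussianRandomProjection` is generated in `methods.rs`, which this diff does not include. The `NormalError` variant added to `error.rs` below suggests the standard construction with i.i.d. normal entries of variance `1/n_dims`; the following is a sketch of that textbook construction, not the PR's actual code:

```rust
use ndarray::Array2;
use ndarray_rand::{rand_distr::Normal, RandomExt};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Entries drawn i.i.d. from N(0, 1/n_dims) preserve squared pairwise
/// distances in expectation after projection. Sketch only; assumes the
/// PR's `methods.rs` uses this textbook construction.
fn gaussian_projection_matrix(n_features: usize, n_dims: usize) -> Array2<f64> {
    let mut rng = Xoshiro256Plus::seed_from_u64(42);
    let std_dev = (1.0 / n_dims as f64).sqrt();
    let normal = Normal::new(0.0, std_dev).expect("positive standard deviation");
    Array2::random_using((n_features, n_dims), normal, &mut rng)
}

fn main() {
    let proj = gaussian_projection_matrix(784, 100);
    assert_eq!(proj.dim(), (784, 100));
}
```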
76 changes: 76 additions & 0 deletions algorithms/linfa-reduction/examples/sparse_projection.rs
@@ -0,0 +1,76 @@
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::SparseRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a decision tree on the MNIST dataset, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
// Parameters
let train_sz = 10_000usize;
let test_sz = 1_000usize;
let reduced_dim = 100;
let rng = Xoshiro256Plus::seed_from_u64(42);

let NormalizedMnist {
trn_img,
trn_lbl,
tst_img,
tst_lbl,
..
} = MnistBuilder::new()
.label_format_digit()
.training_set_length(train_sz as u32)
.test_set_length(test_sz as u32)
.download_and_extract()
.finalize()
.normalize();

let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
let train_labels: Array1<usize> =
Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
let train_dataset = Dataset::new(train_data, train_labels);

let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

let params = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(10));

println!("Training non-reduced model...");
let start = Instant::now();
let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

let end = start.elapsed();
let pred_y = model.predict(&test_data);
let cm = pred_y.confusion_matrix(&test_labels)?;
println!("Non-reduced model accuracy: {}%", 100.0 * cm.accuracy());
println!("Training time: {:.2}s\n", end.as_secs_f32());

println!("Training reduced model...");
let start = Instant::now();
// Compute the random projection and train the model on the reduced dataset.
let proj = SparseRandomProjection::<f32>::params_with_rng(rng)
.target_dim(reduced_dim)
.fit(&train_dataset)?;
let reduced_train_ds = proj.transform(&train_dataset);
let reduced_test_data = proj.transform(&test_data);
let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

let end = start.elapsed();
let pred_reduced = model_reduced.predict(&reduced_test_data);
let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
println!(
"Reduced model accuracy: {}%",
100.0 * cm_reduced.accuracy()
);
println!("Reduction + training time: {:.2}s", end.as_secs_f32());

Ok(())
}
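Likewise, the sparse projection matrix is built in `methods.rs` (not shown). The usual scheme (Achlioptas 2003; Ping Li et al. 2006), which the new `sprs` dependency is presumably there to store, zeroes out all but roughly a `1/s` fraction of entries with `s = sqrt(n_features)`, setting survivors to `+/- sqrt(s / n_dims)` with equal probability. A hedged sketch of that scheme:

```rust
use rand::{Rng, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;
use sprs::{CsMat, TriMat};

/// Sparse projection matrix: entries are 0 with probability 1 - 1/s and
/// +/- sqrt(s / n_dims) with probability 1/(2s) each, with s = sqrt(n_features).
/// Sketch of the usual scheme; the PR's `methods.rs` may differ in details.
fn sparse_projection_matrix(n_features: usize, n_dims: usize) -> CsMat<f64> {
    let mut rng = Xoshiro256Plus::seed_from_u64(42);
    let s = (n_features as f64).sqrt();
    let density = 1.0 / s;
    let scale = (s / n_dims as f64).sqrt();
    let mut triplets = TriMat::new((n_features, n_dims));
    for i in 0..n_features {
        for j in 0..n_dims {
            let u: f64 = rng.gen();
            if u < density / 2.0 {
                triplets.add_triplet(i, j, scale);
            } else if u < density {
                triplets.add_triplet(i, j, -scale);
            }
        }
    }
    triplets.to_csr()
}

fn main() {
    let proj = sparse_projection_matrix(784, 100);
    // Roughly a 1/sqrt(784) ~ 3.6% fill rate.
    println!("nnz = {}", proj.nnz());
}
```

Storing only the nonzero entries is what makes the sparse variant cheaper to generate and to multiply than the dense Gaussian matrix.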
8 changes: 8 additions & 0 deletions algorithms/linfa-reduction/src/error.rs
@@ -18,4 +18,12 @@ pub enum ReductionError {
LinalgError(#[from] linfa_linalg::LinalgError),
#[error(transparent)]
LinfaError(#[from] linfa::error::Error),
#[error(transparent)]
NdarrayRandError(#[from] ndarray_rand::rand_distr::NormalError),
#[error("Precision parameter must be in the interval (0; 1)")]
InvalidPrecision,
#[error("Target dimension of the projection must be positive")]
NonPositiveEmbeddingSize,
#[error("Target dimension {0} is larger than the number of features {1}.")]
DimensionIncrease(usize, usize),
}
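To see when the new variants fire, here is a small illustration (not a test from the PR) of the `DimensionIncrease` check performed by `fit` in `algorithms.rs` below:

```rust
use linfa::prelude::*;
use linfa::DatasetBase;
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Array2;

fn main() {
    // 10 samples with 5 features: projecting *up* to 8 dimensions is
    // rejected, because a random projection can only reduce dimension.
    let dataset = DatasetBase::from(Array2::<f32>::zeros((10, 5)));
    let result = GaussianRandomProjection::<f32>::params()
        .target_dim(8)
        .fit(&dataset);
    // Expected error: ReductionError::DimensionIncrease(8, 5).
    assert!(result.is_err());
}
```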
1 change: 1 addition & 0 deletions algorithms/linfa-reduction/src/lib.rs
@@ -6,6 +6,7 @@ extern crate ndarray;
mod diffusion_map;
mod error;
mod pca;
pub mod random_projection;
pub mod utils;

pub use diffusion_map::{DiffusionMap, DiffusionMapParams, DiffusionMapValidParams};
169 changes: 169 additions & 0 deletions algorithms/linfa-reduction/src/random_projection/algorithms.rs
@@ -0,0 +1,169 @@
use std::marker::PhantomData;

use linfa::{
dataset::{AsTargets, FromTargetArray},
prelude::Records,
traits::{Fit, Transformer},
DatasetBase, Float,
};
use ndarray::{linalg::Dot, Array2, ArrayBase, Data, Ix2};

use rand::{prelude::Distribution, Rng, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;

use super::hyperparams::RandomProjectionParamsInner;
use super::{common::johnson_lindenstrauss_min_dim, methods::ProjectionMethod};
use super::{RandomProjectionParams, RandomProjectionValidParams};
use crate::ReductionError;

/// Embedding via random projection
pub struct RandomProjection<Proj: ProjectionMethod, F: Float>
where
Proj::RandomDistribution: Distribution<F>,
{
projection: Proj::ProjectionMatrix<F>,
}

impl<F, Proj, Rec, T, R> Fit<Rec, T, ReductionError> for RandomProjectionValidParams<Proj, R>
where
F: Float,
Proj: ProjectionMethod,
Rec: Records<Elem = F>,
R: Rng + Clone,
Proj::RandomDistribution: Distribution<F>,
{
type Object = RandomProjection<Proj, F>;

fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
let n_samples = dataset.nsamples();
let n_features = dataset.nfeatures();
let mut rng = self.rng.clone();

let n_dims = match &self.params {
RandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
RandomProjectionParamsInner::Epsilon { eps } => {
johnson_lindenstrauss_min_dim(n_samples, *eps)
}
};

if n_dims > n_features {
return Err(ReductionError::DimensionIncrease(n_dims, n_features));
}

let projection = Proj::generate_matrix(n_features, n_dims, &mut rng)?;

Ok(RandomProjection { projection })
}
}

impl<Proj: ProjectionMethod, F: Float> RandomProjection<Proj, F>
where
Proj::RandomDistribution: Distribution<F>,
{
/// Create new parameters for a [`RandomProjection`] with default value
/// `eps = 0.1` and a [`Xoshiro256Plus`] RNG.
pub fn params() -> RandomProjectionParams<Proj, Xoshiro256Plus> {
RandomProjectionParams(RandomProjectionValidParams {
params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
rng: Xoshiro256Plus::seed_from_u64(42),
marker: PhantomData,
})
}

/// Create new parameters for a [`RandomProjection`] with default value
/// `eps = 0.1` and the provided [`Rng`].
pub fn params_with_rng<R>(rng: R) -> RandomProjectionParams<Proj, R>
where
R: Rng + Clone,
{
RandomProjectionParams(RandomProjectionValidParams {
params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
rng,
marker: PhantomData,
})
}
}

impl<Proj, F, D> Transformer<&ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
where
Proj: ProjectionMethod,
F: Float,
D: Data<Elem = F>,
ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
Proj::RandomDistribution: Distribution<F>,
{
/// Compute the embedding of a two-dimensional array
fn transform(&self, x: &ArrayBase<D, Ix2>) -> Array2<F> {
x.dot(&self.projection)
}
}

impl<Proj, F, D> Transformer<ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
where
Proj: ProjectionMethod,
F: Float,
D: Data<Elem = F>,
ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
Proj::RandomDistribution: Distribution<F>,
{
/// Compute the embedding of a two-dimensional array
fn transform(&self, x: ArrayBase<D, Ix2>) -> Array2<F> {
self.transform(&x)
}
}

impl<Proj, F, T> Transformer<DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T>>
for RandomProjection<Proj, F>
where
Proj: ProjectionMethod,
F: Float,
T: AsTargets,
for<'a> ArrayBase<ndarray::ViewRepr<&'a F>, Ix2>:
Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
Proj::RandomDistribution: Distribution<F>,
{
/// Compute the embedding of a dataset
///
/// # Parameter
///
/// * `data`: a dataset
///
/// # Returns
///
/// New dataset, with data equal to the embedding of the input data
fn transform(&self, data: DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T> {
let new_records = self.transform(data.records().view());

DatasetBase::new(new_records, data.targets)
}
}

impl<'a, Proj, F, L, T> Transformer<&'a DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T::View>>
for RandomProjection<Proj, F>
where
Proj: ProjectionMethod,
F: Float,
L: 'a,
T: AsTargets<Elem = L> + FromTargetArray<'a>,
for<'b> ArrayBase<ndarray::ViewRepr<&'b F>, Ix2>:
Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
Proj::RandomDistribution: Distribution<F>,
{
/// Compute the embedding of a dataset
///
/// # Parameter
///
/// * `data`: a dataset
///
/// # Returns
///
/// New dataset, with data equal to the embedding of the input data
fn transform(&self, data: &'a DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T::View> {
let new_records = self.transform(data.records().view());

DatasetBase::new(
new_records,
T::new_targets_view(AsTargets::as_targets(data)),
)
}
}
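When no explicit `target_dim` is given, `fit` sizes the embedding with `johnson_lindenstrauss_min_dim`, defined in `common.rs` and not shown in this diff. The standard bound (also used by scikit-learn) is `n_dims >= 4 * ln(n_samples) / (eps^2 / 2 - eps^3 / 3)`; assuming `common.rs` follows it, a sketch:

```rust
/// Minimum embedding dimension preserving pairwise distances of
/// `n_samples` points up to a factor (1 +/- eps), per the
/// Johnson-Lindenstrauss lemma. Sketch of the standard bound; the
/// crate's `common::johnson_lindenstrauss_min_dim` is not in this diff.
fn johnson_lindenstrauss_min_dim(n_samples: usize, eps: f64) -> usize {
    let denominator = eps.powi(2) / 2.0 - eps.powi(3) / 3.0;
    ((4.0 * (n_samples as f64).ln()) / denominator).ceil() as usize
}

fn main() {
    // With the default eps = 0.1, 10_000 samples already need ~7895
    // output dimensions, far more than MNIST's 784 input features.
    // That is why both examples above set target_dim(100) explicitly.
    println!("{}", johnson_lindenstrauss_min_dim(10_000, 0.1));
}
```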