diff --git a/algorithms/linfa-reduction/Cargo.toml b/algorithms/linfa-reduction/Cargo.toml
index 9e99889c5..967dfebdf 100644
--- a/algorithms/linfa-reduction/Cargo.toml
+++ b/algorithms/linfa-reduction/Cargo.toml
@@ -1,7 +1,10 @@
 [package]
 name = "linfa-reduction"
 version = "0.7.0"
-authors = ["Lorenz Schmidt "]
+authors = [
+    "Lorenz Schmidt ",
+    "Gabriel Bathie ",
+]
 description = "A collection of dimensionality reduction techniques"
 edition = "2018"
 license = "MIT OR Apache-2.0"
@@ -41,6 +44,8 @@ rand = { version = "0.8", features = ["small_rng"] }
 
 linfa = { version = "0.7.0", path = "../.." }
 linfa-kernel = { version = "0.7.0", path = "../linfa-kernel" }
+sprs = "0.11.1"
+rand_xoshiro = "0.6.0"
 
 [dev-dependencies]
 ndarray-npy = { version = "0.8", default-features = false }
@@ -49,3 +54,5 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [
     "generate",
 ] }
 approx = { version = "0.4" }
+mnist = { version = "0.6.0", features = ["download"] }
+linfa-trees = { version = "0.7.0", path = "../linfa-trees" }
diff --git a/algorithms/linfa-reduction/README.md b/algorithms/linfa-reduction/README.md
index e1c59b1c3..e269df763 100644
--- a/algorithms/linfa-reduction/README.md
+++ b/algorithms/linfa-reduction/README.md
@@ -11,6 +11,8 @@
 `linfa-reduction` currently provides an implementation of the following dimensionality reduction methods:
 - Diffusion Mapping
 - Principal Component Analysis (PCA)
+- Gaussian random projections
+- Sparse random projections
 
 ## Examples
 
@@ -19,6 +21,8 @@ There are usage examples in the `examples/` directory. To run, use:
 ```bash
 $ cargo run --release --example diffusion_map
 $ cargo run --release --example pca
+$ cargo run --release --example gaussian_projection
+$ cargo run --release --example sparse_projection
 ```
 
 ## BLAS/LAPACK backend
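For reviewers who want the shape of the new API without running the MNIST examples below, here is a minimal sketch. The data and shapes (50 samples, 1000 features) are made up for illustration; the `params`, `target_dim`, `fit`, and `transform` calls are the ones introduced in this PR.

```rust
use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Array2;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Arbitrary data: 50 samples with 1000 features each.
    let records = Array2::<f32>::zeros((50, 1000));
    let dataset = Dataset::from(records.clone());

    // Project onto 10 dimensions chosen explicitly; `.eps(...)` would instead
    // derive the dimension from the Johnson-Lindenstrauss bound.
    let proj = GaussianRandomProjection::<f32>::params()
        .target_dim(10)
        .fit(&dataset)?;

    let reduced: Array2<f32> = proj.transform(&records);
    assert_eq!(reduced.ncols(), 10);
    Ok(())
}
```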
diff --git a/algorithms/linfa-reduction/examples/gaussian_projection.rs b/algorithms/linfa-reduction/examples/gaussian_projection.rs
new file mode 100644
index 000000000..9548479a3
--- /dev/null
+++ b/algorithms/linfa-reduction/examples/gaussian_projection.rs
@@ -0,0 +1,76 @@
+use std::{error::Error, time::Instant};
+
+use linfa::prelude::*;
+use linfa_reduction::random_projection::GaussianRandomProjection;
+use linfa_trees::{DecisionTree, SplitQuality};
+
+use mnist::{MnistBuilder, NormalizedMnist};
+use ndarray::{Array1, Array2};
+use rand::SeedableRng;
+use rand_xoshiro::Xoshiro256Plus;
+
+/// Train a decision tree on the MNIST data set, with and without dimensionality reduction.
+fn main() -> Result<(), Box<dyn Error>> {
+    // Parameters
+    let train_sz = 10_000usize;
+    let test_sz = 1_000usize;
+    let reduced_dim = 100;
+    let rng = Xoshiro256Plus::seed_from_u64(42);
+
+    let NormalizedMnist {
+        trn_img,
+        trn_lbl,
+        tst_img,
+        tst_lbl,
+        ..
+    } = MnistBuilder::new()
+        .label_format_digit()
+        .training_set_length(train_sz as u32)
+        .test_set_length(test_sz as u32)
+        .download_and_extract()
+        .finalize()
+        .normalize();
+
+    let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
+    let train_labels: Array1<usize> =
+        Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
+    let train_dataset = Dataset::new(train_data, train_labels);
+
+    let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
+    let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);
+
+    let params = DecisionTree::params()
+        .split_quality(SplitQuality::Gini)
+        .max_depth(Some(10));
+
+    println!("Training non-reduced model...");
+    let start = Instant::now();
+    let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;
+
+    let end = start.elapsed();
+    let pred_y = model.predict(&test_data);
+    let cm = pred_y.confusion_matrix(&test_labels)?;
+    println!("Non-reduced model accuracy: {}%", 100.0 * cm.accuracy());
+    println!("Training time: {:.2}s\n", end.as_secs_f32());
+
+    println!("Training reduced model...");
+    let start = Instant::now();
+    // Compute the random projection and train the model on the reduced dataset.
+    let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
+        .target_dim(reduced_dim)
+        .fit(&train_dataset)?;
+    let reduced_train_ds = proj.transform(&train_dataset);
+    let reduced_test_data = proj.transform(&test_data);
+    let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;
+
+    let end = start.elapsed();
+    let pred_reduced = model_reduced.predict(&reduced_test_data);
+    let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
+    println!(
+        "Reduced model accuracy: {}%",
+        100.0 * cm_reduced.accuracy()
+    );
+    println!("Reduction + training time: {:.2}s", end.as_secs_f32());
+
+    Ok(())
+}
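The example above hardcodes `reduced_dim = 100`. The same fit can instead let the Johnson-Lindenstrauss bound pick the dimension via `eps`. A self-contained sketch with placeholder data of the same shape as the MNIST training set; note the bound is conservative: `eps = 0.5` already requests 442 dimensions, and a smaller `eps` would exceed the 784 available features and fail with `DimensionIncrease`.

```rust
use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Array2;
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 10_000 samples with 784 features, as in the MNIST example above
    // (contents are zeros; only the shape matters for dimension selection).
    let dataset = Dataset::from(Array2::<f32>::zeros((10_000, 784)));

    // eps = 0.5 yields 442 target dimensions via the Johnson-Lindenstrauss bound.
    let rng = Xoshiro256Plus::seed_from_u64(42);
    let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
        .eps(0.5)
        .fit(&dataset)?;

    let reduced = proj.transform(dataset.records());
    assert_eq!(reduced.ncols(), 442);
    Ok(())
}
```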
diff --git a/algorithms/linfa-reduction/examples/sparse_projection.rs b/algorithms/linfa-reduction/examples/sparse_projection.rs
new file mode 100644
index 000000000..d775d34e0
--- /dev/null
+++ b/algorithms/linfa-reduction/examples/sparse_projection.rs
@@ -0,0 +1,76 @@
+use std::{error::Error, time::Instant};
+
+use linfa::prelude::*;
+use linfa_reduction::random_projection::SparseRandomProjection;
+use linfa_trees::{DecisionTree, SplitQuality};
+
+use mnist::{MnistBuilder, NormalizedMnist};
+use ndarray::{Array1, Array2};
+use rand::SeedableRng;
+use rand_xoshiro::Xoshiro256Plus;
+
+/// Train a decision tree on the MNIST data set, with and without dimensionality reduction.
+fn main() -> Result<(), Box<dyn Error>> {
+    // Parameters
+    let train_sz = 10_000usize;
+    let test_sz = 1_000usize;
+    let reduced_dim = 100;
+    let rng = Xoshiro256Plus::seed_from_u64(42);
+
+    let NormalizedMnist {
+        trn_img,
+        trn_lbl,
+        tst_img,
+        tst_lbl,
+        ..
+    } = MnistBuilder::new()
+        .label_format_digit()
+        .training_set_length(train_sz as u32)
+        .test_set_length(test_sz as u32)
+        .download_and_extract()
+        .finalize()
+        .normalize();
+
+    let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
+    let train_labels: Array1<usize> =
+        Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
+    let train_dataset = Dataset::new(train_data, train_labels);
+
+    let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
+    let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);
+
+    let params = DecisionTree::params()
+        .split_quality(SplitQuality::Gini)
+        .max_depth(Some(10));
+
+    println!("Training non-reduced model...");
+    let start = Instant::now();
+    let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;
+
+    let end = start.elapsed();
+    let pred_y = model.predict(&test_data);
+    let cm = pred_y.confusion_matrix(&test_labels)?;
+    println!("Non-reduced model accuracy: {}%", 100.0 * cm.accuracy());
+    println!("Training time: {:.2}s\n", end.as_secs_f32());
+
+    println!("Training reduced model...");
+    let start = Instant::now();
+    // Compute the random projection and train the model on the reduced dataset.
+    let proj = SparseRandomProjection::<f32>::params_with_rng(rng)
+        .target_dim(reduced_dim)
+        .fit(&train_dataset)?;
+    let reduced_train_ds = proj.transform(&train_dataset);
+    let reduced_test_data = proj.transform(&test_data);
+    let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;
+
+    let end = start.elapsed();
+    let pred_reduced = model_reduced.predict(&reduced_test_data);
+    let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
+    println!(
+        "Reduced model accuracy: {}%",
+        100.0 * cm_reduced.accuracy()
+    );
+    println!("Reduction + training time: {:.2}s", end.as_secs_f32());
+
+    Ok(())
+}
diff --git a/algorithms/linfa-reduction/src/error.rs b/algorithms/linfa-reduction/src/error.rs
index 5bf988960..7853f6a2a 100644
--- a/algorithms/linfa-reduction/src/error.rs
+++ b/algorithms/linfa-reduction/src/error.rs
@@ -18,4 +18,12 @@ pub enum ReductionError {
     LinalgError(#[from] linfa_linalg::LinalgError),
     #[error(transparent)]
     LinfaError(#[from] linfa::error::Error),
+    #[error(transparent)]
+    NdarrayRandError(#[from] ndarray_rand::rand_distr::NormalError),
+    #[error("Precision parameter must be in the interval (0; 1)")]
+    InvalidPrecision,
+    #[error("Target dimension of the projection must be positive")]
+    NonPositiveEmbeddingSize,
+    #[error("Target dimension {0} is larger than the number of features {1}.")]
+    DimensionIncrease(usize, usize),
 }
diff --git a/algorithms/linfa-reduction/src/lib.rs b/algorithms/linfa-reduction/src/lib.rs
index 444bfe6b4..f2ddd7961 100644
--- a/algorithms/linfa-reduction/src/lib.rs
+++ b/algorithms/linfa-reduction/src/lib.rs
@@ -6,6 +6,7 @@ extern crate ndarray;
 mod diffusion_map;
 mod error;
 mod pca;
+pub mod random_projection;
 pub mod utils;
 
 pub use diffusion_map::{DiffusionMap, DiffusionMapParams, DiffusionMapValidParams};
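The new error variants can be matched on directly. A small sketch (hypothetical shapes) of the one failure `fit` itself can raise, requesting more output dimensions than input features:

```rust
use linfa::prelude::*;
use linfa_reduction::{random_projection::GaussianRandomProjection, ReductionError};
use ndarray::Array2;

fn main() {
    // 100 samples but only 5 features: projecting "up" to 10 dims is refused.
    let dataset = Dataset::from(Array2::<f64>::zeros((100, 5)));
    let res = GaussianRandomProjection::<f64>::params()
        .target_dim(10)
        .fit(&dataset);
    assert!(matches!(res, Err(ReductionError::DimensionIncrease(10, 5))));
}
```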
diff --git a/algorithms/linfa-reduction/src/random_projection/algorithms.rs b/algorithms/linfa-reduction/src/random_projection/algorithms.rs
new file mode 100644
index 000000000..8e06f9c9e
--- /dev/null
+++ b/algorithms/linfa-reduction/src/random_projection/algorithms.rs
@@ -0,0 +1,169 @@
+use std::marker::PhantomData;
+
+use linfa::{
+    dataset::{AsTargets, FromTargetArray},
+    prelude::Records,
+    traits::{Fit, Transformer},
+    DatasetBase, Float,
+};
+use ndarray::{linalg::Dot, Array2, ArrayBase, Data, Ix2};
+
+use rand::{prelude::Distribution, Rng, SeedableRng};
+use rand_xoshiro::Xoshiro256Plus;
+
+use super::hyperparams::RandomProjectionParamsInner;
+use super::{common::johnson_lindenstrauss_min_dim, methods::ProjectionMethod};
+use super::{RandomProjectionParams, RandomProjectionValidParams};
+use crate::ReductionError;
+
+/// Embedding via random projection
+pub struct RandomProjection<Proj: ProjectionMethod, F: Float>
+where
+    Proj::RandomDistribution: Distribution<F>,
+{
+    projection: Proj::ProjectionMatrix<F>,
+}
+
+impl<F, Proj, Rec, T, R> Fit<Rec, T, ReductionError> for RandomProjectionValidParams<Proj, R>
+where
+    F: Float,
+    Proj: ProjectionMethod,
+    Rec: Records<Elem = F>,
+    R: Rng + Clone,
+    Proj::RandomDistribution: Distribution<F>,
+{
+    type Object = RandomProjection<Proj, F>;
+
+    fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
+        let n_samples = dataset.nsamples();
+        let n_features = dataset.nfeatures();
+        let mut rng = self.rng.clone();
+
+        let n_dims = match &self.params {
+            RandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
+            RandomProjectionParamsInner::Epsilon { eps } => {
+                johnson_lindenstrauss_min_dim(n_samples, *eps)
+            }
+        };
+
+        if n_dims > n_features {
+            return Err(ReductionError::DimensionIncrease(n_dims, n_features));
+        }
+
+        let projection = Proj::generate_matrix(n_features, n_dims, &mut rng)?;
+
+        Ok(RandomProjection { projection })
+    }
+}
+
+impl<Proj: ProjectionMethod, F: Float> RandomProjection<Proj, F>
+where
+    Proj::RandomDistribution: Distribution<F>,
+{
+    /// Create new parameters for a [`RandomProjection`] with default value
+    /// `eps = 0.1` and a [`Xoshiro256Plus`] RNG.
+    pub fn params() -> RandomProjectionParams<Proj, Xoshiro256Plus> {
+        RandomProjectionParams(RandomProjectionValidParams {
+            params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
+            rng: Xoshiro256Plus::seed_from_u64(42),
+            marker: PhantomData,
+        })
+    }
+
+    /// Create new parameters for a [`RandomProjection`] with default value
+    /// `eps = 0.1` and the provided [`Rng`].
+    pub fn params_with_rng<R>(rng: R) -> RandomProjectionParams<Proj, R>
+    where
+        R: Rng + Clone,
+    {
+        RandomProjectionParams(RandomProjectionValidParams {
+            params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
+            rng,
+            marker: PhantomData,
+        })
+    }
+}
+
+impl<Proj, F, D> Transformer<&ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
+where
+    Proj: ProjectionMethod,
+    F: Float,
+    D: Data<Elem = F>,
+    ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
+    Proj::RandomDistribution: Distribution<F>,
+{
+    /// Compute the embedding of a two-dimensional array
+    fn transform(&self, x: &ArrayBase<D, Ix2>) -> Array2<F> {
+        x.dot(&self.projection)
+    }
+}
+
+impl<Proj, F, D> Transformer<ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
+where
+    Proj: ProjectionMethod,
+    F: Float,
+    D: Data<Elem = F>,
+    ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
+    Proj::RandomDistribution: Distribution<F>,
+{
+    /// Compute the embedding of a two-dimensional array
+    fn transform(&self, x: ArrayBase<D, Ix2>) -> Array2<F> {
+        self.transform(&x)
+    }
+}
+
+impl<Proj, F, T> Transformer<DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T>>
+    for RandomProjection<Proj, F>
+where
+    Proj: ProjectionMethod,
+    F: Float,
+    T: AsTargets,
+    for<'a> ArrayBase<ndarray::ViewRepr<&'a F>, Ix2>:
+        Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
+    Proj::RandomDistribution: Distribution<F>,
+{
+    /// Compute the embedding of a dataset
+    ///
+    /// # Parameter
+    ///
+    /// * `data`: a dataset
+    ///
+    /// # Returns
+    ///
+    /// New dataset, with data equal to the embedding of the input data
+    fn transform(&self, data: DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T> {
+        let new_records = self.transform(data.records().view());
+
+        DatasetBase::new(new_records, data.targets)
+    }
+}
+
+impl<'a, Proj, F, L, T> Transformer<&'a DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T::View>>
+    for RandomProjection<Proj, F>
+where
+    Proj: ProjectionMethod,
+    F: Float,
+    L: 'a,
+    T: AsTargets<Elem = L> + FromTargetArray<'a>,
+    for<'b> ArrayBase<ndarray::ViewRepr<&'b F>, Ix2>:
+        Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
+    Proj::RandomDistribution: Distribution<F>,
+{
+    /// Compute the embedding of a dataset
+    ///
+    /// # Parameter
+    ///
+    /// * `data`: a dataset
+    ///
+    /// # Returns
+    ///
+    /// New dataset, with data equal to the embedding of the input data
+    fn transform(&self, data: &'a DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T::View> {
+        let new_records = self.transform(data.records().view());
+
+        DatasetBase::new(
+            new_records,
+            T::new_targets_view(AsTargets::as_targets(data)),
+        )
+    }
+}
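The four `Transformer` impls above let a fitted projection consume plain arrays or whole datasets, by value or by reference; in the by-reference case the targets of the returned dataset are a view into the original. A sketch with placeholder data:

```rust
use linfa::prelude::*;
use linfa_reduction::random_projection::SparseRandomProjection;
use ndarray::{Array1, Array2};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let records = Array2::<f64>::zeros((100, 64));
    let targets = Array1::from_elem(100, 0usize);
    let dataset = Dataset::new(records, targets);

    let proj = SparseRandomProjection::<f64>::params()
        .target_dim(16)
        .fit(&dataset)?;

    // By reference: the returned dataset's targets are a view.
    let reduced_view = proj.transform(&dataset);
    assert_eq!(reduced_view.records().ncols(), 16);

    // By value: the original targets are moved into the new dataset.
    let reduced_owned = proj.transform(dataset);
    assert_eq!(reduced_owned.records().ncols(), 16);
    Ok(())
}
```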
diff --git a/algorithms/linfa-reduction/src/random_projection/common.rs b/algorithms/linfa-reduction/src/random_projection/common.rs
new file mode 100644
index 000000000..ef8e36d56
--- /dev/null
+++ b/algorithms/linfa-reduction/src/random_projection/common.rs
@@ -0,0 +1,38 @@
+/// Compute a safe dimension for a projection with precision `eps`,
+/// using the Johnson-Lindenstrauss Lemma.
+///
+/// References:
+/// - [D. Achlioptas, JCSS](https://www.sciencedirect.com/science/article/pii/S0022000003000254)
+/// - [Li et al., SIGKDD'06](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf)
+pub(crate) fn johnson_lindenstrauss_min_dim(n_samples: usize, eps: f64) -> usize {
+    let log_samples = (n_samples as f64).ln();
+    let value = 4. * log_samples / (eps.powi(2) / 2. - eps.powi(3) / 3.);
+    value as usize
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    /// Test against values computed by the scikit-learn implementation
+    /// of `johnson_lindenstrauss_min_dim`.
+    fn test_johnson_lindenstrauss() {
+        assert_eq!(johnson_lindenstrauss_min_dim(100, 0.05), 15244);
+        assert_eq!(johnson_lindenstrauss_min_dim(100, 0.1), 3947);
+        assert_eq!(johnson_lindenstrauss_min_dim(100, 0.2), 1062);
+        assert_eq!(johnson_lindenstrauss_min_dim(100, 0.5), 221);
+        assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.05), 22867);
+        assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.1), 5920);
+        assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.2), 1594);
+        assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.5), 331);
+        assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.05), 28194);
+        assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.1), 7300);
+        assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.2), 1965);
+        assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.5), 408);
+        assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.05), 30489);
+        assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.1), 7894);
+        assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.2), 2125);
+        assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.5), 442);
+    }
+}
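As a worked instance of the bound implemented above, `k >= 4 ln(n) / (eps^2/2 - eps^3/3)`, a quick sketch reproducing one of the test values:

```rust
fn main() {
    // n = 1000 samples, eps = 0.1
    let (n_samples, eps) = (1000.0_f64, 0.1_f64);
    let k = 4.0 * n_samples.ln() / (eps.powi(2) / 2.0 - eps.powi(3) / 3.0);
    // 4 * 6.9078 / (0.005 - 0.000333...) = 27.631 / 0.0046667 ~= 5920.9,
    // truncated to 5920, matching the test above.
    assert_eq!(k as usize, 5920);
}
```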
diff --git a/algorithms/linfa-reduction/src/random_projection/hyperparams.rs b/algorithms/linfa-reduction/src/random_projection/hyperparams.rs
new file mode 100644
index 000000000..043cbcc19
--- /dev/null
+++ b/algorithms/linfa-reduction/src/random_projection/hyperparams.rs
@@ -0,0 +1,142 @@
+use std::{fmt::Debug, marker::PhantomData};
+
+use linfa::ParamGuard;
+
+use rand::Rng;
+
+use crate::ReductionError;
+
+use super::methods::ProjectionMethod;
+
+/// Random projection hyperparameters
+///
+/// The main hyperparameter of random projections is
+/// the dimension of the embedding.
+/// This dimension is usually determined by the desired precision (or distortion) `eps`,
+/// using the [Johnson-Lindenstrauss Lemma](https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma).
+/// However, this lemma makes a very conservative estimate of the required dimension
+/// and does not leverage the structure of the data; therefore it is also possible
+/// to specify the dimension of the embedding manually.
+///
+/// As this algorithm is randomized, it also accepts an [`Rng`] as a parameter,
+/// used to sample the coordinates of the projection matrix.
+pub struct RandomProjectionParams<Proj: ProjectionMethod, R: Rng + Clone>(
+    pub(crate) RandomProjectionValidParams<Proj, R>,
+);
+
+impl<Proj: ProjectionMethod, R: Rng + Clone> RandomProjectionParams<Proj, R> {
+    /// Set the dimension of the output of the embedding.
+    ///
+    /// Setting the target dimension with this function
+    /// discards the precision parameter if it had been set previously.
+    pub fn target_dim(mut self, dim: usize) -> Self {
+        self.0.params = RandomProjectionParamsInner::Dimension { target_dim: dim };
+
+        self
+    }
+
+    /// Set the precision parameter (distortion, `eps`) of the embedding.
+    ///
+    /// Setting `eps` with this function
+    /// discards the target dimension parameter if it had been set previously.
+    pub fn eps(mut self, eps: f64) -> Self {
+        self.0.params = RandomProjectionParamsInner::Epsilon { eps };
+
+        self
+    }
+
+    /// Specify the random number generator to use to generate the projection matrix.
+    pub fn with_rng<R2: Rng + Clone>(self, rng: R2) -> RandomProjectionParams<Proj, R2> {
+        RandomProjectionParams(RandomProjectionValidParams {
+            params: self.0.params,
+            rng,
+            marker: PhantomData,
+        })
+    }
+}
+
+/// Random projection hyperparameters
+///
+/// The main hyperparameter of random projections is
+/// the dimension of the embedding.
+/// This dimension is usually determined by the desired precision (or distortion) `eps`,
+/// using the [Johnson-Lindenstrauss Lemma](https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma).
+/// However, this lemma makes a very conservative estimate of the required dimension
+/// and does not leverage the structure of the data; therefore it is also possible
+/// to specify the dimension of the embedding manually.
+///
+/// As this algorithm is randomized, it also accepts an [`Rng`] as a parameter,
+/// used to sample the coordinates of the projection matrix.
+#[derive(Debug, Clone, PartialEq)]
+pub struct RandomProjectionValidParams<Proj: ProjectionMethod, R: Rng + Clone> {
+    pub(super) params: RandomProjectionParamsInner,
+    pub(super) rng: R,
+    pub(crate) marker: PhantomData<Proj>,
+}
+
+/// Internal data structure that holds either the target dimension of the embedding,
+/// or the precision, which can be used later to compute the dimension
+/// (see [super::common::johnson_lindenstrauss_min_dim]).
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) enum RandomProjectionParamsInner {
+    Dimension { target_dim: usize },
+    Epsilon { eps: f64 },
+}
+
+impl RandomProjectionParamsInner {
+    fn target_dim(&self) -> Option<usize> {
+        use RandomProjectionParamsInner::*;
+        match self {
+            Dimension { target_dim } => Some(*target_dim),
+            Epsilon { .. } => None,
+        }
+    }
+
+    fn eps(&self) -> Option<f64> {
+        use RandomProjectionParamsInner::*;
+        match self {
+            Dimension { .. } => None,
+            Epsilon { eps } => Some(*eps),
+        }
+    }
+}
+
+impl<Proj: ProjectionMethod, R: Rng + Clone> RandomProjectionValidParams<Proj, R> {
+    pub fn target_dim(&self) -> Option<usize> {
+        self.params.target_dim()
+    }
+
+    pub fn eps(&self) -> Option<f64> {
+        self.params.eps()
+    }
+
+    pub fn rng(&self) -> &R {
+        &self.rng
+    }
+}
+
+impl<Proj: ProjectionMethod, R: Rng + Clone> ParamGuard for RandomProjectionParams<Proj, R> {
+    type Checked = RandomProjectionValidParams<Proj, R>;
+    type Error = ReductionError;
+
+    fn check_ref(&self) -> Result<&Self::Checked, Self::Error> {
+        match self.0.params {
+            RandomProjectionParamsInner::Dimension { target_dim } => {
+                if target_dim == 0 {
+                    return Err(ReductionError::NonPositiveEmbeddingSize);
+                }
+            }
+            RandomProjectionParamsInner::Epsilon { eps } => {
+                if eps <= 0. || eps >= 1. {
+                    return Err(ReductionError::InvalidPrecision);
+                }
+            }
+        };
+        Ok(&self.0)
+    }
+
+    fn check(self) -> Result<Self::Checked, Self::Error> {
+        self.check_ref()?;
+        Ok(self.0)
+    }
+}
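Parameter validation runs through linfa's `ParamGuard`, so invalid settings surface as typed errors rather than panics. A sketch exercising both checks via `check()`:

```rust
use linfa::ParamGuard;
use linfa_reduction::{random_projection::GaussianRandomProjection, ReductionError};

fn main() {
    // eps outside the open interval (0, 1) is rejected...
    let res = GaussianRandomProjection::<f32>::params().eps(1.5).check();
    assert!(matches!(res, Err(ReductionError::InvalidPrecision)));

    // ...and so is a zero target dimension.
    let res = GaussianRandomProjection::<f32>::params().target_dim(0).check();
    assert!(matches!(res, Err(ReductionError::NonPositiveEmbeddingSize)));
}
```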
diff --git a/algorithms/linfa-reduction/src/random_projection/methods.rs b/algorithms/linfa-reduction/src/random_projection/methods.rs
new file mode 100644
index 000000000..cba33aaae
--- /dev/null
+++ b/algorithms/linfa-reduction/src/random_projection/methods.rs
@@ -0,0 +1,117 @@
+use linfa::Float;
+use ndarray::{Array, Array2};
+use ndarray_rand::{
+    rand_distr::{Normal, StandardNormal},
+    RandomExt,
+};
+use rand::{
+    distributions::{Bernoulli, Distribution, Standard},
+    Rng,
+};
+use sprs::{CsMat, TriMat};
+
+use crate::ReductionError;
+
+pub trait ProjectionMethod {
+    type RandomDistribution;
+    type ProjectionMatrix<F: Float>
+    where
+        Self::RandomDistribution: Distribution<F>;
+
+    fn generate_matrix<F: Float>(
+        n_features: usize,
+        n_dims: usize,
+        rng: &mut impl Rng,
+    ) -> Result<Self::ProjectionMatrix<F>, ReductionError>
+    where
+        Self::RandomDistribution: Distribution<F>;
+}
+
+pub struct Gaussian;
+
+impl ProjectionMethod for Gaussian {
+    type RandomDistribution = StandardNormal;
+    type ProjectionMatrix<F: Float> = Array2<F> where StandardNormal: Distribution<F>;
+
+    fn generate_matrix<F: Float>(
+        n_features: usize,
+        n_dims: usize,
+        rng: &mut impl Rng,
+    ) -> Result<Self::ProjectionMatrix<F>, ReductionError>
+    where
+        StandardNormal: Distribution<F>,
+    {
+        let std_dev = F::cast(n_features).sqrt().recip();
+        let gaussian = Normal::new(F::zero(), std_dev)?;
+
+        Ok(Array::random_using((n_features, n_dims), gaussian, rng))
+    }
+}
+
+pub struct Sparse;
+
+impl ProjectionMethod for Sparse {
+    type RandomDistribution = Standard;
+    type ProjectionMatrix<F: Float> = CsMat<F> where Standard: Distribution<F>;
+
+    fn generate_matrix<F: Float>(
+        n_features: usize,
+        n_dims: usize,
+        rng: &mut impl Rng,
+    ) -> Result<Self::ProjectionMatrix<F>, ReductionError>
+    where
+        Standard: Distribution<F>,
+    {
+        let scale = (n_features as f64).sqrt();
+        let p = 1f64 / scale;
+        let dist = SparseDistribution::new(F::cast(scale), p);
+
+        let (mut row_inds, mut col_inds, mut values) = (Vec::new(), Vec::new(), Vec::new());
+        for row in 0..n_features {
+            for col in 0..n_dims {
+                if let Some(x) = dist.sample(rng) {
+                    row_inds.push(row);
+                    col_inds.push(col);
+                    values.push(x);
+                }
+            }
+        }
+
+        // `proj` will be used as the RHS of a matrix multiplication in [`SparseRandomProjection::transform`],
+        // therefore we convert it to `csc` storage.
+        let proj =
+            TriMat::from_triplets((n_features, n_dims), row_inds, col_inds, values).to_csc();
+
+        Ok(proj)
+    }
+}
+
+/// Random variable that has value `Some(+/- scale)` with probability `p/2` each,
+/// and [`None`] with probability `1-p`.
+struct SparseDistribution<F: Float> {
+    scale: F,
+    b: Bernoulli,
+}
+
+impl<F: Float> SparseDistribution<F> {
+    pub fn new(scale: F, p: f64) -> Self {
+        SparseDistribution {
+            scale,
+            b: Bernoulli::new(p).expect("Parameter `p` must be between 0 and 1."),
+        }
+    }
+}
+
+impl<F: Float> Distribution<Option<F>> for SparseDistribution<F> {
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Option<F> {
+        let non_zero = self.b.sample(rng);
+        if non_zero {
+            if rng.gen::<bool>() {
+                Some(self.scale)
+            } else {
+                Some(-self.scale)
+            }
+        } else {
+            None
+        }
+    }
+}
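`Sparse::generate_matrix` implements the Achlioptas / Li et al. scheme: each entry is `±sqrt(n_features)` with probability `1/(2*sqrt(n_features))` each and zero otherwise, so roughly one entry in `sqrt(n_features)` is non-zero. A standalone sketch of that sampling rule using only `rand` (the `SparseDistribution` type itself is private to the module):

```rust
use rand::{
    distributions::{Bernoulli, Distribution},
    Rng, SeedableRng,
};
use rand_xoshiro::Xoshiro256Plus;

fn main() {
    // With n_features = 10_000: scale = 100.0, so expected density p = 1%.
    let n_features = 10_000;
    let scale = (n_features as f64).sqrt();
    let p = 1.0 / scale;
    let bernoulli = Bernoulli::new(p).unwrap();
    let mut rng = Xoshiro256Plus::seed_from_u64(42);

    // Empirically check that about p of the sampled entries are non-zero.
    let samples = 1_000_000;
    let non_zero = (0..samples).filter(|_| bernoulli.sample(&mut rng)).count();
    let density = non_zero as f64 / samples as f64;
    assert!((density - p).abs() < 1e-3);

    // A non-zero entry's sign is then chosen with a fair coin flip.
    let _sign: bool = rng.gen();
}
```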
diff --git a/algorithms/linfa-reduction/src/random_projection/mod.rs b/algorithms/linfa-reduction/src/random_projection/mod.rs
new file mode 100644
index 000000000..777ff8886
--- /dev/null
+++ b/algorithms/linfa-reduction/src/random_projection/mod.rs
@@ -0,0 +1,87 @@
+//! # Random Projections
+//!
+//! These algorithms build a low-distortion embedding of the input data
+//! in a low-dimensional Euclidean space by projecting the data onto a random subspace.
+//! The projection is a randomly chosen matrix (either Gaussian or sparse),
+//! with dimension chosen according to the
+//! [Johnson-Lindenstrauss Lemma](https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma).
+//!
+//! This result states that, if the dimension of the embedding is `Ω(log(n_samples)/eps^2)`,
+//! then with high probability, the projection `p` has distortion less than `eps`,
+//! where `eps` is a parameter, with `0 < eps < 1`.
+//! "Distortion less than `eps`" means that for all vectors `u, v` in the original dataset,
+//! we have `(1 - eps) d(u, v) <= d(p(u), p(v)) <= (1 + eps) d(u, v)`,
+//! where `d` denotes the distance between two vectors.
+//!
+//! Note that the dimension of the embedding does not
+//! depend on the original dimension of the data set (the number of features).
+//!
+//! ## Comparison with other methods
+//!
+//! To obtain a given accuracy on a given task, random projections will
+//! often require a larger embedding dimension than other reduction methods such as PCA.
+//! However, random projections have a very low computational cost,
+//! since they only require sampling a random matrix,
+//! whereas PCA requires computing the singular value decomposition of a large matrix,
+//! which is computationally expensive.
+mod algorithms;
+mod common;
+mod hyperparams;
+mod methods;
+
+pub use algorithms::RandomProjection;
+pub use hyperparams::{RandomProjectionParams, RandomProjectionValidParams};
+
+use self::methods::{Gaussian, Sparse};
+
+pub type GaussianRandomProjection<F> = RandomProjection<Gaussian, F>;
+pub type GaussianRandomProjectionParams<R> = RandomProjectionParams<Gaussian, R>;
+pub type GaussianRandomProjectionValidParams<R> = RandomProjectionValidParams<Gaussian, R>;
+pub type SparseRandomProjection<F> = RandomProjection<Sparse, F>;
+pub type SparseRandomProjectionParams<R> = RandomProjectionParams<Sparse, R>;
+pub type SparseRandomProjectionValidParams<R> = RandomProjectionValidParams<Sparse, R>;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use rand_xoshiro::Xoshiro256Plus;
+
+    #[test]
+    fn autotraits_gaussian() {
+        fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
+        has_autotraits::<GaussianRandomProjection<f32>>();
+        has_autotraits::<GaussianRandomProjection<f64>>();
+        has_autotraits::<GaussianRandomProjectionParams<Xoshiro256Plus>>();
+        has_autotraits::<GaussianRandomProjectionValidParams<Xoshiro256Plus>>();
+    }
+
+    #[test]
+    fn autotraits_sparse() {
+        fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
+        has_autotraits::<SparseRandomProjection<f32>>();
+        has_autotraits::<SparseRandomProjection<f64>>();
+        has_autotraits::<SparseRandomProjectionParams<Xoshiro256Plus>>();
+        has_autotraits::<SparseRandomProjectionValidParams<Xoshiro256Plus>>();
+    }
+
+    use linfa::{traits::Fit, Dataset};
+
+    #[test]
+    fn gaussian_dim_increase_error() {
+        let records = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.]];
+        let dataset = Dataset::from(records);
+        let res = GaussianRandomProjection::<f32>::params()
+            .target_dim(10)
+            .fit(&dataset);
+        assert!(res.is_err())
+    }
+
+    #[test]
+    fn sparse_dim_increase_error() {
+        let records = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.]];
+        let dataset = Dataset::from(records);
+        let res = SparseRandomProjection::<f32>::params()
+            .target_dim(10)
+            .fit(&dataset);
+        assert!(res.is_err())
+    }
+}
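To close the loop on the distortion guarantee described in the module docs, here is a sketch that spot-checks the pairwise distance ratio after a Gaussian projection. Data is random, `ndarray-rand` is assumed available as a dev-dependency, and since the guarantee is probabilistic the sketch prints rather than asserts:

```rust
use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Array2;
use ndarray_rand::{rand_distr::StandardNormal, RandomExt};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut rng = Xoshiro256Plus::seed_from_u64(7);
    // 30 random points in 2000 dimensions.
    let data: Array2<f64> = Array2::random_using((30, 2000), StandardNormal, &mut rng);
    let dataset = Dataset::from(data.clone());

    // eps = 0.5 with 30 samples yields a 163-dimensional embedding.
    let eps = 0.5;
    let proj = GaussianRandomProjection::<f64>::params_with_rng(rng)
        .eps(eps)
        .fit(&dataset)?;
    let reduced = proj.transform(&data);

    // Spot-check the distortion of one pair of points.
    let (a, b) = (data.row(0), data.row(1));
    let (pa, pb) = (reduced.row(0), reduced.row(1));
    let d = (&a - &b).mapv(|x| x * x).sum().sqrt();
    let dp = (&pa - &pb).mapv(|x| x * x).sum().sqrt();
    // With high probability, this ratio lies in [1 - eps, 1 + eps].
    println!("distance ratio after projection: {:.3}", dp / d);
    Ok(())
}
```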