From 30d62d2917a9dcc3edc2b116c79837a515a6cce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Lafage?= Date: Fri, 19 Jan 2024 11:58:32 +0100 Subject: [PATCH] Implement sparse gaussian process methods (#128) * Implement sparse FITC method * Cleanup * Linting * Fix comment * Refactor documentation * Add noise estimation or noise constant option * Add Eq, PartialEq * Remove mean model from SGP as zero trend is always used * Fix test tolerance * Implement random inducing points * Implement VFE sparse method * Add relevant initial guess and bounds * Fix doc * Improve doc * Remove generic mean arg unused as it is always Constant * Add SGP example * Update README * Use same optimize_theta as GP in SGP * Renaming * Renaming in CI * Relax noise estimation tolerance * Relax test tolerance * Fix bad test in MOE: do not test derivatives on hard recombination --- .github/workflows/test.yml | 2 +- README.md | 12 +- gp/src/algorithm.rs | 157 +++++- gp/src/correlation_models.rs | 9 +- gp/src/errors.rs | 2 +- gp/src/lib.rs | 95 +--- gp/src/mean_models.rs | 6 +- gp/src/parameters.rs | 10 +- gp/src/sgp_algorithm.rs | 910 +++++++++++++++++++++++++++++++++++ gp/src/sgp_parameters.rs | 230 +++++++++ moe/src/algorithm.rs | 47 +- 11 files changed, 1325 insertions(+), 155 deletions(-) create mode 100644 gp/src/sgp_algorithm.rs create mode 100644 gp/src/sgp_parameters.rs diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a6586df0..075c618c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -48,7 +48,7 @@ jobs: args: --all --release test-features: - name: testing-with-blas-${{ matrix.toolchain }}-${{ matrix.os }} + name: testing-features-${{ matrix.toolchain }}-${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/README.md b/README.md index e839c7a3..31297c86 100644 --- a/README.md +++ b/README.md @@ -27,12 +27,12 @@ and mixture of Gaussian processes surrogate model. `egobox` Rust libraries consists of the following sub-packages. 
-| Name | Version | Documentation | Description | -| :---------------------------------------------------- | :---------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------- | :------------------------------------------------------------------------------ | -| [doe](https://github.com/relf/egobox/tree/master/doe) | [![crates.io](https://img.shields.io/crates/v/egobox-doe)](https://crates.io/crates/egobox-doe) | [![docs](https://docs.rs/egobox-doe/badge.svg)](https://docs.rs/egobox-doe) | sampling methods; contains LHS, FullFactorial, Random methods | -| [gp](https://github.com/relf/egobox/tree/master/gp) | [![crates.io](https://img.shields.io/crates/v/egobox-gp)](https://crates.io/crates/egobox-gp) | [![docs](https://docs.rs/egobox-gp/badge.svg)](https://docs.rs/egobox-gp) | gaussian process regression; contains Kriging and PLS dimension reduction | -| [moe](https://github.com/relf/egobox/tree/master/moe) | [![crates.io](https://img.shields.io/crates/v/egobox-moe)](https://crates.io/crates/egobox-moe) | [![docs](https://docs.rs/egobox-moe/badge.svg)](https://docs.rs/egobox-moe) | mixture of experts using GP models | -| [ego](https://github.com/relf/egobox/tree/master/ego) | [![crates.io](https://img.shields.io/crates/v/egobox-ego)](https://crates.io/crates/egobox-ego) | [![docs](https://docs.rs/egobox-ego/badge.svg)](https://docs.rs/egobox-ego) | efficient global optimization with basic constraints and mixed integer handling | +| Name | Version | Documentation | Description | +| :---------------------------------------------------- | :---------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------- | :---------------------------------------------------------------------------------------- | +| [doe](https://github.com/relf/egobox/tree/master/doe) | [![crates.io](https://img.shields.io/crates/v/egobox-doe)](https://crates.io/crates/egobox-doe) | [![docs](https://docs.rs/egobox-doe/badge.svg)](https://docs.rs/egobox-doe) | sampling methods; contains LHS, FullFactorial, Random methods | +| [gp](https://github.com/relf/egobox/tree/master/gp) | [![crates.io](https://img.shields.io/crates/v/egobox-gp)](https://crates.io/crates/egobox-gp) | [![docs](https://docs.rs/egobox-gp/badge.svg)](https://docs.rs/egobox-gp) | gaussian process regression; contains Kriging, PLS dimension reduction and sparse methods | +| [moe](https://github.com/relf/egobox/tree/master/moe) | [![crates.io](https://img.shields.io/crates/v/egobox-moe)](https://crates.io/crates/egobox-moe) | [![docs](https://docs.rs/egobox-moe/badge.svg)](https://docs.rs/egobox-moe) | mixture of experts using GP models | +| [ego](https://github.com/relf/egobox/tree/master/ego) | [![crates.io](https://img.shields.io/crates/v/egobox-ego)](https://crates.io/crates/egobox-ego) | [![docs](https://docs.rs/egobox-ego/badge.svg)](https://docs.rs/egobox-ego) | efficient global optimization with constraints and mixed integer handling | ### Usage diff --git a/gp/src/algorithm.rs b/gp/src/algorithm.rs index c72c0396..af5efbd8 100644 --- a/gp/src/algorithm.rs +++ b/gp/src/algorithm.rs @@ -65,7 +65,92 @@ impl Clone for GpInnerParams { } } -/// Structure for trained Gaussian Process model +/// A GP regression is an interpolation method where the +/// interpolated values are modeled by a Gaussian process with a mean and +/// governed by a prior 
covariance kernel, which depends on some
+/// parameters to be determined.
+///
+/// The interpolated output is modeled as a stochastic process as follows:
+///
+/// `Y(x) = mu(x) + Z(x)`
+///
+/// where:
+/// * `mu(x)` is the trend i.e. the mean of the gaussian process
+/// * `Z(x)` the realization of a stochastic gaussian process ~ `Normal(0, sigma^2)`
+///
+/// which in turn is written as:
+///
+/// `Y(x) = betas.regr(x) + sigma^2*corr(x, x')`
+///
+/// where:
+/// * `betas` is a vector of linear regression parameters to be determined
+/// * `regr(x)` a vector of polynomial basis functions
+/// * `sigma^2` is the process variance
+/// * `corr(x, x')` is a correlation function which depends on `distance(x, x')`
+/// and a set of unknown parameters `thetas` to be determined.
+///
+/// # Implementation
+///
+/// * Based on [ndarray](https://github.com/rust-ndarray/ndarray)
+/// and [linfa](https://github.com/rust-ml/linfa) and strives to follow [linfa guidelines](https://github.com/rust-ml/linfa/blob/master/CONTRIBUTE.md)
+/// * GP mean model can be constant, linear or quadratic
+/// * GP correlation model can be built with the following kernels: squared exponential, absolute exponential, matern 3/2, matern 5/2
+/// cf. [SMT Kriging](https://smt.readthedocs.io/en/latest/_src_docs/surrogate_models/krg.html)
+/// * For high dimensional problems, the classic GP algorithm does not perform well as
+/// it depends on the inversion of a correlation (n, n) matrix which is an O(n^3) operation.
+/// To work around this problem the library implements dimension reduction using
+/// the Partial Least Squares method on top of the Kriging method, also known as the KPLS algorithm (see Reference)
+/// * GP models can be saved and loaded using [serde](https://serde.rs/).
+/// See `serializable` feature section below.
+///
+/// # Features
+///
+/// ## serializable
+///
+/// The `serializable` feature enables the serialization of GP models using the [`serde crate`](https://serde.rs/).
+///
+/// ## blas
+///
+/// The `blas` feature enables the use of the BLAS/LAPACK linear algebra backend available with [`ndarray-linalg`](https://github.com/rust-ndarray/ndarray-linalg).
+///
+/// # Example
+///
+/// ```no_run
+/// use egobox_gp::{correlation_models::*, mean_models::*, GaussianProcess};
+/// use linfa::prelude::*;
+/// use ndarray::{arr2, concatenate, Array, Array2, Axis};
+///
+/// // one-dimensional test function to approximate
+/// fn xsinx(x: &Array2<f64>) -> Array2<f64> {
+///     (x - 3.5) * ((x - 3.5) / std::f64::consts::PI).mapv(|v| v.sin())
+/// }
+///
+/// // training data
+/// let xt = arr2(&[[0.0], [5.0], [10.0], [15.0], [18.0], [20.0], [25.0]]);
+/// let yt = xsinx(&xt);
+///
+/// // GP with constant mean model and squared exponential correlation model
+/// // i.e. Ordinary Kriging model
+/// let kriging = GaussianProcess::<f64, ConstantMean, SquaredExponentialCorr>::params(
+///     ConstantMean::default(),
+///     SquaredExponentialCorr::default())
+///     .fit(&Dataset::new(xt, yt))
+///     .expect("Kriging trained");
+///
+/// // Use trained model for making predictions
+/// let xtest = Array::linspace(0., 25., 26).insert_axis(Axis(1));
+/// let ytest = xsinx(&xtest);
+///
+/// let ypred = kriging.predict_values(&xtest).expect("Kriging prediction");
+/// let yvariances = kriging.predict_variances(&xtest).expect("Kriging prediction");
+///```
+///
+/// # Reference:
+///
+/// Bouhlel, Mohamed Amine, et al. 
[Improving kriging surrogates of high-dimensional design +/// models by Partial Least Squares dimension reduction](https://hal.archives-ouvertes.fr/hal-01232938/document) +/// Structural and Multidisciplinary Optimization 53.5 (2016): 935-952. +/// #[derive(Debug)] #[cfg_attr( feature = "serializable", @@ -747,10 +832,13 @@ impl, Corr: CorrelationModel, D: Data, Corr: CorrelationModel, D: Data(objfn: ObjF, theta0: &Array1) -> (Array1, f64) +pub(crate) fn optimize_params( + objfn: ObjF, + param0: &Array1, + bounds: &[(F, F)], +) -> (Array1, f64) where ObjF: Fn(&[f64], Option<&mut [f64]>, &mut ()) -> f64, F: Float, @@ -777,36 +869,44 @@ where let base: f64 = 10.; // block to drop optimizer and allow self.corr borrowing after - let mut optimizer = Nlopt::new(Algorithm::Cobyla, theta0.len(), objfn, Target::Minimize, ()); - let mut theta_vec = theta0 + let mut optimizer = Nlopt::new(Algorithm::Cobyla, param0.len(), objfn, Target::Minimize, ()); + let mut param = param0 .map(|v| unsafe { *(v as *const F as *const f64) }) .into_raw_vec(); - optimizer.set_lower_bound(-6.).unwrap(); - optimizer.set_upper_bound(2.).unwrap(); + + let lower_bounds = bounds.iter().map(|b| into_f64(&b.0)).collect::>(); + optimizer.set_lower_bounds(&lower_bounds).unwrap(); + let upper_bounds = bounds.iter().map(|b| into_f64(&b.1)).collect::>(); + optimizer.set_upper_bounds(&upper_bounds).unwrap(); + optimizer.set_initial_step1(0.5).unwrap(); - optimizer.set_maxeval(15 * theta0.len() as u32).unwrap(); + optimizer.set_maxeval(15 * param0.len() as u32).unwrap(); optimizer.set_ftol_rel(1e-4).unwrap(); - match optimizer.optimize(&mut theta_vec) { + match optimizer.optimize(&mut param) { Ok((_, fmin)) => { - let thetas_opt = arr1(&theta_vec).mapv(|v| base.powf(v)); + let params_opt = arr1(¶m); let fval = if f64::is_nan(fmin) { f64::INFINITY } else { fmin }; - (thetas_opt, fval) + (params_opt, fval) } Err(_e) => { // println!("ERROR OPTIM in GP err={:?}", e); - (arr1(&theta_vec).mapv(|v| base.powf(v)), f64::INFINITY) + (arr1(¶m).mapv(|v| base.powf(v)), f64::INFINITY) } } } -/// Optimize gp hyper parameter theta given an initial guess `theta0` +/// Optimize gp hyper parameters given an initial guess and bounds with cobyla #[cfg(not(feature = "nlopt"))] -fn optimize_theta(objfn: ObjF, theta0: &Array1) -> (Array1, f64) +pub(crate) fn optimize_params( + objfn: ObjF, + param0: &Array1, + bounds: &[(F, F)], +) -> (Array1, f64) where ObjF: Fn(&[f64], Option<&mut [f64]>, &mut ()) -> f64, F: Float, @@ -815,18 +915,20 @@ where let base: f64 = 10.; let cons: Vec<&dyn Func<()>> = vec![]; - let theta_init = theta0 - .map(|v| unsafe { *(v as *const F as *const f64) }) - .into_raw_vec(); + let param0 = param0.map(|v| into_f64(v)).into_raw_vec(); let initial_step = 0.5; let ftol_rel = 1e-4; - let maxeval = 15 * theta0.len(); - let bounds = vec![(-6., 2.); theta0.len()]; + let maxeval = 15 * param0.len(); + + let bounds: Vec<_> = bounds + .iter() + .map(|(lo, up)| (into_f64(lo), into_f64(up))) + .collect(); match minimize( |x, u| objfn(x, None, u), - &theta_init, + ¶m0, &bounds, &cons, (), @@ -838,13 +940,13 @@ where }), ) { Ok((_, x_opt, fval)) => { - let thetas_opt = arr1(&x_opt).mapv(|v| base.powf(v)); + let params_opt = arr1(&x_opt); let fval = if f64::is_nan(fval) { f64::INFINITY } else { fval }; - (thetas_opt, fval) + (params_opt, fval) } Err((status, x_opt, _)) => { println!("ERROR Cobyla optimizer in GP status={:?}", status); @@ -853,6 +955,11 @@ where } } +#[inline(always)] +fn into_f64(v: &F) -> f64 { + unsafe { *(v as *const F as 
*const f64) } +} + /// Compute reduced likelihood function /// fx: mean factors term at x samples, /// rxx: correlation factors at x samples, diff --git a/gp/src/correlation_models.rs b/gp/src/correlation_models.rs index a6b468c4..871ba5c9 100644 --- a/gp/src/correlation_models.rs +++ b/gp/src/correlation_models.rs @@ -42,7 +42,7 @@ pub trait CorrelationModel: Clone + Copy + Default + fmt::Display { } /// Squared exponential correlation models -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] #[cfg_attr( feature = "serializable", derive(Serialize, Deserialize), @@ -108,7 +108,7 @@ impl fmt::Display for SquaredExponentialCorr { } /// Absolute exponential correlation models -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] #[cfg_attr( feature = "serializable", derive(Serialize, Deserialize), @@ -175,8 +175,7 @@ impl fmt::Display for AbsoluteExponentialCorr { } /// Matern 3/2 correlation model - -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] #[cfg_attr( feature = "serializable", derive(Serialize, Deserialize), @@ -315,7 +314,7 @@ impl fmt::Display for Matern32Corr { } /// Matern 5/2 correlation model -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] #[cfg_attr( feature = "serializable", derive(Serialize, Deserialize), diff --git a/gp/src/errors.rs b/gp/src/errors.rs index 5a546b36..0f0ca4aa 100644 --- a/gp/src/errors.rs +++ b/gp/src/errors.rs @@ -3,7 +3,7 @@ use thiserror::Error; /// A result type for GP regression algorithm pub type Result = std::result::Result; -/// An error when modeling a GMM algorithm +/// An error when using [`GaussianProcess`](crate::GaussianProcess) or a [`SparseGaussianProcess`](crate::SparseGaussianProcess) algorithm #[derive(Error, Debug)] pub enum GpError { /// When LikelihoodComputation computation fails diff --git a/gp/src/lib.rs b/gp/src/lib.rs index a50f7dc9..683ab528 100644 --- a/gp/src/lib.rs +++ b/gp/src/lib.rs @@ -2,95 +2,26 @@ //! also known as [Kriging](https://en.wikipedia.org/wiki/Kriging) models, //! it is a port of [SMT Kriging and KPLS surrogate models](https://smt.readthedocs.io/en/latest/_src_docs/surrogate_models.html). //! -//! A GP regression is an interpolation method where the -//! interpolated values are modeled by a Gaussian process with a mean -//! governed by a prior covariance kernel, which depends on some -//! parameters to be determined. -//! -//! The interpolated output is modeled as stochastic process as follows: -//! -//! `Y(x) = mu(x) + Z(x)` -//! -//! where: -//! * `mu(x)` is the trend i.e. the mean of the gaussian process -//! * `Z(x)` the realization of stochastic gaussian process ~ `Normal(0, sigma^2)` -//! -//! which in turn is written as: -//! -//! `Y(x) = betas.regr(x) + sigma^2*corr(x, x')` -//! -//! where: -//! * `betas` is a vector of linear regression parameters to be determined -//! * `regr(x)` a vector of polynomial basis functions -//! * `sigma^2` is the process variance -//! * `corr(x, x')` is a correlation function which depends on `distance(x, x')` -//! and a set of unknown parameters `thetas` to be determined. -//! -//! # Implementation -//! -//! * Based on [ndarray](https://github.com/rust-ndarray/ndarray) -//! and [linfa](https://github.com/rust-ml/linfa) and strive to follow [linfa guidelines](https://github.com/rust-ml/linfa/blob/master/CONTRIBUTE.md) -//! * GP mean model can be constant, linear or quadratic -//! 
* GP correlation model can be build the following kernels: squared exponential, absolute exponential, matern 3/2, matern 5/2 -//! cf. [SMT Kriging](https://smt.readthedocs.io/en/latest/_src_docs/surrogate_models/krg.html) -//! * For high dimensional problems, the classic GP algorithm does not perform well as -//! it depends on the inversion of a correlation (n, n) matrix which is an O(n3) operation. -//! To work around this problem the library implements dimension reduction using -//! Partial Least Squares method upon Kriging method also known as KPLS algorithm (see Reference) -//! * GP models can be saved and loaded using [serde](https://serde.rs/). -//! See `serializable` feature section below. -//! -//! # Features -//! -//! ## serializable -//! -//! The `serializable` feature enables the serialization of GP models using the [serde crate](https://serde.rs/). -//! -//! # Example -//! -//! ```no_run -//! use egobox_gp::{correlation_models::*, mean_models::*, GaussianProcess}; -//! use linfa::prelude::*; -//! use ndarray::{arr2, concatenate, Array, Array2, Axis}; -//! -//! // one-dimensional test function to approximate -//! fn xsinx(x: &Array2) -> Array2 { -//! (x - 3.5) * ((x - 3.5) / std::f64::consts::PI).mapv(|v| v.sin()) -//! } -//! -//! // training data -//! let xt = arr2(&[[0.0], [5.0], [10.0], [15.0], [18.0], [20.0], [25.0]]); -//! let yt = xsinx(&xt); -//! -//! // GP with constant mean model and squared exponential correlation model -//! // i.e. Oridinary Kriging model -//! let kriging = GaussianProcess::::params( -//! ConstantMean::default(), -//! SquaredExponentialCorr::default()) -//! .fit(&Dataset::new(xt, yt)) -//! .expect("Kriging trained"); -//! -//! // Use trained model for making predictions -//! let xtest = Array::linspace(0., 25., 26).insert_axis(Axis(1)); -//! let ytest = xsinx(&xtest); -//! -//! let ypred = kriging.predict_values(&xtest).expect("Kriging prediction"); -//! let yvariances = kriging.predict_variances(&xtest).expect("Kriging prediction"); -//!``` -//! -//! # Reference: -//! -//! Bouhlel, Mohamed Amine, et al. [Improving kriging surrogates of high-dimensional design -//! models by Partial Least Squares dimension reduction](https://hal.archives-ouvertes.fr/hal-01232938/document) -//! Structural and Multidisciplinary Optimization 53.5 (2016): 935-952. -//! +//! It also implements Sparse Gaussian Processes methods (SGPs) which address limitations of Gaussian Processes (GPs) +//! when the number of training points is large. Indeed the complexity of GPs algorithm is in O(N^3) in processing +//! time and O(N^2) in memory where N is the number of training points. The complexity is then respectively reduced +//! to O(N.M^2) and O(NM) where M is the number of so-called inducing points with M < N. +//! +//! GP methods are implemented by [GaussianProcess] parameterized by [GpParams]. +//! +//! SGP methods are implemented by [SparseGaussianProcess] parameterized by [SgpParams]. 
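+//!
+//! A minimal usage sketch of the sparse entry point (the training data, the number of
+//! inducing points and the expected outputs below are illustrative only, assuming the
+//! `SparseKriging` shortcut and the builder methods introduced in this crate):
+//!
+//! ```no_run
+//! use egobox_gp::SparseKriging;
+//! use linfa::prelude::*;
+//! use ndarray::{Array, Array2, Axis};
+//!
+//! // 1D training data (illustrative)
+//! let xt = Array::linspace(0., 1., 100).insert_axis(Axis(1));
+//! let yt: Array2<f64> = xt.mapv(|v| (6. * v).sin());
+//!
+//! // Sparse GP with 10 inducing points picked randomly in the training set
+//! let sgp = SparseKriging::params()
+//!     .n_inducings(10)
+//!     .fit(&Dataset::new(xt.clone(), yt))
+//!     .expect("SGP fitted");
+//! let preds = sgp.predict_values(&xt).expect("SGP prediction");
+//! ```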
mod algorithm; pub mod correlation_models; mod errors; pub mod mean_models; +mod sgp_algorithm; + mod parameters; +mod sgp_parameters; mod utils; pub use algorithm::*; pub use errors::*; pub use parameters::*; +pub use sgp_algorithm::*; +pub use sgp_parameters::*; diff --git a/gp/src/mean_models.rs b/gp/src/mean_models.rs index ecaba1d2..781ab163 100644 --- a/gp/src/mean_models.rs +++ b/gp/src/mean_models.rs @@ -27,7 +27,7 @@ pub trait RegressionModel: Clone + Copy + Default + fmt::Display { } /// A constant function as mean of the GP -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] #[cfg_attr( feature = "serializable", derive(Serialize, Deserialize), @@ -53,7 +53,7 @@ impl RegressionModel for ConstantMean { } /// An affine function as mean of the GP -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] #[cfg_attr( feature = "serializable", derive(Serialize, Deserialize), @@ -82,7 +82,7 @@ impl RegressionModel for LinearMean { } /// A 2-degree polynomial as mean of the GP -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] #[cfg_attr( feature = "serializable", derive(Serialize, Deserialize), diff --git a/gp/src/parameters.rs b/gp/src/parameters.rs index b1068bef..64e6578d 100644 --- a/gp/src/parameters.rs +++ b/gp/src/parameters.rs @@ -7,15 +7,15 @@ use linfa::{Float, ParamGuard}; #[derive(Clone, Debug, PartialEq, Eq)] pub struct GpValidParams, Corr: CorrelationModel> { /// Parameter of the autocorrelation model - theta: Option>, + pub(crate) theta: Option>, /// Regression model representing the mean(x) - mean: Mean, + pub(crate) mean: Mean, /// Correlation model representing the spatial correlation between errors at e(x) and e(x') - corr: Corr, + pub(crate) corr: Corr, /// Optionally apply dimension reduction (KPLS) or not - kpls_dim: Option, + pub(crate) kpls_dim: Option, /// Parameter to improve numerical stability - nugget: F, + pub(crate) nugget: F, } impl Default for GpValidParams { diff --git a/gp/src/sgp_algorithm.rs b/gp/src/sgp_algorithm.rs new file mode 100644 index 00000000..2e1749a8 --- /dev/null +++ b/gp/src/sgp_algorithm.rs @@ -0,0 +1,910 @@ +use crate::algorithm::optimize_params; +use crate::errors::{GpError, Result}; +use crate::sgp_parameters::{ + Inducings, SgpParams, SgpValidParams, SparseMethod, VarianceEstimation, +}; +use crate::{correlation_models::*, utils::pairwise_differences}; +use egobox_doe::{Lhs, SamplingMethod}; +use linfa::prelude::{Dataset, DatasetBase, Fit, Float, PredictInplace}; +use linfa_linalg::{cholesky::*, triangular::*}; +use linfa_pls::PlsRegression; +use ndarray::{arr1, s, Array, Array1, Array2, ArrayBase, Axis, Data, Ix2, Zip}; +use ndarray_einsum_beta::*; +use ndarray_stats::QuantileExt; + +use ndarray_rand::rand::seq::SliceRandom; +use ndarray_rand::rand::SeedableRng; +use rand_xoshiro::Xoshiro256Plus; + +#[cfg(feature = "serializable")] +use serde::{Deserialize, Serialize}; +use std::fmt; + +const N_START: usize = 10; // number of optimization restart (aka multistart) + +/// Woodbury data computed during training and used for prediction +/// +/// Name came from [Woodbury matrix identity](https://en.wikipedia.org/wiki/Woodbury_matrix_identity) +#[cfg_attr( + feature = "serializable", + derive(Serialize, Deserialize), + serde(bound(serialize = "F: Serialize", deserialize = "F: Deserialize<'de>")) +)] +#[derive(Debug)] +pub(crate) struct WoodburyData { + vec: Array2, + inv: Array2, +} +impl Clone for WoodburyData { + 
fn clone(&self) -> Self { + WoodburyData { + vec: self.vec.to_owned(), + inv: self.inv.clone(), + } + } +} + +/// Sparse gaussian process considers a set of `M` inducing points either to approximate the posterior Gaussian distribution +/// with a low-rank representation (FITC - Fully Independent Training Conditional method), or to approximate the posterior +/// distribution directly (VFE - Variational Free Energy method). +/// +/// These methods enable accurate modeling with large training datasets of N points while preserving +/// computational efficiency. With `M < N`, we get `O(NM^2)` complexity instead of `O(N^3)` +/// in time processing and `O(NM)` instead of `O(N^2)` in memory space. +/// +/// See Reference section for more information. +/// +/// # Implementation +/// +/// [`SparseGaussianProcess`] inducing points definition can be either random or provided by the user through +/// the [`Inducings`] specification. The used sparse method is specified with the [`SparseMethod`]. +/// Noise variance can be either specified as a known constant or estimated (see [`VarianceEstimation`]). +/// Unlike [`GaussianProcess`]([crate::GaussianProcess]) implementation [`SparseGaussianProcess`] +/// does not allow choosing a trend which is supposed to be zero. +/// The correlation kernel might be selected amongst [available kernels](crate::correlation_models). +/// When targetting a squared exponential kernel, one can use the [SparseKriging] shortcut. +/// +/// # Features +/// +/// ## serializable +/// +/// The `serializable` feature enables the serialization of GP models using the [`serde crate`](https://serde.rs/). +/// +/// # Example +/// +/// ``` +/// use ndarray::{Array, Array2, Axis}; +/// use ndarray_rand::rand; +/// use ndarray_rand::rand::SeedableRng; +/// use ndarray_rand::RandomExt; +/// use ndarray_rand::rand_distr::{Normal, Uniform}; +/// use linfa::prelude::{Dataset, DatasetBase, Fit, Float, PredictInplace}; +/// +/// use egobox_gp::SparseKriging; +/// +/// const PI: f64 = std::f64::consts::PI; +/// +/// // Let us define a hidden target function for our sparse GP example +/// fn f_obj(x: &Array2) -> Array2 { +/// x.mapv(|v| (3. * PI * v).sin() + 0.3 * (9. * PI * v).cos() + 0.5 * (7. * PI * v).sin()) +/// } +/// +/// // Then we can define a utility function to generate some noisy data +/// // nt points with a gaussian noise with a variance eta2. +/// fn make_test_data( +/// nt: usize, +/// eta2: f64, +/// ) -> (Array2, Array2) { +/// let normal = Normal::new(0., eta2.sqrt()).unwrap(); +/// let mut rng = rand::thread_rng(); +/// let gaussian_noise = Array::::random_using((nt, 1), normal, &mut rng); +/// let xt = 2. 
* Array::<f64, _>::random_using((nt, 1), Uniform::new(0., 1.), &mut rng) - 1.;
+/// let yt = f_obj(&xt) + gaussian_noise;
+/// (xt, yt)
+/// }
+///
+/// // Generate training data
+/// let nt = 200;
+/// // Variance of the gaussian noise on our training data
+/// let eta2: f64 = 0.01;
+/// let (xt, yt) = make_test_data(nt, eta2);
+///
+/// // Train our sparse gaussian process with n inducing points taken in the dataset
+/// let n_inducings = 30;
+/// let sgp = SparseKriging::params()
+///     .n_inducings(n_inducings)
+///     .fit(&Dataset::new(xt, yt))
+///     .expect("SGP fitted");
+///
+/// println!("sgp theta={:?}", sgp.theta());
+/// println!("sgp variance={:?}", sgp.variance());
+/// println!("noise variance={:?}", sgp.noise_variance());
+///
+/// // Predict with our trained SGP
+/// let xplot = Array::linspace(-1., 1., 100).insert_axis(Axis(1));
+/// let sgp_vals = sgp.predict_values(&xplot).unwrap();
+/// let sgp_vars = sgp.predict_variances(&xplot).unwrap();
+/// ```
+///
+/// # Reference
+///
+/// Matthias Bauer, Mark van der Wilk, and Carl Edward Rasmussen.
+/// [Understanding Probabilistic Sparse Gaussian Process Approximations](https://arxiv.org/pdf/1606.04820.pdf).
+/// In: Advances in Neural Information Processing Systems. Ed. by D. Lee et al. Vol. 29. Curran Associates, Inc., 2016
+///
+#[derive(Debug)]
+#[cfg_attr(
+    feature = "serializable",
+    derive(Serialize, Deserialize),
+    serde(bound(serialize = "F: Serialize", deserialize = "F: Deserialize<'de>"))
+)]
+pub struct SparseGaussianProcess<F: Float, Corr: CorrelationModel<F>> {
+    /// Correlation kernel
+    #[cfg_attr(
+        feature = "serializable",
+        serde(bound(serialize = "Corr: Serialize", deserialize = "Corr: Deserialize<'de>"))
+    )]
+    corr: Corr,
+    /// Sparse method used
+    method: SparseMethod,
+    /// Parameter of the autocorrelation model
+    theta: Array1<F>,
+    /// Estimated gaussian process variance
+    sigma2: F,
+    /// Gaussian noise variance
+    noise: F,
+    /// Weights in case of KPLS dimension reduction coming from PLS regression (orig_dim, kpls_dim)
+    w_star: Array2<F>,
+    /// Training inputs
+    xtrain: Array2<F>,
+    /// Training outputs
+    ytrain: Array2<F>,
+    /// Inducing points
+    inducings: Array2<F>,
+    /// Data used for prediction
+    w_data: WoodburyData<F>,
+}
+
+/// Kriging as sparse GP special case when using squared exponential correlation
+pub type SparseKriging<F> = SgpParams<F, SquaredExponentialCorr>;
+
+impl<F: Float> SparseKriging<F> {
+    pub fn params() -> SgpParams<F, SquaredExponentialCorr> {
+        SgpParams::new(SquaredExponentialCorr())
+    }
+}
+
+impl<F: Float, Corr: CorrelationModel<F>> Clone for SparseGaussianProcess<F, Corr> {
+    fn clone(&self) -> Self {
+        Self {
+            corr: self.corr,
+            method: self.method.clone(),
+            theta: self.theta.to_owned(),
+            sigma2: self.sigma2,
+            noise: self.noise,
+            w_star: self.w_star.to_owned(),
+            xtrain: self.xtrain.clone(),
+            ytrain: self.ytrain.clone(),
+            inducings: self.inducings.clone(),
+            w_data: self.w_data.clone(),
+        }
+    }
+}
+
+impl<F: Float, Corr: CorrelationModel<F>> fmt::Display for SparseGaussianProcess<F, Corr> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "SGP({})", self.corr)
+    }
+}
+
+impl<F: Float, Corr: CorrelationModel<F>> SparseGaussianProcess<F, Corr> {
+    /// Gp parameters constructor
+    pub fn params<NewCorr: CorrelationModel<F>>(corr: NewCorr) -> SgpParams<F, NewCorr> {
+        SgpParams::new(corr)
+    }
+
+    fn compute_k(
+        &self,
+        a: &ArrayBase<impl Data<Elem = F>, Ix2>,
+        b: &ArrayBase<impl Data<Elem = F>, Ix2>,
+        w_star: &Array2<F>,
+        theta: &Array1<F>,
+        sigma2: F,
+    ) -> Array2<F> {
+        // Get pairwise componentwise L1-distances to the input training set
+        let dx = pairwise_differences(a, b);
+        // Compute the correlation function
+        let r = self.corr.value(&dx, theta, w_star);
+        r.into_shape((a.nrows(), b.nrows()))
+            .unwrap()
+            .mapv(|v| v * sigma2)
+    }
+
+    /// Predict output values at n given `x` points of nx components specified as a (n, nx) matrix.
+    /// Returns n scalar output values as (n, 1) column vector.
+    pub fn predict_values(&self, x: &ArrayBase<impl Data<Elem = F>, Ix2>) -> Result<Array2<F>> {
+        let kx = self.compute_k(x, &self.inducings, &self.w_star, &self.theta, self.sigma2);
+        let mu = kx.dot(&self.w_data.vec);
+        Ok(mu)
+    }
+
+    /// Predict variance values at n given `x` points of nx components specified as a (n, nx) matrix.
+    /// Returns n variance values as (n, 1) column vector.
+    pub fn predict_variances(&self, x: &ArrayBase<impl Data<Elem = F>, Ix2>) -> Result<Array2<F>> {
+        let kx = self.compute_k(&self.inducings, x, &self.w_star, &self.theta, self.sigma2);
+        let kxx = Array::from_elem(x.nrows(), self.sigma2);
+        let var = kxx - (self.w_data.inv.t().clone().dot(&kx) * &kx).sum_axis(Axis(0));
+        let var = var.mapv(|v| {
+            if v < F::cast(1e-15) {
+                F::cast(1e-15) + self.noise
+            } else {
+                v + self.noise
+            }
+        });
+        Ok(var.insert_axis(Axis(1)))
+    }
+
+    /// Retrieve number of PLS components 1 <= n <= x dimension
+    pub fn kpls_dim(&self) -> Option<usize> {
+        if self.w_star.ncols() < self.xtrain.ncols() {
+            Some(self.w_star.ncols())
+        } else {
+            None
+        }
+    }
+
+    /// Retrieve input dimension before kpls dimension reduction if any
+    pub fn input_dim(&self) -> usize {
+        self.xtrain.ncols()
+    }
+
+    /// Retrieve output dimension
+    pub fn output_dim(&self) -> usize {
+        self.ytrain.ncols()
+    }
+
+    /// Optimal theta
+    pub fn theta(&self) -> &Array1<F> {
+        &self.theta
+    }
+
+    /// Estimated variance
+    pub fn variance(&self) -> F {
+        self.sigma2
+    }
+
+    /// Estimated noise variance
+    pub fn noise_variance(&self) -> F {
+        self.noise
+    }
+
+    /// Inducing points
+    pub fn inducings(&self) -> &Array2<F> {
+        &self.inducings
+    }
+}
+
+impl<F, D, Corr> PredictInplace<ArrayBase<D, Ix2>, Array2<F>> for SparseGaussianProcess<F, Corr>
+where
+    F: Float,
+    D: Data<Elem = F>,
+    Corr: CorrelationModel<F>,
+{
+    fn predict_inplace(&self, x: &ArrayBase<D, Ix2>, y: &mut Array2<F>) {
+        assert_eq!(
+            x.nrows(),
+            y.nrows(),
+            "The number of data points must match the number of output targets."
+        );
+
+        let values = self.predict_values(x).expect("GP Prediction");
+        *y = values;
+    }
+
+    fn default_target(&self, x: &ArrayBase<D, Ix2>) -> Array2<F> {
+        Array2::zeros((x.nrows(), self.output_dim()))
+    }
+}
+
+/// Gaussian Process adapter to implement `linfa::Predict` trait for variance prediction.
+struct GpVariancePredictor<'a, F, Corr>(&'a SparseGaussianProcess<F, Corr>)
+where
+    F: Float,
+    Corr: CorrelationModel<F>;
+
+impl<'a, F, D, Corr> PredictInplace<ArrayBase<D, Ix2>, Array2<F>>
+    for GpVariancePredictor<'a, F, Corr>
+where
+    F: Float,
+    D: Data<Elem = F>,
+    Corr: CorrelationModel<F>,
+{
+    fn predict_inplace(&self, x: &ArrayBase<D, Ix2>, y: &mut Array2<F>) {
+        assert_eq!(
+            x.nrows(),
+            y.nrows(),
+            "The number of data points must match the number of output targets."
+ ); + + let values = self.0.predict_variances(x).expect("GP Prediction"); + *y = values; + } + + fn default_target(&self, x: &ArrayBase) -> Array2 { + Array2::zeros((x.nrows(), self.0.output_dim())) + } +} + +impl, D: Data> + Fit, ArrayBase, GpError> for SgpValidParams +{ + type Object = SparseGaussianProcess; + + /// Fit GP parameters using maximum likelihood + fn fit( + &self, + dataset: &DatasetBase, ArrayBase>, + ) -> Result { + let x = dataset.records(); + let y = dataset.targets(); + if let Some(d) = self.kpls_dim() { + if *d > x.ncols() { + return Err(GpError::InvalidValue(format!( + "Dimension reduction {} should be smaller than actual \ + training input dimensions {}", + d, + x.ncols() + ))); + }; + } + + let xtrain = x.to_owned(); + let ytrain = y.to_owned(); + + let mut w_star = Array2::eye(x.ncols()); + if let Some(n_components) = self.kpls_dim() { + let ds = Dataset::new(x.to_owned(), y.to_owned()); + w_star = PlsRegression::params(*n_components).fit(&ds).map_or_else( + |e| match e { + linfa_pls::PlsError::PowerMethodConstantResidualError() => { + Ok(Array2::zeros((x.ncols(), *n_components))) + } + err => Err(err), + }, + |v| Ok(v.rotations().0.to_owned()), + )?; + }; + + // Initial guess for theta + let theta0: Array1<_> = self + .initial_theta() + .clone() + .map_or(Array1::from_elem(w_star.ncols(), F::cast(1e-2)), |v| { + Array::from_vec(v) + }); + + // Initial guess for variance + let y_std = ytrain.std_axis(Axis(0), F::one()); + let sigma2_0 = y_std[0] * y_std[0]; + //let sigma2_0 = F::cast(1e-2); + + // Initial guess for noise, when noise variance constant, it is not part of optimization params + let (is_noise_estimated, noise0) = match self.noise_variance() { + VarianceEstimation::Constant(c) => (false, c), + VarianceEstimation::Estimated { + initial_guess: c, + bounds: _, + } => (true, c), + }; + + // Params consist in [theta1, ..., thetap, sigma2, [noise]] + // where sigma2 is the variance of the gaussian process + // where noise is the variance of the noise when it is estimated + let n = theta0.len() + 1 + is_noise_estimated as usize; + let mut params_0 = Array1::zeros(n); + params_0 + .slice_mut(s![..n - 1 - is_noise_estimated as usize]) + .assign(&theta0); + params_0[n - 1 - is_noise_estimated as usize] = sigma2_0; + if is_noise_estimated { + // noise variance is estimated, noise0 is initial_guess + params_0[n - 1] = *noise0; + } + + let mut rng = match self.seed() { + Some(seed) => Xoshiro256Plus::seed_from_u64(*seed), + None => Xoshiro256Plus::from_entropy(), + }; + let z = match self.inducings() { + Inducings::Randomized(n) => make_inducings(*n, &xtrain, &mut rng), + Inducings::Located(z) => z.to_owned(), + }; + + // We prefer optimize variable change log10(theta) + // as theta is used as exponent in objective function + let base: f64 = 10.; + let objfn = |x: &[f64], _gradient: Option<&mut [f64]>, _params: &mut ()| -> f64 { + for v in x.iter() { + // check theta as optimizer may give nan values + if v.is_nan() { + // shortcut return worst value wrt to rlf minimization + return f64::INFINITY; + } + } + let input = Array1::from_shape_vec( + (x.len(),), + x.iter().map(|v| F::cast(base.powf(*v))).collect(), + ) + .unwrap(); + + let theta = input.slice(s![..input.len() - 1 - is_noise_estimated as usize]); + let sigma2 = input[input.len() - 1 - is_noise_estimated as usize]; + let noise = if is_noise_estimated { + input[input.len() - 1] + } else { + F::cast(*noise0) + }; + + let theta = theta.mapv(F::cast); + match self.reduced_likelihood( + &theta, + sigma2, + noise, 
+ &w_star, + &xtrain, + &ytrain, + &z, + self.nugget(), + ) { + Ok(r) => unsafe { -(*(&r.0 as *const F as *const f64)) }, + Err(_) => f64::INFINITY, + } + }; + + // Multistart: user theta0 + LHS samplings + let mut params = Array2::zeros((N_START + 1, params_0.len())); + params.row_mut(0).assign(¶ms_0.mapv(|v| F::log10(v))); + let mut xlimits: Array2 = Array2::zeros((params_0.len(), 2)); + for mut row in xlimits.rows_mut() { + row.assign(&arr1(&[F::cast(-6), F::cast(2)])); + } + // Use a seed here for reproducibility. Do we need to make it truly random + // Probably no, as it is just to get init values spread over + // [1e-6, 20] for multistart thanks to LHS method. + let seeds = Lhs::new(&xlimits) + .kind(egobox_doe::LhsKind::Maximin) + .with_rng(Xoshiro256Plus::seed_from_u64(42)) + .sample(N_START); + Zip::from(params.slice_mut(s![1.., ..]).rows_mut()) + .and(seeds.rows()) + .par_for_each(|mut theta, row| theta.assign(&row)); + + // bounds of theta, variance and optionally noise variance + let mut bounds = vec![(F::cast(1e-6).log10(), F::cast(1e2).log10()); params.ncols()]; + // variance bounds + bounds[params.ncols() - 1 - is_noise_estimated as usize] = + (F::cast(1e-6).log10(), (F::cast(9.) * sigma2_0).log10()); + // optionally adjust noise variance bounds + if let VarianceEstimation::Estimated { + initial_guess: _, + bounds: (lo, up), + } = self.noise_variance() + { + // Set bounds for noise + if let Some(noise_bounds) = bounds.last_mut() { + *noise_bounds = (lo.log10(), up.log10()); + } + } + + let opt_params = + params.map_axis(Axis(1), |p| optimize_params(objfn, &p.to_owned(), &bounds)); + let opt_index = opt_params.map(|(_, opt_f)| opt_f).argmin().unwrap(); + let opt_params = &(opt_params[opt_index]).0.mapv(|v| F::cast(base.powf(v))); + // println!("opt_theta={}", opt_theta); + let opt_theta = opt_params + .slice(s![..n - 1 - is_noise_estimated as usize]) + .to_owned(); + let opt_sigma2 = opt_params[n - 1 - is_noise_estimated as usize]; + let opt_noise = if is_noise_estimated { + opt_params[n - 1] + } else { + *noise0 + }; + + // Recompute reduced likelihood with optimized params + let (_, w_data) = self.reduced_likelihood( + &opt_theta, + opt_sigma2, + opt_noise, + &w_star, + &xtrain, + &ytrain, + &z, + self.nugget(), + )?; + Ok(SparseGaussianProcess { + corr: *self.corr(), + method: self.method().clone(), + theta: opt_theta, + sigma2: opt_sigma2, + noise: opt_noise, + w_data, + w_star, + xtrain: xtrain.to_owned(), + ytrain: ytrain.to_owned(), + inducings: z.clone(), + }) + } +} + +impl> SgpValidParams { + /// Compute reduced likelihood function + /// nugget: factor to improve numerical stability + #[allow(clippy::too_many_arguments)] + fn reduced_likelihood( + &self, + theta: &Array1, + sigma2: F, + noise: F, + w_star: &Array2, + xtrain: &Array2, + ytrain: &Array2, + z: &Array2, + nugget: F, + ) -> Result<(F, WoodburyData)> { + let (likelihood, w_data) = match self.method() { + SparseMethod::Fitc => { + self.fitc(theta, sigma2, noise, w_star, xtrain, ytrain, z, nugget) + } + SparseMethod::Vfe => self.vfe(theta, sigma2, noise, w_star, xtrain, ytrain, z, nugget), + }; + + Ok((likelihood, w_data)) + } + + /// Compute covariance matrix between a and b matrices + fn compute_k( + &self, + a: &ArrayBase, Ix2>, + b: &ArrayBase, Ix2>, + w_star: &Array2, + theta: &Array1, + sigma2: F, + ) -> Array2 { + // Get pairwise componentwise L1-distances to the input training set + let dx = pairwise_differences(a, b); + // Compute the correlation function + let r = self.corr().value(&dx, theta, 
w_star); + r.into_shape((a.nrows(), b.nrows())) + .unwrap() + .mapv(|v| v * sigma2) + } + + /// FITC method + #[allow(clippy::too_many_arguments)] + fn fitc( + &self, + theta: &Array1, + sigma2: F, + noise: F, + w_star: &Array2, + xtrain: &Array2, + ytrain: &Array2, + z: &Array2, + nugget: F, + ) -> (F, WoodburyData) { + let nz = z.nrows(); + let knn = Array1::from_elem(xtrain.nrows(), sigma2); + let kmm = self.compute_k(z, z, w_star, theta, sigma2) + Array::eye(nz) * nugget; + let kmn = self.compute_k(z, xtrain, w_star, theta, sigma2); + + // Compute (lower) Cholesky decomposition: Kmm = U U^T + let u = kmm.cholesky().unwrap(); + + // Compute cholesky decomposition: Qnn = V^T V + let ui = u + .solve_triangular(&Array::eye(u.nrows()), UPLO::Lower) + .unwrap(); + let v = ui.dot(&kmn); + + // Assumption on the gaussian noise on training outputs + let eta2 = noise; + + // Compute diagonal correction: nu = Knn_diag - Qnn_diag + \eta^2 + let nu = knn; + let nu = nu - (v.to_owned() * &v).sum_axis(Axis(0)); + let nu = nu + eta2; + // Compute beta, the effective noise precision + let beta = nu.mapv(|v| F::one() / v); + + // Compute (lower) Cholesky decomposition: A = I + V diag(beta) V^T = L L^T + let a = Array::eye(nz) + &(v.to_owned() * beta.to_owned().insert_axis(Axis(0))).dot(&v.t()); + + let l = a.cholesky().unwrap(); + let li = l + .solve_triangular(&Array::eye(l.nrows()), UPLO::Lower) + .unwrap(); + + // Compute a and b + let a = einsum("ij,i->ij", &[ytrain, &beta]) + .unwrap() + .into_dimensionality::() + .unwrap(); + let tmp = li.dot(&v); + let b = tmp.dot(&a); + + // Compute marginal log-likelihood + // constant term ignored in reduced likelihood + //let term0 = self.ytrain.nrows() * F::cast(2. * std::f64::consts::PI); + let term1 = nu.mapv(|v| v.ln()).sum(); + let term2 = F::cast(2.) 
* l.diag().mapv(|v| v.ln()).sum(); + let term3 = (a.t().to_owned()).dot(ytrain)[[0, 0]]; + //let term4 = einsum("ij,ij->", &[&b, &b]).unwrap(); + let term4 = -(b.to_owned() * &b).sum(); + let likelihood = -F::cast(0.5) * (term1 + term2 + term3 + term4); + + // Store Woodbury vectors for prediction step + let li_ui = li.dot(&ui); + let li_ui_t = li_ui.t(); + let w_data = WoodburyData { + vec: li_ui_t.dot(&b), + inv: (ui.t()).dot(&ui) - li_ui_t.dot(&li_ui), + }; + + (likelihood, w_data) + } + + /// VFE method + #[allow(clippy::too_many_arguments)] + fn vfe( + &self, + theta: &Array1, + sigma2: F, + noise: F, + w_star: &Array2, + xtrain: &Array2, + ytrain: &Array2, + z: &Array2, + nugget: F, + ) -> (F, WoodburyData) { + // Compute: Kmm and Kmn + let nz = z.nrows(); + let kmm = self.compute_k(z, z, w_star, theta, sigma2) + Array::eye(nz) * nugget; + let kmn = self.compute_k(z, xtrain, w_star, theta, sigma2); + + // Compute cholesky decomposition: Kmm = U U^T + let u = kmm.cholesky().unwrap(); + + // Compute cholesky decomposition: Qnn = V^T V + let ui = u + .solve_triangular(&Array::eye(u.nrows()), UPLO::Lower) + .unwrap(); + let v = ui.dot(&kmn); + + // Compute beta, the effective noise precision + let beta = F::one() / noise.max(nugget); + + // Compute A = beta * V @ V.T + let a = v.to_owned().dot(&v.t()).mapv(|v| v * beta); + + // Compute cholesky decomposition: B = I + A = L L^T + let b: Array2 = Array::eye(nz) + &a; + let l = b.cholesky().unwrap(); + let li = l + .solve_triangular(&Array::eye(l.nrows()), UPLO::Lower) + .unwrap(); + + // Compute b + let b = li.dot(&v).dot(ytrain).mapv(|v| v * beta); + + // Compute log-marginal likelihood + // constant term ignored in reduced likelihood + //let term0 = self.ytrain.nrows() * (F::cast(2. * std::f64::consts::PI) + let term1 = -F::cast(ytrain.nrows()) * beta.ln(); + let term2 = F::cast(2.) * l.diag().mapv(|v| v.ln()).sum(); + let term3 = beta * (ytrain.to_owned() * ytrain).sum(); + let term4 = -b.t().dot(&b)[[0, 0]]; + let term5 = F::cast(ytrain.nrows()) * beta * sigma2; + let term6 = -a.diag().sum(); + + let likelihood = -F::cast(0.5) * (term1 + term2 + term3 + term4 + term5 + term6); + println!("likelihood={}", likelihood); + + let li_ui = li.dot(&ui); + let bi = Array::eye(nz) + li.t().dot(&li); + let w_data = WoodburyData { + vec: li_ui.t().dot(&b), + inv: ui.t().dot(&bi).dot(&ui), + }; + + (likelihood, w_data) + } +} + +fn make_inducings( + n_inducing: usize, + xt: &Array2, + rng: &mut Xoshiro256Plus, +) -> Array2 { + let mut indices = (0..xt.nrows()).collect::>(); + indices.shuffle(rng); + let n = n_inducing.min(xt.nrows()); + let mut z = Array2::zeros((n, xt.ncols())); + let idx = indices[..n].to_vec(); + Zip::from(z.rows_mut()) + .and(&Array1::from_vec(idx)) + .for_each(|mut zi, i| zi.assign(&xt.row(*i))); + z +} + +#[cfg(test)] +mod tests { + use super::*; + + use approx::assert_abs_diff_eq; + use ndarray::Array; + // use ndarray_npy::{read_npy, write_npy}; + use ndarray_npy::write_npy; + use ndarray_rand::rand::SeedableRng; + use ndarray_rand::rand_distr::{Normal, Uniform}; + use ndarray_rand::RandomExt; + use rand_xoshiro::Xoshiro256Plus; + + const PI: f64 = std::f64::consts::PI; + + fn f_obj(x: &ArrayBase, Ix2>) -> Array2 { + x.mapv(|v| (3. * PI * v).sin() + 0.3 * (9. * PI * v).cos() + 0.5 * (7. 
* PI * v).sin()) + } + + fn make_test_data( + nt: usize, + eta2: f64, + rng: &mut Xoshiro256Plus, + ) -> (Array2, Array2) { + let normal = Normal::new(0., eta2.sqrt()).unwrap(); + let gaussian_noise = Array::::random_using((nt, 1), normal, rng); + let xt = 2. * Array::::random_using((nt, 1), Uniform::new(0., 1.), rng) - 1.; + let yt = f_obj(&xt) + gaussian_noise; + (xt, yt) + } + + fn save_data( + xt: &Array2, + yt: &Array2, + z: &Array2, + xplot: &Array2, + sgp_vals: &Array2, + sgp_vars: &Array2, + ) { + let test_dir = "target/tests"; + std::fs::create_dir_all(test_dir).ok(); + + let file_path = format!("{}/sgp_xt.npy", test_dir); + write_npy(file_path, xt).expect("xt saved"); + let file_path = format!("{}/sgp_yt.npy", test_dir); + write_npy(file_path, yt).expect("yt saved"); + let file_path = format!("{}/sgp_z.npy", test_dir); + write_npy(file_path, z).expect("z saved"); + let file_path = format!("{}/sgp_x.npy", test_dir); + write_npy(file_path, xplot).expect("x saved"); + let file_path = format!("{}/sgp_vals.npy", test_dir); + write_npy(file_path, sgp_vals).expect("sgp vals saved"); + let file_path = format!("{}/sgp_vars.npy", test_dir); + write_npy(file_path, sgp_vars).expect("sgp vars saved"); + } + + #[test] + fn test_sgp_default() { + let mut rng = Xoshiro256Plus::seed_from_u64(0); + // Generate training data + let nt = 200; + // Variance of the gaussian noise on our training data + let eta2: f64 = 0.01; + let (xt, yt) = make_test_data(nt, eta2, &mut rng); + + let xplot = Array::linspace(-1., 1., 100).insert_axis(Axis(1)); + let n_inducings = 30; + + let sgp = SparseKriging::params() + .n_inducings(n_inducings) + .initial_theta(Some(vec![0.1])) + .fit(&Dataset::new(xt.clone(), yt.clone())) + .expect("GP fitted"); + + println!("noise variance={:?}", sgp.noise_variance()); + assert_abs_diff_eq!(eta2, sgp.noise_variance()); + + let sgp_vals = sgp.predict_values(&xplot).unwrap(); + let sgp_vars = sgp.predict_variances(&xplot).unwrap(); + + save_data(&xt, &yt, sgp.inducings(), &xplot, &sgp_vals, &sgp_vars); + } + + #[test] + fn test_sgp_vfe() { + let mut rng = Xoshiro256Plus::seed_from_u64(0); + // Generate training data + let nt = 200; + // Variance of the gaussian noise on our training data + let eta2: f64 = 0.01; + let (xt, yt) = make_test_data(nt, eta2, &mut rng); + // let test_dir = "target/tests"; + // let file_path = format!("{}/smt_xt.npy", test_dir); + // let xt: Array2 = read_npy(file_path).expect("xt read"); + // let file_path = format!("{}/smt_yt.npy", test_dir); + // let yt: Array2 = read_npy(file_path).expect("yt read"); + + let xplot = Array::linspace(-1., 1., 100).insert_axis(Axis(1)); + let n_inducings = 30; + + let z = make_inducings(n_inducings, &xt, &mut rng); + // let file_path = format!("{}/smt_z.npy", test_dir); + // let z: Array2 = read_npy(file_path).expect("z read"); + + let sgp = SparseGaussianProcess::::params( + SquaredExponentialCorr::default(), + ) + .sparse_method(SparseMethod::Vfe) + .inducings(z) + .initial_theta(Some(vec![0.01])) + .fit(&Dataset::new(xt.clone(), yt.clone())) + .expect("GP fitted"); + + println!("theta={:?}", sgp.theta()); + println!("variance={:?}", sgp.variance()); + println!("noise variance={:?}", sgp.noise_variance()); + assert_abs_diff_eq!(eta2, sgp.noise_variance()); + + let sgp_vals = sgp.predict_values(&xplot).unwrap(); + let sgp_vars = sgp.predict_variances(&xplot).unwrap(); + + save_data(&xt, &yt, sgp.inducings(), &xplot, &sgp_vals, &sgp_vars); + } + + #[test] + fn test_sgp_noise_estimation() { + let mut rng = 
Xoshiro256Plus::seed_from_u64(0); + // Generate training data + let nt = 200; + // Variance of the gaussian noise on our training data + let eta2: f64 = 0.01; + let (xt, yt) = make_test_data(nt, eta2, &mut rng); + // let test_dir = "target/tests"; + // let file_path = format!("{}/smt_xt.npy", test_dir); + // let xt: Array2 = read_npy(file_path).expect("xt read"); + // let file_path = format!("{}/smt_yt.npy", test_dir); + // let yt: Array2 = read_npy(file_path).expect("yt read"); + + let xplot = Array::linspace(-1., 1., 100).insert_axis(Axis(1)); + let n_inducings = 30; + + let z = make_inducings(n_inducings, &xt, &mut rng); + // let file_path = format!("{}/smt_z.npy", test_dir); + // let z: Array2 = read_npy(file_path).expect("z read"); + + let sgp = SparseGaussianProcess::::params( + SquaredExponentialCorr::default(), + ) + .sparse_method(SparseMethod::Vfe) + //.sparse_method(SparseMethod::Fitc) + .inducings(z.clone()) + .initial_theta(Some(vec![0.1])) + .noise_variance(VarianceEstimation::Estimated { + initial_guess: 0.02, + bounds: (1e-3, 1.), + }) + .fit(&Dataset::new(xt.clone(), yt.clone())) + .expect("SGP fitted"); + + println!("theta={:?}", sgp.theta()); + println!("variance={:?}", sgp.variance()); + println!("noise variance={:?}", sgp.noise_variance()); + assert_abs_diff_eq!(eta2, sgp.noise_variance(), epsilon = 0.005); + assert_abs_diff_eq!(&z, sgp.inducings(), epsilon = 0.0015); + + let sgp_vals = sgp.predict_values(&xplot).unwrap(); + let sgp_vars = sgp.predict_variances(&xplot).unwrap(); + + save_data(&xt, &yt, &z, &xplot, &sgp_vals, &sgp_vars); + } +} diff --git a/gp/src/sgp_parameters.rs b/gp/src/sgp_parameters.rs new file mode 100644 index 00000000..875a27ea --- /dev/null +++ b/gp/src/sgp_parameters.rs @@ -0,0 +1,230 @@ +use crate::correlation_models::{CorrelationModel, SquaredExponentialCorr}; +use crate::errors::{GpError, Result}; +use crate::mean_models::ConstantMean; +use crate::parameters::GpValidParams; +use linfa::{Float, ParamGuard}; +use ndarray::Array2; +#[cfg(feature = "serializable")] +use serde::{Deserialize, Serialize}; + +/// Variance estimation method +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum VarianceEstimation { + /// Constant variance (ie given not estimated) + Constant(F), + /// Variance is estimated between given bounds (lower, upper) starting from the inital guess + Estimated { initial_guess: F, bounds: (F, F) }, +} +impl Default for VarianceEstimation { + fn default() -> VarianceEstimation { + Self::Constant(F::cast(0.01)) + } +} + +/// SGP inducing points specification +#[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] +pub enum Inducings { + /// `usize` points are selected randomly in the training dataset + Randomized(usize), + /// Points are given as a (npoints, nx) matrix + Located(Array2), +} +impl Default for Inducings { + fn default() -> Inducings { + Self::Randomized(10) + } +} + +/// SGP algorithm method specification +#[derive(Clone, Debug, PartialEq, Eq, Default)] +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] +pub enum SparseMethod { + #[default] + /// Fully Independent Training Conditional method + Fitc, + /// Variational Free Energy method + Vfe, +} + +/// A set of validated SGP parameters. 
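+///
+/// These parameters are built through the [`SgpParams`] builder; a hypothetical
+/// configuration (the numerical values below are illustrative only) could be:
+///
+/// ```no_run
+/// use egobox_gp::{SparseKriging, SparseMethod, VarianceEstimation};
+///
+/// let params = SparseKriging::<f64>::params()
+///     .sparse_method(SparseMethod::Vfe)
+///     .n_inducings(30)
+///     .noise_variance(VarianceEstimation::Estimated {
+///         initial_guess: 0.01,
+///         bounds: (1e-4, 1.),
+///     });
+/// ```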
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct SgpValidParams<F: Float, Corr: CorrelationModel<F>> {
+    /// gp
+    gp_params: GpValidParams<F, ConstantMean, Corr>,
+    /// Gaussian homoscedastic noise variance
+    noise: VarianceEstimation<F>,
+    /// Inducing points
+    z: Inducings<F>,
+    /// Method
+    method: SparseMethod,
+    /// Random generator seed
+    seed: Option<u64>,
+}
+
+impl<F: Float> Default for SgpValidParams<F, SquaredExponentialCorr> {
+    fn default() -> SgpValidParams<F, SquaredExponentialCorr> {
+        SgpValidParams {
+            gp_params: GpValidParams::default(),
+            noise: VarianceEstimation::default(),
+            z: Inducings::default(),
+            method: SparseMethod::default(),
+            seed: None,
+        }
+    }
+}
+
+impl<F: Float, Corr: CorrelationModel<F>> SgpValidParams<F, Corr> {
+    /// Get starting theta value for optimization
+    pub fn initial_theta(&self) -> &Option<Vec<F>> {
+        &self.gp_params.theta
+    }
+
+    /// Get correlation corr k(x, x')
+    pub fn corr(&self) -> &Corr {
+        &self.gp_params.corr
+    }
+
+    /// Get number of components used by PLS
+    pub fn kpls_dim(&self) -> &Option<usize> {
+        &self.gp_params.kpls_dim
+    }
+
+    /// Get the nugget factor used to improve numerical stability
+    pub fn nugget(&self) -> F {
+        self.gp_params.nugget
+    }
+
+    /// Get used sparse method
+    pub fn method(&self) -> &SparseMethod {
+        &self.method
+    }
+
+    /// Get inducing points
+    pub fn inducings(&self) -> &Inducings<F> {
+        &self.z
+    }
+
+    /// Get noise variance configuration
+    pub fn noise_variance(&self) -> &VarianceEstimation<F> {
+        &self.noise
+    }
+
+    /// Get seed
+    pub fn seed(&self) -> &Option<u64> {
+        &self.seed
+    }
+}
+
+#[derive(Clone, Debug)]
+/// The set of hyperparameters that can be specified for the execution of
+/// the [SGP algorithm](struct.SparseGaussianProcess.html).
+pub struct SgpParams<F: Float, Corr: CorrelationModel<F>>(SgpValidParams<F, Corr>);
+
+impl<F: Float, Corr: CorrelationModel<F>> SgpParams<F, Corr> {
+    /// A constructor for SGP parameters given a correlation model
+    pub fn new(corr: Corr) -> SgpParams<F, Corr> {
+        Self(SgpValidParams {
+            gp_params: GpValidParams {
+                theta: None,
+                mean: ConstantMean::default(),
+                corr,
+                kpls_dim: None,
+                nugget: F::cast(1000.0) * F::epsilon(),
+            },
+            noise: VarianceEstimation::default(),
+            z: Inducings::default(),
+            method: SparseMethod::default(),
+            seed: None,
+        })
+    }
+
+    /// Set initial value for theta hyper parameter.
+    ///
+    /// During training process, the internal optimization is started from `initial_theta`.
+    pub fn initial_theta(mut self, theta: Option<Vec<F>>) -> Self {
+        self.0.gp_params.theta = theta;
+        self
+    }
+
+    /// Set correlation model.
+    pub fn corr(mut self, corr: Corr) -> Self {
+        self.0.gp_params.corr = corr;
+        self
+    }
+
+    /// Set number of PLS components.
+    ///
+    /// Should be 0 < n < problem size (i.e. x dimension)
+    pub fn kpls_dim(mut self, kpls_dim: Option<usize>) -> Self {
+        self.0.gp_params.kpls_dim = kpls_dim;
+        self
+    }
+
+    /// Set nugget value.
+    ///
+    /// Nugget is used to improve numerical stability
+    pub fn nugget(mut self, nugget: F) -> Self {
+        self.0.gp_params.nugget = nugget;
+        self
+    }
+
+    /// Specify the sparse method
+    pub fn sparse_method(mut self, method: SparseMethod) -> Self {
+        self.0.method = method;
+        self
+    }
+
+    /// Specify nz inducing points as (nz, x_dim) matrix.
+    pub fn inducings(mut self, z: Array2<F>) -> Self {
+        self.0.z = Inducings::Located(z);
+        self
+    }
+
+    /// Specify nz number of inducing points which will be picked randomly in the input training dataset.
+    pub fn n_inducings(mut self, nz: usize) -> Self {
+        self.0.z = Inducings::Randomized(nz);
+        self
+    }
+
+    /// Set noise variance configuration defining noise handling.
+    pub fn noise_variance(mut self, config: VarianceEstimation<F>) -> Self {
+        self.0.noise = config;
+        self
+    }
+
+    /// Set the seed of the random generator used when inducing points are picked randomly.
+    pub fn seed(mut self, seed: Option<u64>) -> Self {
+        self.0.seed = seed;
+        self
+    }
+}
+
+impl<F: Float, Corr: CorrelationModel<F>> ParamGuard for SgpParams<F, Corr> {
+    type Checked = SgpValidParams<F, Corr>;
+    type Error = GpError;
+
+    fn check_ref(&self) -> Result<&Self::Checked> {
+        if let Some(d) = self.0.gp_params.kpls_dim {
+            if d == 0 {
+                return Err(GpError::InvalidValue("`kpls_dim` cannot be 0!".to_string()));
+            }
+            if let Some(theta) = self.0.initial_theta() {
+                if theta.len() > 1 && d > theta.len() {
+                    return Err(GpError::InvalidValue(format!(
+                        "Dimension reduction ({}) should be smaller than expected
+                        training input size inferred from given initial theta length ({})",
+                        d,
+                        theta.len()
+                    )));
+                };
+            }
+        }
+        Ok(&self.0)
+    }
+
+    fn check(self) -> Result<Self::Checked> {
+        self.check_ref()?;
+        Ok(self.0)
+    }
+}
diff --git a/moe/src/algorithm.rs b/moe/src/algorithm.rs
index 9c8a0294..67a5476d 100644
--- a/moe/src/algorithm.rs
+++ b/moe/src/algorithm.rs
@@ -1062,7 +1062,7 @@ mod tests {
     }
 
     #[test]
-    fn test_moe_drv_hard() {
+    fn test_moe_drv_smooth() {
         let rng = Xoshiro256Plus::seed_from_u64(0);
         let xt = Lhs::new(&array![[0., 1.]]).sample(100);
         let yt = f_test_1d(&xt);
@@ -1071,7 +1071,7 @@
             .n_clusters(3)
             .regression_spec(RegressionSpec::CONSTANT)
             .correlation_spec(CorrelationSpec::SQUAREDEXPONENTIAL)
-            .recombination(Recombination::Hard)
+            .recombination(Recombination::Smooth(Some(0.5)))
             .with_rng(rng)
             .fit(&Dataset::new(xt, yt))
             .expect("MOE fitted");
@@ -1090,32 +1090,25 @@
 
         for _ in 0..20 {
             let x1: f64 = rng.gen_range(0.0..1.0);
 
-            if (0.39 < x1 && x1 < 0.41) || (0.79 < x1 && x1 < 0.81) {
-                // avoid testing hard on discoontinuity
-                continue;
+            let h = 1e-4;
+            let xtest = array![[x1]];
+
+            let x = array![[x1], [x1 + h], [x1 - h]];
+            let preds = moe.predict_values(&x).unwrap();
+            let fdiff = (preds[[1, 0]] - preds[[2, 0]]) / (2. * h);
+
+            let drv = moe.predict_derivatives(&xtest).unwrap();
+            let df = df_test_1d(&xtest);
+
+            let err = if drv[[0, 0]] < 0.2 {
+                (drv[[0, 0]] - fdiff).abs()
             } else {
-                let h = 1e-4;
-                let xtest = array![[x1]];
-
-                let x = array![[x1], [x1 + h], [x1 - h]];
-                let preds = moe.predict_derivatives(&x).unwrap();
-                let fdiff = preds[[1, 0]] - preds[[1, 0]] / 2. * h;
-
-                let drv = moe.predict_derivatives(&xtest).unwrap();
-                let df = df_test_1d(&xtest);
-
-                if (df[[0, 0]] - fdiff).abs() > 10.0 {
-                    let err = if drv[[0, 0]] < 0.2 {
-                        (drv[[0, 0]] - fdiff).abs()
-                    } else {
-                        (drv[[0, 0]] - fdiff).abs() / drv[[0, 0]]
-                    };
-                    println!(
-                        "Test predicted derivatives at {xtest}: drv {drv}, true df {df}, fdiff {fdiff}"
-                    );
-                    assert_abs_diff_eq!(err, 0.0, epsilon = 2e-1);
-                }
-            }
+                (drv[[0, 0]] - fdiff).abs() / drv[[0, 0]]
+            };
+            println!(
+                "Test predicted derivatives at {xtest}: drv {drv}, true df {df}, fdiff {fdiff}"
+            );
+            assert_abs_diff_eq!(err, 0.0, epsilon = 2e-1);
         }
     }
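The updated MoE test above validates analytic derivatives against a second-order central
difference of the predicted values. A standalone sketch of that check (the `predict` closure
and tolerance are illustrative placeholders, not part of the patch):

```rust
/// Central finite-difference approximation of df/dx at `x` for a scalar 1D model.
fn central_diff(predict: impl Fn(f64) -> f64, x: f64, h: f64) -> f64 {
    // (f(x + h) - f(x - h)) / (2h) has O(h^2) truncation error
    (predict(x + h) - predict(x - h)) / (2. * h)
}

fn main() {
    // Sanity check against a known function: d/dx sin(x) = cos(x)
    let fdiff = central_diff(f64::sin, 1.0, 1e-4);
    assert!((fdiff - 1.0_f64.cos()).abs() < 1e-6);
}
```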