From 1390f05f2facf27dc27f1cccb0cab5611c06e18c Mon Sep 17 00:00:00 2001
From: Tommy Odland
Date: Thu, 28 Dec 2023 13:24:21 +0100
Subject: [PATCH 1/9] added lasso.md

---
 lasso.md | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 lasso.md

diff --git a/lasso.md b/lasso.md
new file mode 100644
index 00000000..69d9ad52
--- /dev/null
+++ b/lasso.md
@@ -0,0 +1,126 @@
+# Lasso
+
+## Notation and preliminaries
+
+Let $X \in \mathbb{R}^{n \times N}$ be a parameter ensemble matrix.
+There are $n$ parameters (rows) and $N$ realizations in the ensemble (columns).
+
+Let $Y \in \mathbb{R}^{m \times N}$ be a response ensemble matrix.
+The forward model is $h: \mathbb{R}^n \to \mathbb{R}^m$.
+
+Assume that observational noise is generated as $D \sim \mathcal{N}(d_\text{obs}, \Sigma_\epsilon)$, where $D \in \mathbb{R}^{m \times N}$.
+
+## Update equation
+
+We will start with the ESMDA update equation, and work our way to the linear regression update equation.
+
+The ESMDA update equation, with inflation factor $\alpha=1$, is
+
+$$X_\text{posterior} = X + \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y)$$
+where $\operatorname{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$ and $c(X)$ centers each row in X by subtracting the mean.
+
+The term $K := \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain.
+
+## The Kalman gain
+
+We will now transform the Kalman gain matrix into a linear regression.
+We switch notations and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume that each row (variable) has been centered.
+
+Then we have
+$$
+K = \frac{X Y^T}{N - 1} \left(
+\frac{Y Y^T}{N - 1} + \Sigma_\epsilon
+\right)^{-1}.
+$$
+
+Since $D D^T / (N - 1) \approx \Sigma_\epsilon$, we can approximate $K$ as
+$$
+K = X Y^T \left(
+Y Y^T + D D^T
+\right)^{-1}.
+$$
+
+If we assume that $Y D^T \approx D Y^T \approx 0$, then the middle terms in $(Y + D)(Y + D)^T = Y Y^T + Y D^T + D Y^T + D D^T$ vanish.
+Define $Y_\text{noisy} := Y + D$, then
+$$
+K = X Y^T \left(
+Y_\text{noisy} Y_\text{noisy}^T
+\right)^{+}.
+$$
+Finally, assume that $X D^T \approx 0$, then
+$$
+K = X Y_\text{noisy}^{+}.
+$$
+We can find $K$ by solving the optimization problem
+$$
+\lVert Y^T K^T - X^T \rVert^2
+$$
+using e.g. Ridge or Lasso.
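+
+As a rough illustration (a minimal sketch, not a prescription): the least squares problem
+$Y_\text{noisy}^T K^T = X^T$ decomposes over the rows of $X$, so each row of $K$ can be
+fit with an off-the-shelf penalized regression. The helper name `estimate_K` and the
+fixed penalty `reg` below are placeholders; cross-validated penalties are used in
+`docs/source/Lasso.py` later in this patch series.
+
+```python
+import numpy as np
+from sklearn.linear_model import Lasso, Ridge
+
+
+def estimate_K(Y_noisy, X, method="lasso", reg=0.1):
+    """Solve Y_noisy.T @ K.T = X.T for K, one row of X at a time."""
+    Model = {"lasso": Lasso, "ridge": Ridge}[method]
+    rows = []
+    for x_row in X:  # each parameter (row of X) is an independent regression
+        model = Model(alpha=reg, fit_intercept=False)
+        model.fit(Y_noisy.T, x_row)  # ensemble members act as samples
+        rows.append(model.coef_)
+    return np.vstack(rows)  # shape (num_parameters, num_observations)
+```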
+ + +## Solution approaches + +### SVD + +Since $K (D - Y)$ + + + + + + + + + + + + + + +The ES update equation is + +$$X_\text{posterior} = X + K (D - Y)$$ + +where + +$$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1}$$ +$$K = \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_{\epsilon})^{-1}$$ + +## Covariances with linear forward model + +Assume that $y = h(x) = H x + c$, then + +$$\operatorname{cov}(x, y ) = \operatorname{cov}(x, x) H^T.$$ + +Similarity, if $y = h(x) = H x + c$, then + +$$\operatorname{cov}(y, y ) = H \operatorname{cov}(x, x) H^T.$$ + +## Variations on the update equation + + +## References + +- [Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf) + + + + + +------------------ + + + + +- test +- test + +$`a=b`$ + +`$$a=b$$` + +$$`a=b`$$ + +```math +\left( \sum_{k=1}^n a_k b_k \right)^2 \leq \left( \sum_{k=1}^n a_k^2 \right) \left( \sum_{k=1}^n b_k^2 \right) +``` From af1afe5a44ff03653cff4931585537eb472f292e Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 29 Dec 2023 09:51:05 +0100 Subject: [PATCH 2/9] notebook --- docs/source/Lasso.py | 460 +++++++++++++++++++++++++++++++++++++++++++ lasso.md | 131 ++++++------ 2 files changed, 517 insertions(+), 74 deletions(-) create mode 100644 docs/source/Lasso.py diff --git a/docs/source/Lasso.py b/docs/source/Lasso.py new file mode 100644 index 00000000..75a911c3 --- /dev/null +++ b/docs/source/Lasso.py @@ -0,0 +1,460 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% +# ruff: noqa: E402 + +# %% [markdown] +# # Adaptive Localization +# +# In this example we run adaptive localization +# on a sparse Gauss-linear problem. +# +# - Each response is only affected by $3$ parameters. +# - This is represented by a tridiagonal matrix $A$ in the forward model $g(x) = Ax$. +# - The problem is Gauss-Linear, so in this case ESMDA will sample +# the true posterior when the number of ensemble members (realizations) is large. +# - The sparse correlation structure will lead to spurious correlations, in the sense +# that a parameter and response might appear correlated when in fact they are not. + +# %% [markdown] +# ## Import packages + +# %% +import numpy as np +import scipy as sp +from matplotlib import pyplot as plt + +from iterative_ensemble_smoother import ESMDA +from iterative_ensemble_smoother.esmda_inversion import empirical_cross_covariance +from iterative_ensemble_smoother.experimental import AdaptiveESMDA +from iterative_ensemble_smoother.utils import sample_mvnormal + +# %% [markdown] +# ## Create problem data +# +# Some settings worth experimenting with: +# +# - Decreasing `prior_std=1` will pull the posterior solution toward zero. +# - Increasing `num_ensemble` will increase the quality of the solution. +# - Increasing `num_observations / num_parameters` +# will increase the quality of the solution. 
+ +# %% +rng = np.random.default_rng(42) + +# Dimensionality of the problem +num_parameters = 100 +num_observations = 50 +num_ensemble = 25 +prior_std = 1 + +# Number of iterations to use in ESMDA +alpha = 1 + +# Effect size (coefficients in sparse mapping A) +effect_size = 5 + +# %% [markdown] +# ## Create problem data - sparse tridiagonal matrix $A$ + +# %% +diagonal = effect_size * np.ones(min(num_parameters, num_observations)) + +# Create a tridiagonal matrix (easiest with scipy) +A = sp.sparse.diags( + [diagonal, diagonal, diagonal], + offsets=[-1, 0, 1], + shape=(num_observations, num_observations), # (num_observations, num_parameters), + dtype=float, +).toarray() + +A = np.tile(A, reps=(1, num_parameters // num_observations + 1)) +A = A[:num_observations, :num_parameters] + + +# Create a tridiagonal matrix (easiest with scipy) +A = sp.sparse.diags( + [diagonal, diagonal, diagonal], + offsets=[-1, 0, 1], + shape=(num_observations, num_parameters), + dtype=float, +).toarray() + +# We add some noise that is insignificant compared to the +# actual local structure in the forward model +A = A + rng.standard_normal(size=A.shape) * 0.001 + +plt.title("Linear mapping $A$ in forward model $g(x) = Ax$") +plt.imshow(A) +plt.xlabel("Parameters (inputs)") +plt.ylabel("Responses (outputs)") +plt.tight_layout() +plt.show() + + +# %% [markdown] +# - Below we draw prior realizations $X \sim N(0, \sigma)$. +# - The true parameter values used to generate observations are in the range $[-1, 1]$. +# - As the number of realizations (ensemble members) goes to infinity, +# the correlation between the prior and the true parameter values converges to zero. +# - The correlation is zero for a finite number of realizations too, +# but statistical noise might induce some spurious correlations between +# the prior and the true parameter values. +# +# **In summary the baseline correlation is zero.** +# Anything we can do to increase the correlation above zero beats the baseline, +# which is using the prior (no update). +# +# The correlation coefficient does not take into account the uncertainty +# represeted in the posterior, only the mean posterior value is compared with +# the true parameter values. +# To compare distributions we could use the +# [Kullback–Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence), +# but we do not pursue this here. + +# %% +def get_prior(num_parameters, num_ensemble, prior_std): + """Sample prior from N(0, prior_std).""" + return rng.normal(size=(num_parameters, num_ensemble)) * prior_std + + +def g(X): + """Apply the forward model.""" + return A @ X + + +# Create observations: obs = g(x) + N(0, 1) +x_true = np.linspace(-1, 1, num=num_parameters) +observation_noise = rng.standard_normal(size=num_observations) # N(0, 1) noise +observations = g(x_true) + observation_noise + +# Initial ensemble X ~ N(0, prior_std) and diagonal covariance with ones +X = get_prior(num_parameters, num_ensemble, prior_std) + +# Covariance matches the noise added to observations above +covariance = np.ones(num_observations) # N(0, 1) covariance + +# %% [markdown] +# ## Solve the maximum likelihood problem +# +# We can solve $Ax = b$, where $b$ are the observations, +# for the maximum likelihood estimate. +# +# Notice that unlike using a Ridge model, +# solving $Ax = b$ directly does not use any prior information. 
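+
+# %% [markdown]
+# As an aside, here is a hedged sketch (not used elsewhere in this notebook) of the
+# alternative mentioned above: with the Gaussian prior x ~ N(0, prior_std^2 I) and
+# unit observation noise, the MAP estimate is a ridge solution with penalty
+# 1 / prior_std^2.
+
+# %%
+# MAP / ridge estimate that does use the prior (illustrative sketch)
+x_map = np.linalg.solve(
+    A.T @ A + (1 / prior_std**2) * np.eye(num_parameters),
+    A.T @ observations,
+)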
+ +# %% +x_ml, *_ = np.linalg.lstsq(A, observations, rcond=None) + +plt.figure(figsize=(7, 3)) +plt.scatter(np.arange(len(x_true)), x_true, label="True parameter values") +plt.scatter(np.arange(len(x_true)), x_ml, label="ML estimate (no prior)") +plt.xlabel("Parameter index") +plt.ylabel("Parameter value") +plt.grid(True, ls="--", zorder=0, alpha=0.33) +plt.legend() +plt.tight_layout() +plt.show() + +# %% +import numpy as np +from sklearn.linear_model import LassoCV +from sklearn.preprocessing import StandardScaler + + +def linear_l1_regression(U, Y): + """ + Performs LASSO regression for each response in Y against predictors in U, + constructing a sparse matrix of regression coefficients. + + The function scales features in U using standard scaling before applying + LASSO, then re-scales the coefficients to the original scale of U. This + extracts the effect of each feature in U on each response in Y, ignoring + intercepts and constant terms. + + Parameters + ---------- + U : np.ndarray + 2D array of predictors with shape (n, p). + Y : np.ndarray + 2D array of responses with shape (n, m). + + Returns + ------- + H_sparse : scipy.sparse.csc_matrix + Sparse matrix (m, p) with re-scaled LASSO regression coefficients for + each response in Y. + + Raises + ------ + AssertionError + If the number of samples in U and Y do not match, or if the shape of + H_sparse is not (m, p). + """ + # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py + + n, p = U.shape # p: number of features + n_y, m = Y.shape # m: number of y responses + + # Assert that the first dimension of U and Y are the same + assert n == n_y, "Number of samples in U and Y must be the same" + + scaler_u = StandardScaler() + U_scaled = scaler_u.fit_transform(U) + + scaler_y = StandardScaler() + Y_scaled = scaler_y.fit_transform(Y) + + # Loop over features + coefs = [] + for j in range(m): + y_j = Y_scaled[:, j] + + # Learn individual regularization and fit + alphas = np.logspace(-8, 8, num=32) + model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=5) + model_cv.fit(U_scaled, y_j) + + coef_scale = scaler_y.scale_[j] / scaler_u.scale_ + coefs.append(model_cv.coef_ * coef_scale) + + K = np.vstack(coefs) + assert K.shape == (m, p) + + return K + + +Y = g(X) +K = linear_l1_regression(X.T, Y.T) +plt.imshow(K) +plt.show() + + +# %% +def solve_esmda(X, Y, covariance, observations, seed=1): + """Solving using exact ESMDA computation.""" + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] # Number of ensemble members (realizations) + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Cross covariance and covariance + cov_XY = empirical_cross_covariance(X, Y) + cov_YY = empirical_cross_covariance(Y, Y) + + # Compute the update using the exact ESMDA approach + K = cov_XY @ sp.linalg.inv(cov_YY + np.diag(covariance)) + return X + K @ (D - Y) + + +def solve_projected(X, Y, covariance, observations, seed=1): + """Instead of using the covariance matrix, sample S ~ N(0, Cov), + then use S @ S.T in place of it. This is a low rank approximation. + + In itsself, this method has little going for it. But it leads + to other approaches since Y_noisy = Y + S can be defined. 
+ """ + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Instead of using the covariance matrix, sample from it + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + + cov_XY = empirical_cross_covariance(X, Y) + cov_YY = empirical_cross_covariance(Y, Y) + + # Compute the update + K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / (N - 1)) + return X + K @ (D - Y) + + +def solve_lstsq(X, Y, covariance, observations, seed=1): + """Since K =~ E(grad h^-1), we have K @ Y = X. + Here we simply solve Y.T @ K.T = X.T for X.""" + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Instead of using the covariance matrix, sample from it + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + + Y_noisy = Y + S + + # Compute the update by solving K @ Y = X + K, *_ = sp.linalg.lstsq(Y_noisy.T, X.T) + K = K.T + + return X + K @ (D - Y) + + +def solve_lasso_direct(X, Y, covariance, observations, seed=1): + """Use lasso to solve for H, then use the update equation + K := cov_XX @ H.T @ inv(H @ cov_XX @ H.T + Cov) + """ + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Approximate forward model with H + H = linear_l1_regression(X.T, Y.T) + cov_XX = empirical_cross_covariance(X, X) + + # Compute the update + K = cov_XX @ H.T @ sp.linalg.inv(H @ cov_XX @ H.T + np.diag(covariance)) + return X + K @ (D - Y) + + +def solve_lasso(X, Y, covariance, observations, seed=1): + """Solve K @ Y = X using Lasso.""" + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Instead of using the covariance matrix, sample from it + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + + Y_noisy = Y + S # Seems to work better with Y instead of Y_noisy + + # Compute the update + K = linear_l1_regression(Y_noisy.T, X.T) + + return X + K @ (D - Y) + + +def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): + """Solving using adaptive ESMDA with correlation threshold.""" + + adaptive_smoother = AdaptiveESMDA( + covariance=covariance, + observations=observations, + seed=seed, + ) + + X_i = np.copy(X) + + for i, alpha_i in enumerate([1], 1): + # Perturb observations + D_i = adaptive_smoother.perturb_observations( + ensemble_size=X_i.shape[1], alpha=alpha_i + ) + + # Assimilate data + X_i = adaptive_smoother.assimilate( + X=X_i, Y=g(X_i), D=D_i, alpha=alpha_i, verbose=False + ) + + return X_i + + +# %% +plt.figure(figsize=(7, 3)) + +for function, label in zip( + [ + solve_esmda, + solve_projected, + solve_lstsq, + solve_lasso_direct, + solve_lasso, + solve_adaptive_ESMDA, + ], + [ + "ESMDA", + "lstsq", + "Lasso (direct)", + "Lasso", + "AdaptiveESMDA", + ], +): + print(label) + # Loop over seeds and solve + corrs = [] + posterior_means = [] + for seed in range(10): + X_posterior = function(X, g(X), covariance, observations, seed=seed) + x_posterior_mean = X_posterior.mean(axis=1) + corr = 
sp.stats.pearsonr(x_true, x_posterior_mean).statistic + corrs.append(corr) + posterior_means.append(x_posterior_mean) + + posterior_mean = np.vstack(posterior_means).mean(axis=0) + corr_mean, corr_std = np.mean(corrs), np.std(corrs) + + plt.scatter( + np.arange(len(x_true)), + posterior_mean, + label=f"{label} (corr: {corr_mean:.2f} +- {corr_std:.2f})", + alpha=0.6, + ) + + +corr = sp.stats.pearsonr(x_true, x_posterior_mean).statistic +plt.scatter( + np.arange(len(x_true)), + x_ml, + label=f"ML solution (knowing h(x) = Ax) (corr: {corr:.2f})", +) +plt.scatter( + np.arange(len(x_true)), + x_true, + label="True parameter values", +) + +# plt.scatter(np.arange(len(x_true)), x_proj, label="Projected") +plt.xlabel("Parameter index") +plt.ylabel("Parameter value") +plt.ylim([-3, 3]) +plt.grid(True, ls="--", zorder=0, alpha=0.33) +plt.legend() +plt.tight_layout() +plt.show() + +# %% diff --git a/lasso.md b/lasso.md index 69d9ad52..ae6526b9 100644 --- a/lasso.md +++ b/lasso.md @@ -1,92 +1,97 @@ # Lasso +Notes on the Lasso approach for + ## Notation and preliminaries -Let $X \in \mathbb{R}^{n \times N}$ be a parameter ensemble matrix. -There are $n$ parameters (rows) and $N$ realizations in the ensemble (columns). +In this section we establish notation. -Let $Y \in \mathbb{R}^{m \times N}$ be a response ensemble matrix. -The forward model is $h: \mathbb{R}^n \to \mathbb{R}^n$. +- Let $X \in \mathbb{R}^{n \times N}$ be a parameter ensemble matrix. +There are $n$ parameters (rows) and $N$ realizations in the ensemble (columns). +- Let $Y \in \mathbb{R}^{m \times N}$ be a response ensemble matrix. +- The forward model is $h: \mathbb{R}^n \to \mathbb{R}^n$. +- The observational noise has distribution $\mathcal{N}(0, \Sigma_\epsilon)$. -Assume that observational noise is generated as $D \sim \mathcal{N}(d_\text{obs}, \Sigma_\epsilon)$, where $D \in \mathbb{R}^{m \times N}$. +To add observational noise to the observations, we sample $D \sim \mathcal{N}(d_\text{obs}, \Sigma_\epsilon)$, where $D \in \mathbb{R}^{m \times N}$. +Each column in D is drawn from the multivariate normal. +We take the Cholesky factorization of the covariance and write $\Sigma_\epsilon = L_\epsilon L_\epsilon^T$. +Note that each column in $D$ can be computed as $d_\text{obs} + L_\epsilon z$, where $z \sim \mathcal{N}(0, 1)$. ## Update equation -We will start with the ESMDA update equation, and work our way to the linear regression update equation. +Our starting point for the update equation will be from the ESMDA paper. The ESMDA update equation, with inflation factor $\alpha=1$, is -$$X_\text{posterior} = X + \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y)$$ -where $\operatorname{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$ and $c(X)$ centers each row in X by subtracting the mean. +$$X_\text{posterior} = X + \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y),$$ -The term $K := \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain. +where the empirical cross covariance matrix $\operatorname{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$, and $c(X)$ centers each row in X by subtracting the mean. + +The term $K := \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain matrix. +Notice that in this expression we estimate both the covariance matrix and the cross covariance matrix using few realizations $N$. ## The Kalman gain -We will now transform the kalman gain matrix into a linear regression. 
-We switch notations and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume that each row (variable) has been centered. +We will now transform the Kalman gain matrix into a linear regression. + +We switch notations and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume that each row (variable) has been centered by subtracting the mean. -Then we have +Writing out the empirical covariances, the Kalman gain becomes $$ K = \frac{X Y^T}{N - 1} \left( \frac{Y Y^T}{N - 1} + \Sigma_\epsilon \right)^{-1}. $$ -Since $D D^T / (N - 1) \approx \Sigma_\epsilon$, we can approximate $K$ as +### Adding sampled noise + +Let $S \sim \mathcal{N}(0, \Sigma_\epsilon)$. +We can write $S S^T / (N - 1) \approx \Sigma_\epsilon$, which is a low rank approximation to the observation covariance matrix. +Notice that the observation covariance matrix is typically diagonal, has shape $m \times m$, whereas $S S^T$ has rank $N \ll m$. + +Replacing $\Sigma_\epsilon$ with $S S^T / (N - 1)$, +we can approximate $K$ as $$ -K = X Y^T \left( -Y Y^T + D D^T +K \approx X Y^T \left( +Y Y^T + S S^T \right)^{-1}. $$ -If we assume that $YD^T \approx DY^T \approx 0$, then the middle terms in $(Y +D)( Y + D)^T = Y Y^T + Y D^T + DY^T + DD^T$ vanish. -Define $Y_\text{noisy} := Y + D$, then +If we assume that $YS^T \approx SY^T \approx 0$, then the middle terms in $(Y +S)( Y + S)^T = Y Y^T + Y S^T + SY^T + SS^T$ vanish. +This is reasonable in the limit of large $N$ since the rows in $S$ are uncorrelated with the rows in $Y$. + +Define $Y_\text{noisy} := Y + S$, then $$ -K = X Y^T \left( +K \approx X Y^T \left( Y_\text{noisy} Y_\text{noisy}^T \right)^{+}. $$ -Finally, assume that $X D^T \approx 0$, then -$$ -K = X Y_\text{noisy}^{+}. +Finally, assume that $X S^T \approx 0$ so that $Y \approx Y_\text{noisy}$, then $$ -We can find $K$ by solving the optimization problem +K = X Y_\text{noisy}^{+} $$ -\lVert Y^T K^T - X^T \rVert^2 -$$ -using e.g. Ridge og Lasso. - - -## Solution approaches - -### SVD - -Since $K (D - Y)$ - - - - - - - +and we solve a least squares problem $Y_\text{noisy}^T K^T = X^T$ for $K$. +If we use Ridge or Lasso, the solution decomposes since in general $A[x_1 | x_2] = [Ax_1 | Ax_2] = [y_1 | y_2]$ and we can consider each row of $X$ independently. +We can find $K$ by solving a Ridge or Lasso problem. +### Comment: Ridge and the Kalman gain +Solving $Y^T K^T = X^T$ with Ridge produces the Normal equations that define $K$. +## Comment: Another Lasso approach +The Kalman gain can also be written as +$$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1}$$ +which suggests another possible way to compute $K$: +1. Estimate $\hat{H}$ using Lasso or similar. +2. Estimate $\Sigma_{x}$ as $\operatorname{cov}(X, X)$. +This is likely unfeasible since $\operatorname{cov}(X, X)$ becomes huge. -The ES update equation is - -$$X_\text{posterior} = X + K (D - Y)$$ - -where - -$$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1}$$ -$$K = \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_{\epsilon})^{-1}$$ +## Comment: Covariances with linear forward model -## Covariances with linear forward model +Here we show two facts that can be proven using the definition of covariance, see e.g. sections 6.2.2 and 8.2.1 in [The Matrix Cookbook](https://www.math.uwaterloo.ca/~hwolkowi/matrixcookbook.pdf). 
Assume that $y = h(x) = H x + c$, then @@ -96,31 +101,9 @@ Similarity, if $y = h(x) = H x + c$, then $$\operatorname{cov}(y, y ) = H \operatorname{cov}(x, x) H^T.$$ -## Variations on the update equation - - ## References -- [Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf) - - - - - ------------------- - - - - -- test -- test - -$`a=b`$ - -`$$a=b$$` - -$$`a=b`$$ - -```math -\left( \sum_{k=1}^n a_k b_k \right)^2 \leq \left( \sum_{k=1}^n a_k^2 \right) \left( \sum_{k=1}^n b_k^2 \right) -``` +- [Issue 6599 on the Lasso in ERT](https://github.com/equinor/ert/issues/6599) +- [Presentation: Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf) +- [Paper: Ensemble Smoother with Multiple Data Assimilation](http://dx.doi.org/10.1016/j.cageo.2012.03.011) +- [Paper: Ensemble transport smoothing. Part I: Unified framework](https://arxiv.org/pdf/2210.17000.pdf) From fb175edff44ac9f53b98a843d08c55c26da6bcce Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 29 Dec 2023 09:52:41 +0100 Subject: [PATCH 3/9] operatorname to text --- lasso.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lasso.md b/lasso.md index ae6526b9..8541271a 100644 --- a/lasso.md +++ b/lasso.md @@ -23,11 +23,11 @@ Our starting point for the update equation will be from the ESMDA paper. The ESMDA update equation, with inflation factor $\alpha=1$, is -$$X_\text{posterior} = X + \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y),$$ +$$X_\text{posterior} = X + \text{cov}(X, Y) (\text{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y),$$ -where the empirical cross covariance matrix $\operatorname{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$, and $c(X)$ centers each row in X by subtracting the mean. +where the empirical cross covariance matrix $\text{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$, and $c(X)$ centers each row in X by subtracting the mean. -The term $K := \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain matrix. +The term $K := \text{cov}(X, Y) (\text{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain matrix. Notice that in this expression we estimate both the covariance matrix and the cross covariance matrix using few realizations $N$. ## The Kalman gain @@ -85,9 +85,9 @@ $$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1} which suggests another possible way to compute $K$: 1. Estimate $\hat{H}$ using Lasso or similar. -2. Estimate $\Sigma_{x}$ as $\operatorname{cov}(X, X)$. +2. Estimate $\Sigma_{x}$ as $\text{cov}(X, X)$. -This is likely unfeasible since $\operatorname{cov}(X, X)$ becomes huge. +This is likely unfeasible since $\text{cov}(X, X)$ becomes huge. 
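+
+For a small problem, a minimal sketch of this direct route looks as follows
+(mirroring `solve_lasso_direct` in `docs/source/Lasso.py`; it assumes the
+variables-by-ensemble signature of the later `linear_l1_regression`, and that
+`covariance` holds the diagonal of $\Sigma_\epsilon$ as in that notebook):
+
+```python
+import numpy as np
+import scipy as sp
+from iterative_ensemble_smoother.esmda_inversion import empirical_cross_covariance
+
+H = linear_l1_regression(X, Y)  # Lasso estimate of the linearized forward model
+cov_XX = empirical_cross_covariance(X, X)  # shape (n, n), huge for large n
+K = cov_XX @ H.T @ sp.linalg.inv(H @ cov_XX @ H.T + np.diag(covariance))
+```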
## Comment: Covariances with linear forward model @@ -95,11 +95,11 @@ Here we show two facts that can be proven using the definition of covariance, se Assume that $y = h(x) = H x + c$, then -$$\operatorname{cov}(x, y ) = \operatorname{cov}(x, x) H^T.$$ +$$\text{cov}(x, y ) = \text{cov}(x, x) H^T.$$ Similarity, if $y = h(x) = H x + c$, then -$$\operatorname{cov}(y, y ) = H \operatorname{cov}(x, x) H^T.$$ +$$\text{cov}(y, y ) = H \text{cov}(x, x) H^T.$$ ## References From 51d9659afe1292080ecb6fec7309298fb9c83d89 Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 29 Dec 2023 09:53:51 +0100 Subject: [PATCH 4/9] spacing eqns --- lasso.md | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/lasso.md b/lasso.md index 8541271a..3407963f 100644 --- a/lasso.md +++ b/lasso.md @@ -37,11 +37,9 @@ We will now transform the Kalman gain matrix into a linear regression. We switch notations and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume that each row (variable) has been centered by subtracting the mean. Writing out the empirical covariances, the Kalman gain becomes -$$ -K = \frac{X Y^T}{N - 1} \left( +$$K = \frac{X Y^T}{N - 1} \left( \frac{Y Y^T}{N - 1} + \Sigma_\epsilon -\right)^{-1}. -$$ +\right)^{-1}.$$ ### Adding sampled noise @@ -51,25 +49,23 @@ Notice that the observation covariance matrix is typically diagonal, has shape $ Replacing $\Sigma_\epsilon$ with $S S^T / (N - 1)$, we can approximate $K$ as -$$ -K \approx X Y^T \left( +$$K \approx X Y^T \left( Y Y^T + S S^T -\right)^{-1}. -$$ +\right)^{-1}.$$ If we assume that $YS^T \approx SY^T \approx 0$, then the middle terms in $(Y +S)( Y + S)^T = Y Y^T + Y S^T + SY^T + SS^T$ vanish. This is reasonable in the limit of large $N$ since the rows in $S$ are uncorrelated with the rows in $Y$. Define $Y_\text{noisy} := Y + S$, then -$$ -K \approx X Y^T \left( + +$$K \approx X Y^T \left( Y_\text{noisy} Y_\text{noisy}^T -\right)^{+}. -$$ +\right)^{+}.$$ + Finally, assume that $X S^T \approx 0$ so that $Y \approx Y_\text{noisy}$, then -$$ -K = X Y_\text{noisy}^{+} -$$ + +$$K = X Y_\text{noisy}^{+}$$ + and we solve a least squares problem $Y_\text{noisy}^T K^T = X^T$ for $K$. If we use Ridge or Lasso, the solution decomposes since in general $A[x_1 | x_2] = [Ax_1 | Ax_2] = [y_1 | y_2]$ and we can consider each row of $X$ independently. We can find $K$ by solving a Ridge or Lasso problem. @@ -81,7 +77,9 @@ Solving $Y^T K^T = X^T$ with Ridge produces the Normal equations that define $K$ ## Comment: Another Lasso approach The Kalman gain can also be written as + $$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1}$$ + which suggests another possible way to compute $K$: 1. Estimate $\hat{H}$ using Lasso or similar. 
From c839e161786f918aa4589fde794555e11e140e2b Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 29 Dec 2023 14:19:36 +0100 Subject: [PATCH 5/9] more on lasso --- docs/source/Lasso.py | 32 +++++++++++++++++++++++++++++--- lasso.md | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/docs/source/Lasso.py b/docs/source/Lasso.py index 75a911c3..b09173c4 100644 --- a/docs/source/Lasso.py +++ b/docs/source/Lasso.py @@ -65,7 +65,7 @@ alpha = 1 # Effect size (coefficients in sparse mapping A) -effect_size = 5 +effect_size = 1000 # %% [markdown] # ## Create problem data - sparse tridiagonal matrix $A$ @@ -291,7 +291,8 @@ def solve_projected(X, Y, covariance, observations, seed=1): cov_YY = empirical_cross_covariance(Y, Y) # Compute the update - K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / (N - 1)) + # TODO: N-1 vs N here. + K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / (N-1)) return X + K @ (D - Y) @@ -395,7 +396,7 @@ def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): # %% -plt.figure(figsize=(7, 3)) +plt.figure(figsize=(8, 4.5)) for function, label in zip( [ @@ -408,6 +409,7 @@ def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): ], [ "ESMDA", + "projected", "lstsq", "Lasso (direct)", "Lasso", @@ -458,3 +460,27 @@ def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): plt.show() # %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% diff --git a/lasso.md b/lasso.md index 3407963f..82f416f2 100644 --- a/lasso.md +++ b/lasso.md @@ -1,6 +1,6 @@ # Lasso -Notes on the Lasso approach for +Notes on the Lasso approach for regularizing the forward model. ## Notation and preliminaries @@ -43,9 +43,20 @@ $$K = \frac{X Y^T}{N - 1} \left( ### Adding sampled noise -Let $S \sim \mathcal{N}(0, \Sigma_\epsilon)$. -We can write $S S^T / (N - 1) \approx \Sigma_\epsilon$, which is a low rank approximation to the observation covariance matrix. -Notice that the observation covariance matrix is typically diagonal, has shape $m \times m$, whereas $S S^T$ has rank $N \ll m$. +Let $R \sim \mathcal{N}(0, \Sigma_\epsilon)$. +If we assume the mean is known, Bessel's correction is not needed and we can write $R R^T / N \approx \Sigma_\epsilon$. +We sample from $\mathcal{N}(0, \Sigma_\epsilon)$ then estimate the covariance $\Sigma_\epsilon$ using the empirical covariance. + +Define $S = R \sqrt{(N-1) / N}$, then + +$$\Sigma_\epsilon \approx +\frac{1}{N} R R^T += +\frac{1}{N-1} \frac{N-1}{N} R R^T += \frac{S S^T}{N-1} +$$ +Notice that this is a low rank approximation to the observation covariance matrix. +The observation covariance matrix $\Sigma_\epsilon$ is typically diagonal, has shape $m \times m$, whereas $S S^T$ has rank $N \ll m$. Replacing $\Sigma_\epsilon$ with $S S^T / (N - 1)$, we can approximate $K$ as @@ -67,14 +78,31 @@ Finally, assume that $X S^T \approx 0$ so that $Y \approx Y_\text{noisy}$, then $$K = X Y_\text{noisy}^{+}$$ and we solve a least squares problem $Y_\text{noisy}^T K^T = X^T$ for $K$. -If we use Ridge or Lasso, the solution decomposes since in general $A[x_1 | x_2] = [Ax_1 | Ax_2] = [y_1 | y_2]$ and we can consider each row of $X$ independently. +If we use Ridge or Lasso, the solution decomposes and we can consider each row of $X$ independently. +This is because in the equation $Y_\text{noisy}^T K^T = X^T$ the matrix $K^T$ acts on each row vector in $Y_\text{noisy}^T$ independently to produce each row vector in $X^T$. 
We can find $K$ by solving a Ridge or Lasso problem. +### Comment: Kalman gain and the forward model + +Regularizing $K$ using e.g. Ridge or Lasso makes sense if we believe that forward model is localized, in the sense that each output is determined primarily by a few inputs (sparsity). + +A rough sketch of why $K \sim \mathbb{E} \nabla h^{-1}$ + +Since $K \approx X Y^{+}$ holds for centered matrices, if we change notation to uncentered matrices we have +$$K \approx \frac{X - \mathbb{E} X}{Y - \mathbb{E} Y}$$ +rearranging we see that this is a second order Taylor expansion. + +Therefore it makes sense to regularize $K$ if we believe in a sparse forward model. + ### Comment: Ridge and the Kalman gain Solving $Y^T K^T = X^T$ with Ridge produces the Normal equations that define $K$. +To be concrete, minimizing $\lVert Y^T K^T - X^T \rVert_2^2 + \alpha \lVert D K \rVert_2^2$ with respect to $K$ yields the equation +$$\left( Y Y^T + \alpha D D^T\right) K = X Y^T.$$ +This is, up to scaling, exactly the equation for the kalman gain $K$ that we started with from the ESMDA paper. +See equations (119) and (132) in the matrix cookbook for the matrix calculus needed to differentiate the loss function. -## Comment: Another Lasso approach +### Comment: Another Lasso approach The Kalman gain can also be written as @@ -87,7 +115,7 @@ which suggests another possible way to compute $K$: This is likely unfeasible since $\text{cov}(X, X)$ becomes huge. -## Comment: Covariances with linear forward model +### Comment: Covariances with linear forward model Here we show two facts that can be proven using the definition of covariance, see e.g. sections 6.2.2 and 8.2.1 in [The Matrix Cookbook](https://www.math.uwaterloo.ca/~hwolkowi/matrixcookbook.pdf). From c442d9336b7ae2fa3c9acd159d596218045b99d4 Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 5 Jan 2024 09:29:44 +0100 Subject: [PATCH 6/9] added localized.py --- src/iterative_ensemble_smoother/localized.py | 268 +++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 src/iterative_ensemble_smoother/localized.py diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py new file mode 100644 index 00000000..8eb8fb95 --- /dev/null +++ b/src/iterative_ensemble_smoother/localized.py @@ -0,0 +1,268 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jan 5 09:27:34 2024 + +@author: TODL +""" + +from iterative_ensemble_smoother.esmda import BaseESMDA +import numbers +from abc import ABC +from typing import Union + +import numpy as np +import numpy.typing as npt +import scipy as sp # type: ignore + +from iterative_ensemble_smoother.esmda_inversion import ( + inversion_exact_cholesky, + inversion_subspace, + normalize_alpha, +) +from iterative_ensemble_smoother.utils import sample_mvnormal + + +class LassoESMDA(BaseESMDA): + """ + Implement an Ensemble Smoother with Multiple Data Assimilation (ES-MDA). + + The implementation follows :cite:t:`EMERICK2013`. + + Parameters + ---------- + covariance : np.ndarray + Either a 1D array of diagonal covariances, or a 2D covariance matrix. + The shape is either (num_observations,) or (num_observations, num_observations). + This is C_D in Emerick (2013), and represents observation or measurement + errors. We observe d from the real world, y from the model g(x), and + assume that d = y + e, where the error e is multivariate normal with + covariance given by `covariance`. 
+ observations : np.ndarray + 1D array of shape (num_observations,) representing real-world observations. + This is d_obs in Emerick (2013). + alpha : int or 1D np.ndarray, optional + Multiplicative factor for the covariance. + If an integer `alpha` is given, an array with length `alpha` and + elements `alpha` is constructed. If an 1D array is given, it is + normalized so sum_i 1/alpha_i = 1 and used. The default is 5, which + corresponds to np.array([5, 5, 5, 5, 5]). + seed : integer or numpy.random._generator.Generator, optional + A seed or numpy.random._generator.Generator used for random number + generation. The argument is passed to numpy.random.default_rng(). + The default is None. + inversion : str, optional + Which inversion method to use. The default is "exact". + See the dictionary ESMDA._inversion_methods for more information. + + Examples + -------- + >>> covariance = np.diag([1, 1, 1]) + >>> observations = np.array([1, 2, 3]) + >>> esmda = ESMDA(covariance, observations) + + """ + + # Available inversion methods. The inversion methods all compute + # C_MD @ (C_DD + alpha * C_D)^(-1) @ (D - Y) + _inversion_methods = { + "exact": inversion_exact_cholesky, + "subspace": inversion_subspace, + } + + def __init__( + self, + covariance: npt.NDArray[np.double], + observations: npt.NDArray[np.double], + alpha: Union[int, npt.NDArray[np.double]] = 5, + seed: Union[np.random._generator.Generator, int, None] = None, + inversion: str = "exact", + ) -> None: + """Initialize the instance.""" + + super().__init__(covariance=covariance, observations=observations, seed=seed) + + if not ( + (isinstance(alpha, np.ndarray) and alpha.ndim == 1) + or isinstance(alpha, numbers.Integral) + ): + raise TypeError("Argument `alpha` must be an integer or a 1D NumPy array.") + + if not isinstance(inversion, str): + raise TypeError( + "Argument `inversion` must be a string in " + f"{tuple(self._inversion_methods.keys())}, but got {inversion}" + ) + if inversion not in self._inversion_methods.keys(): + raise ValueError( + "Argument `inversion` must be a string in " + f"{tuple(self._inversion_methods.keys())}, but got {inversion}" + ) + + # Store data + self.inversion = inversion + + # Alpha can either be an integer (num iterations) or a list of weights + if isinstance(alpha, np.ndarray) and alpha.ndim == 1: + self.alpha = normalize_alpha(alpha) + elif isinstance(alpha, numbers.Integral): + self.alpha = normalize_alpha(np.ones(alpha)) + assert np.allclose(self.alpha, normalize_alpha(self.alpha)) + else: + raise TypeError("Alpha must be integer or 1D array.") + + def num_assimilations(self) -> int: + return len(self.alpha) + + def assimilate( + self, + X: npt.NDArray[np.double], + Y: npt.NDArray[np.double], + *, + overwrite: bool = False, + truncation: float = 1.0, + ) -> npt.NDArray[np.double]: + """Assimilate data and return an updated ensemble X_posterior. + + X_posterior = smoother.assimilate(X, Y) + + Parameters + ---------- + X : np.ndarray + A 2D array of shape (num_parameters, ensemble_size). Each row + corresponds to a parameter in the model, and each column corresponds + to an ensemble member (realization). + Y : np.ndarray + 2D array of shape (num_observations, ensemble_size), containing + responses when evaluating the model at X. In other words, Y = g(X), + where g is the forward model. + overwrite : bool + If True, then arguments X and Y may be overwritten and mutated. + If False, then the method will not mutate inputs in any way. + Setting this to True might save memory. 
+ truncation : float + How large a fraction of the singular values to keep in the inversion + routine. Must be a float in the range (0, 1]. A lower number means + a more approximate answer and a slightly faster computation. + + Returns + ------- + X_posterior : np.ndarray + 2D array of shape (num_parameters, ensemble_size). + + """ + if self.iteration >= self.num_assimilations(): + raise Exception("No more assimilation steps to run.") + + # Verify shapes + _, num_ensemble = X.shape + num_outputs, num_emsemble2 = Y.shape + assert ( + num_ensemble == num_emsemble2 + ), "Number of ensemble members in X and Y must match" + if not np.issubdtype(X.dtype, np.floating): + raise TypeError("Argument `X` must contain floats") + if not np.issubdtype(Y.dtype, np.floating): + raise TypeError("Argument `Y` must contain floats") + + assert 0 < truncation <= 1.0 + + # Do not overwrite input arguments + if not overwrite: + X, Y = np.copy(X), np.copy(Y) + + # Line 2 (b) in the description of ES-MDA in the 2013 Emerick paper + + # Draw samples from zero-centered multivariate normal with cov=alpha * C_D, + # and add them to the observations. Notice that + # if C_D = L L.T by the cholesky factorization, then drawing y from + # a zero cented normal means that y := L @ z, where z ~ norm(0, 1) + # Therefore, scaling C_D by alpha is equivalent to scaling L with sqrt(alpha) + D = self.perturb_observations( + ensemble_size=num_ensemble, alpha=self.alpha[self.iteration] + ) + assert D.shape == (num_outputs, num_ensemble) + + # Line 2 (c) in the description of ES-MDA in the 2013 Emerick paper + # Choose inversion method, e.g. 'exact'. The inversion method computes + # C_MD @ sp.linalg.inv(C_DD + C_D_alpha) @ (D - Y) + inversion_func = self._inversion_methods[self.inversion] + + # Update and return + X += inversion_func( + alpha=self.alpha[self.iteration], + C_D=self.C_D, + D=D, + Y=Y, + X=X, + truncation=truncation, + ) + + self.iteration += 1 + return X + + def compute_transition_matrix( + self, + Y: npt.NDArray[np.double], + *, + alpha: float, + truncation: float = 1.0, + ) -> npt.NDArray[np.double]: + """Return a matrix T such that X_posterior = X_prior + X_prior @ T. + + The purpose of this method is to facilitate row-by-row, or batch-by-batch, + updates of X. This is useful if X is too large to fit in memory. + + Parameters + ---------- + Y : np.ndarray + 2D array of shape (num_observations, ensemble_size), containing + responses when evaluating the model at X. In other words, Y = g(X), + where g is the forward model. + alpha : float + The covariance inflation factor. The sequence of alphas should + obey the equation sum_i (1/alpha_i) = 1. However, this is NOT enforced + in this method call. The user/caller is responsible for this. + truncation : float + How large a fraction of the singular values to keep in the inversion + routine. Must be a float in the range (0, 1]. A lower number means + a more approximate answer and a slightly faster computation. + + Returns + ------- + T : np.ndarray + A matrix T such that X_posterior = X_prior + X_prior @ T. + It has shape (num_ensemble_members, num_ensemble_members). 
+ """ + + # Recall the update equation: + # X += C_MD @ (C_DD + alpha * C_D)^(-1) @ (D - Y) + # X += X @ center(Y).T / (N-1) @ (C_DD + alpha * C_D)^(-1) @ (D - Y) + # We form T := center(Y).T / (N-1) @ (C_DD + alpha * C_D)^(-1) @ (D - Y), + # so that + # X_new = X_old + X_old @ T + # or + # X += X @ T + + D = self.perturb_observations(ensemble_size=Y.shape[1], alpha=alpha) + inversion_func = self._inversion_methods[self.inversion] + return inversion_func( + alpha=alpha, + C_D=self.C_D, + D=D, + Y=Y, + X=None, # We don't need X to compute the factor T + truncation=truncation, + return_T=True, # Ensures that we don't need X + ) + + +if __name__ == "__main__": + import pytest + + pytest.main( + args=[ + __file__, + "--doctest-modules", + "-v", + ] + ) From 86121b2f1eb5e9787ca8c6920a39970c60e784bc Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 5 Jan 2024 11:22:16 +0100 Subject: [PATCH 7/9] draft of lasso --- src/iterative_ensemble_smoother/localized.py | 281 +++++++------------ 1 file changed, 103 insertions(+), 178 deletions(-) diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py index 8eb8fb95..1b267c95 100644 --- a/src/iterative_ensemble_smoother/localized.py +++ b/src/iterative_ensemble_smoother/localized.py @@ -22,7 +22,99 @@ from iterative_ensemble_smoother.utils import sample_mvnormal -class LassoESMDA(BaseESMDA): +import numpy as np +from sklearn.linear_model import LassoCV +from sklearn.linear_model import MultiTaskLassoCV + + +def linear_l1_regression(X, Y): + """ + Performs LASSO regression for each response in Y against predictors in U, + constructing a sparse matrix of regression coefficients. + + The function scales features in U using standard scaling before applying + LASSO, then re-scales the coefficients to the original scale of U. This + extracts the effect of each feature in U on each response in Y, ignoring + intercepts and constant terms. + + Parameters + ---------- + U : np.ndarray + 2D array of predictors with shape (num_variables, ensemble_size). + Y : np.ndarray + 2D array of responses with shape (num_variables, ensemble_size). + + + Examples + -------- + >>> K = np.array([[-0.2, -0.9], + ... [-1.5, 0.2], + ... [-0.2, 0.1], + ... [ 0.8, -0.4]]) + >>> X = np.array([[ 0.8, 2. , -0.8], + ... [ 0.4, 1.7, -0.3]]) + >>> Y = K @ X + >>> K_est = linear_l1_regression(X, Y) + >>> np.allclose(K, K_est, atol=0.005) + True + + """ + # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py + + # The goal is to solve K @ X = Y for K, or equivalently X.T @ K.T = Y.T + num_parameters, ensemble_size1 = X.shape + num_observations, ensemble_size2 = Y.shape + + assert ensemble_size1 == ensemble_size2, "Number of cols in X and Y must be equal" + + # Scale to unit standard deviation. 
We do this because of numerics in Lasso + stds_X = np.std(X, axis=1) + stds_Y = np.std(Y, axis=1) + X = X / stds_X[:, np.newaxis] + Y = Y / stds_Y[:, np.newaxis] + + # Loop over features + coefs = [] + cv = min(5, ensemble_size1) + alphas = np.logspace(-8, 8, num=17) + for j in range(num_observations): + y_j = Y[j, :] + + # Learn individual regularization and fit + model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=cv) + model_cv.fit(X.T, y_j) # model_cv.alpha_ + + # Alphas for the next iteration + # alpha = model_cv.alpha_ * 0.5 * alphas[len(alphas)//2] * 0.5 + # alphas = np.logspace(-2, 2, num=5) * model_cv.alpha_ + + # Scale back the coefficients + coef_scale = stds_Y[j] / stds_X + coefs.append(model_cv.coef_ * coef_scale) + + K = np.vstack(coefs) + assert K.shape == (num_observations, num_parameters) + return K + + # ======================================================= + + # Alternative computation using MultiTaskLasso + + # Create a lasso model + lasso = MultiTaskLassoCV(fit_intercept=False, cv=cv) + lasso.fit(X.T, Y.T) + + # Get the matrix K + K = lasso.coef_ + assert K.shape == (num_observations, num_parameters) + + K = K / stds_X[np.newaxis, :] + K = K * stds_Y[:, np.newaxis] + + return K + + +class LassoES(BaseESMDA): """ Implement an Ensemble Smoother with Multiple Data Assimilation (ES-MDA). @@ -40,119 +132,30 @@ class LassoESMDA(BaseESMDA): observations : np.ndarray 1D array of shape (num_observations,) representing real-world observations. This is d_obs in Emerick (2013). - alpha : int or 1D np.ndarray, optional - Multiplicative factor for the covariance. - If an integer `alpha` is given, an array with length `alpha` and - elements `alpha` is constructed. If an 1D array is given, it is - normalized so sum_i 1/alpha_i = 1 and used. The default is 5, which - corresponds to np.array([5, 5, 5, 5, 5]). seed : integer or numpy.random._generator.Generator, optional A seed or numpy.random._generator.Generator used for random number generation. The argument is passed to numpy.random.default_rng(). The default is None. - inversion : str, optional - Which inversion method to use. The default is "exact". - See the dictionary ESMDA._inversion_methods for more information. - - Examples - -------- - >>> covariance = np.diag([1, 1, 1]) - >>> observations = np.array([1, 2, 3]) - >>> esmda = ESMDA(covariance, observations) """ - # Available inversion methods. 
The inversion methods all compute - # C_MD @ (C_DD + alpha * C_D)^(-1) @ (D - Y) - _inversion_methods = { - "exact": inversion_exact_cholesky, - "subspace": inversion_subspace, - } - def __init__( self, covariance: npt.NDArray[np.double], observations: npt.NDArray[np.double], - alpha: Union[int, npt.NDArray[np.double]] = 5, seed: Union[np.random._generator.Generator, int, None] = None, - inversion: str = "exact", ) -> None: """Initialize the instance.""" super().__init__(covariance=covariance, observations=observations, seed=seed) - if not ( - (isinstance(alpha, np.ndarray) and alpha.ndim == 1) - or isinstance(alpha, numbers.Integral) - ): - raise TypeError("Argument `alpha` must be an integer or a 1D NumPy array.") - - if not isinstance(inversion, str): - raise TypeError( - "Argument `inversion` must be a string in " - f"{tuple(self._inversion_methods.keys())}, but got {inversion}" - ) - if inversion not in self._inversion_methods.keys(): - raise ValueError( - "Argument `inversion` must be a string in " - f"{tuple(self._inversion_methods.keys())}, but got {inversion}" - ) - - # Store data - self.inversion = inversion - - # Alpha can either be an integer (num iterations) or a list of weights - if isinstance(alpha, np.ndarray) and alpha.ndim == 1: - self.alpha = normalize_alpha(alpha) - elif isinstance(alpha, numbers.Integral): - self.alpha = normalize_alpha(np.ones(alpha)) - assert np.allclose(self.alpha, normalize_alpha(self.alpha)) - else: - raise TypeError("Alpha must be integer or 1D array.") - - def num_assimilations(self) -> int: - return len(self.alpha) - def assimilate( self, X: npt.NDArray[np.double], Y: npt.NDArray[np.double], *, - overwrite: bool = False, - truncation: float = 1.0, + alpha: float = 1.0, ) -> npt.NDArray[np.double]: - """Assimilate data and return an updated ensemble X_posterior. - - X_posterior = smoother.assimilate(X, Y) - - Parameters - ---------- - X : np.ndarray - A 2D array of shape (num_parameters, ensemble_size). Each row - corresponds to a parameter in the model, and each column corresponds - to an ensemble member (realization). - Y : np.ndarray - 2D array of shape (num_observations, ensemble_size), containing - responses when evaluating the model at X. In other words, Y = g(X), - where g is the forward model. - overwrite : bool - If True, then arguments X and Y may be overwritten and mutated. - If False, then the method will not mutate inputs in any way. - Setting this to True might save memory. - truncation : float - How large a fraction of the singular values to keep in the inversion - routine. Must be a float in the range (0, 1]. A lower number means - a more approximate answer and a slightly faster computation. - - Returns - ------- - X_posterior : np.ndarray - 2D array of shape (num_parameters, ensemble_size). - - """ - if self.iteration >= self.num_assimilations(): - raise Exception("No more assimilation steps to run.") - # Verify shapes _, num_ensemble = X.shape num_outputs, num_emsemble2 = Y.shape @@ -164,96 +167,18 @@ def assimilate( if not np.issubdtype(Y.dtype, np.floating): raise TypeError("Argument `Y` must contain floats") - assert 0 < truncation <= 1.0 - - # Do not overwrite input arguments - if not overwrite: - X, Y = np.copy(X), np.copy(Y) - - # Line 2 (b) in the description of ES-MDA in the 2013 Emerick paper - - # Draw samples from zero-centered multivariate normal with cov=alpha * C_D, - # and add them to the observations. 
Notice that - # if C_D = L L.T by the cholesky factorization, then drawing y from - # a zero cented normal means that y := L @ z, where z ~ norm(0, 1) - # Therefore, scaling C_D by alpha is equivalent to scaling L with sqrt(alpha) - D = self.perturb_observations( - ensemble_size=num_ensemble, alpha=self.alpha[self.iteration] - ) - assert D.shape == (num_outputs, num_ensemble) - - # Line 2 (c) in the description of ES-MDA in the 2013 Emerick paper - # Choose inversion method, e.g. 'exact'. The inversion method computes - # C_MD @ sp.linalg.inv(C_DD + C_D_alpha) @ (D - Y) - inversion_func = self._inversion_methods[self.inversion] - - # Update and return - X += inversion_func( - alpha=self.alpha[self.iteration], - C_D=self.C_D, - D=D, - Y=Y, - X=X, - truncation=truncation, + # Center X and Y + X_center = X - np.mean(X, axis=1, keepdims=True) + Y_center = Y - np.mean(Y, axis=1, keepdims=True) + Y_noise = np.sqrt(alpha) * sample_mvnormal( + C_dd_cholesky=self.C_D_L, rng=self.rng, size=num_ensemble ) - self.iteration += 1 - return X + # Compute regularized Kalman gain matrix + K = linear_l1_regression(X=Y_center + Y_noise, Y=X_center) - def compute_transition_matrix( - self, - Y: npt.NDArray[np.double], - *, - alpha: float, - truncation: float = 1.0, - ) -> npt.NDArray[np.double]: - """Return a matrix T such that X_posterior = X_prior + X_prior @ T. - - The purpose of this method is to facilitate row-by-row, or batch-by-batch, - updates of X. This is useful if X is too large to fit in memory. - - Parameters - ---------- - Y : np.ndarray - 2D array of shape (num_observations, ensemble_size), containing - responses when evaluating the model at X. In other words, Y = g(X), - where g is the forward model. - alpha : float - The covariance inflation factor. The sequence of alphas should - obey the equation sum_i (1/alpha_i) = 1. However, this is NOT enforced - in this method call. The user/caller is responsible for this. - truncation : float - How large a fraction of the singular values to keep in the inversion - routine. Must be a float in the range (0, 1]. A lower number means - a more approximate answer and a slightly faster computation. - - Returns - ------- - T : np.ndarray - A matrix T such that X_posterior = X_prior + X_prior @ T. - It has shape (num_ensemble_members, num_ensemble_members). 
- """ - - # Recall the update equation: - # X += C_MD @ (C_DD + alpha * C_D)^(-1) @ (D - Y) - # X += X @ center(Y).T / (N-1) @ (C_DD + alpha * C_D)^(-1) @ (D - Y) - # We form T := center(Y).T / (N-1) @ (C_DD + alpha * C_D)^(-1) @ (D - Y), - # so that - # X_new = X_old + X_old @ T - # or - # X += X @ T - - D = self.perturb_observations(ensemble_size=Y.shape[1], alpha=alpha) - inversion_func = self._inversion_methods[self.inversion] - return inversion_func( - alpha=alpha, - C_D=self.C_D, - D=D, - Y=Y, - X=None, # We don't need X to compute the factor T - truncation=truncation, - return_T=True, # Ensures that we don't need X - ) + D = self.perturb_observations(ensemble_size=num_ensemble, alpha=alpha) + return X + K @ (D - Y) if __name__ == "__main__": From 01f761cba1016cbffe4bf7fb87a7282264c70b7b Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 5 Jan 2024 11:31:05 +0100 Subject: [PATCH 8/9] comments --- src/iterative_ensemble_smoother/localized.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py index 1b267c95..e5767533 100644 --- a/src/iterative_ensemble_smoother/localized.py +++ b/src/iterative_ensemble_smoother/localized.py @@ -170,12 +170,23 @@ def assimilate( # Center X and Y X_center = X - np.mean(X, axis=1, keepdims=True) Y_center = Y - np.mean(Y, axis=1, keepdims=True) - Y_noise = np.sqrt(alpha) * sample_mvnormal( + + # We have R @ R.T = C_D + R = np.sqrt(alpha) * sample_mvnormal( C_dd_cholesky=self.C_D_L, rng=self.rng, size=num_ensemble ) - - # Compute regularized Kalman gain matrix - K = linear_l1_regression(X=Y_center + Y_noise, Y=X_center) + + # L = sqrt((N - 1) / N) + L = np.sqrt((num_ensemble - 1)/ num_ensemble) * R + + # Compute regularized Kalman gain matrix matrix by solving + # (Y + L) K = X + # The definition of K is + # (cov(Y, Y) + covariance) K = cov(X, Y), which is approximately + # (Y @ Y.T + L @ L.T) K = X @ Y.T, [X and Y are centered] which is approximately + # (Y + L) @ (Y + L).T @ K = X @ Y.T, which is approximately + # (Y + L) @ K = X + K = linear_l1_regression(X=Y_center + L, Y=X_center) D = self.perturb_observations(ensemble_size=num_ensemble, alpha=alpha) return X + K @ (D - Y) From c54603772f67e8911d249c33678a2d487eca6109 Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 5 Jan 2024 13:57:06 +0100 Subject: [PATCH 9/9] finished initial lasso implementation --- docs/source/Lasso.py | 129 ++++--------------- lasso.md | 2 +- src/iterative_ensemble_smoother/localized.py | 69 +++------- 3 files changed, 46 insertions(+), 154 deletions(-) diff --git a/docs/source/Lasso.py b/docs/source/Lasso.py index b09173c4..7a02df29 100644 --- a/docs/source/Lasso.py +++ b/docs/source/Lasso.py @@ -56,7 +56,7 @@ rng = np.random.default_rng(42) # Dimensionality of the problem -num_parameters = 100 +num_parameters = 75 num_observations = 50 num_ensemble = 25 prior_std = 1 @@ -65,13 +65,13 @@ alpha = 1 # Effect size (coefficients in sparse mapping A) -effect_size = 1000 +effect_size = 1 # %% [markdown] # ## Create problem data - sparse tridiagonal matrix $A$ # %% -diagonal = effect_size * np.ones(min(num_parameters, num_observations)) +diagonal = effect_size * np.ones(max(num_parameters, num_observations)) # Create a tridiagonal matrix (easiest with scipy) A = sp.sparse.diags( @@ -174,71 +174,11 @@ def g(X): from sklearn.linear_model import LassoCV from sklearn.preprocessing import StandardScaler - -def linear_l1_regression(U, Y): - """ - Performs 
LASSO regression for each response in Y against predictors in U, - constructing a sparse matrix of regression coefficients. - - The function scales features in U using standard scaling before applying - LASSO, then re-scales the coefficients to the original scale of U. This - extracts the effect of each feature in U on each response in Y, ignoring - intercepts and constant terms. - - Parameters - ---------- - U : np.ndarray - 2D array of predictors with shape (n, p). - Y : np.ndarray - 2D array of responses with shape (n, m). - - Returns - ------- - H_sparse : scipy.sparse.csc_matrix - Sparse matrix (m, p) with re-scaled LASSO regression coefficients for - each response in Y. - - Raises - ------ - AssertionError - If the number of samples in U and Y do not match, or if the shape of - H_sparse is not (m, p). - """ - # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py - - n, p = U.shape # p: number of features - n_y, m = Y.shape # m: number of y responses - - # Assert that the first dimension of U and Y are the same - assert n == n_y, "Number of samples in U and Y must be the same" - - scaler_u = StandardScaler() - U_scaled = scaler_u.fit_transform(U) - - scaler_y = StandardScaler() - Y_scaled = scaler_y.fit_transform(Y) - - # Loop over features - coefs = [] - for j in range(m): - y_j = Y_scaled[:, j] - - # Learn individual regularization and fit - alphas = np.logspace(-8, 8, num=32) - model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=5) - model_cv.fit(U_scaled, y_j) - - coef_scale = scaler_y.scale_[j] / scaler_u.scale_ - coefs.append(model_cv.coef_ * coef_scale) - - K = np.vstack(coefs) - assert K.shape == (m, p) - - return K +from iterative_ensemble_smoother.localized import linear_l1_regression Y = g(X) -K = linear_l1_regression(X.T, Y.T) +K = linear_l1_regression(X, Y) plt.imshow(K) plt.show() @@ -262,6 +202,7 @@ def solve_esmda(X, Y, covariance, observations, seed=1): cov_YY = empirical_cross_covariance(Y, Y) # Compute the update using the exact ESMDA approach + assert covariance.ndim == 1 K = cov_XY @ sp.linalg.inv(cov_YY + np.diag(covariance)) return X + K @ (D - Y) @@ -290,9 +231,10 @@ def solve_projected(X, Y, covariance, observations, seed=1): cov_XY = empirical_cross_covariance(X, Y) cov_YY = empirical_cross_covariance(Y, Y) - # Compute the update - # TODO: N-1 vs N here. - K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / (N-1)) + # covariance ~ S @ S.T / N + + # Compute the update. 
Divide by N since we know the mean is zero + K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / N) return X + K @ (D - Y) @@ -310,13 +252,21 @@ def solve_lstsq(X, Y, covariance, observations, seed=1): N = X.shape[1] D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + # Center X and Y + X_center = X - np.mean(X, axis=1, keepdims=True) + Y_center = Y - np.mean(Y, axis=1, keepdims=True) + # Instead of using the covariance matrix, sample from it + # covariance / (N-1) ~ + + N = X.shape[1] S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + S = S * np.sqrt((N - 1) / N) # Divide by N since we know that the mean is zero - Y_noisy = Y + S + Y_noisy = Y_center + S # Compute the update by solving K @ Y = X - K, *_ = sp.linalg.lstsq(Y_noisy.T, X.T) + K, *_ = sp.linalg.lstsq(Y_noisy.T, X_center.T) K = K.T return X + K @ (D - Y) @@ -338,7 +288,7 @@ def solve_lasso_direct(X, Y, covariance, observations, seed=1): D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) # Approximate forward model with H - H = linear_l1_regression(X.T, Y.T) + H = linear_l1_regression(X, Y) cov_XX = empirical_cross_covariance(X, X) # Compute the update @@ -349,25 +299,16 @@ def solve_lasso_direct(X, Y, covariance, observations, seed=1): def solve_lasso(X, Y, covariance, observations, seed=1): """Solve K @ Y = X using Lasso.""" + from iterative_ensemble_smoother.localized import LassoES + # Create a smoother to get D - smoother = ESMDA( + smoother = LassoES( covariance=covariance, observations=observations, - alpha=1, seed=seed, ) - N = X.shape[1] - D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) - # Instead of using the covariance matrix, sample from it - S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) - - Y_noisy = Y + S # Seems to work better with Y instead of Y_noisy - - # Compute the update - K = linear_l1_regression(Y_noisy.T, X.T) - - return X + K @ (D - Y) + return smoother.assimilate(X, Y) def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): @@ -464,23 +405,3 @@ def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): # %% # %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% diff --git a/lasso.md b/lasso.md index 82f416f2..0fb1e108 100644 --- a/lasso.md +++ b/lasso.md @@ -132,4 +132,4 @@ $$\text{cov}(y, y ) = H \text{cov}(x, x) H^T.$$ - [Issue 6599 on the Lasso in ERT](https://github.com/equinor/ert/issues/6599) - [Presentation: Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf) - [Paper: Ensemble Smoother with Multiple Data Assimilation](http://dx.doi.org/10.1016/j.cageo.2012.03.011) -- [Paper: Ensemble transport smoothing. Part I: Unified framework](https://arxiv.org/pdf/2210.17000.pdf) +- [Paper: Ensemble transport smoothing. 
Part I: Unified framework](https://arxiv.org/pdf/2210.17000.pdf) \ No newline at end of file diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py index e5767533..6c09d669 100644 --- a/src/iterative_ensemble_smoother/localized.py +++ b/src/iterative_ensemble_smoother/localized.py @@ -1,28 +1,13 @@ # -*- coding: utf-8 -*- -""" -Created on Fri Jan 5 09:27:34 2024 - -@author: TODL -""" from iterative_ensemble_smoother.esmda import BaseESMDA -import numbers -from abc import ABC from typing import Union import numpy as np import numpy.typing as npt -import scipy as sp # type: ignore -from iterative_ensemble_smoother.esmda_inversion import ( - inversion_exact_cholesky, - inversion_subspace, - normalize_alpha, -) from iterative_ensemble_smoother.utils import sample_mvnormal - -import numpy as np from sklearn.linear_model import LassoCV from sklearn.linear_model import MultiTaskLassoCV @@ -60,6 +45,8 @@ def linear_l1_regression(X, Y): """ # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py + # An option is to use MultiTaskLasso too, but not exactly the same objective + # https://scikit-learn.org/stable/modules/linear_model.html#multi-task-lasso # The goal is to solve K @ X = Y for K, or equivalently X.T @ K.T = Y.T num_parameters, ensemble_size1 = X.shape @@ -76,17 +63,16 @@ def linear_l1_regression(X, Y): # Loop over features coefs = [] cv = min(5, ensemble_size1) - alphas = np.logspace(-8, 8, num=17) - for j in range(num_observations): - y_j = Y[j, :] + alphas = np.logspace(-6, 6, num=17) + + # The matrix K in K @ X = Y is built up row by row + for j, y_j in enumerate(Y): # Loop over rows in Y # Learn individual regularization and fit model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=cv) model_cv.fit(X.T, y_j) # model_cv.alpha_ - - # Alphas for the next iteration - # alpha = model_cv.alpha_ * 0.5 * alphas[len(alphas)//2] * 0.5 - # alphas = np.logspace(-2, 2, num=5) * model_cv.alpha_ + # TODO: Consider searching smarter for alphas, e.g. momentum or + # searching within 1-2 orders of magnitude of previous alpha? 
         # Scale back the coefficients
         coef_scale = stds_Y[j] / stds_X
         coefs.append(model_cv.coef_ * coef_scale)
@@ -96,23 +82,6 @@ def linear_l1_regression(X, Y):
     assert K.shape == (num_observations, num_parameters)
     return K
 
-    # =======================================================
-
-    # Alternative computation using MultiTaskLasso
-
-    # Create a lasso model
-    lasso = MultiTaskLassoCV(fit_intercept=False, cv=cv)
-    lasso.fit(X.T, Y.T)
-
-    # Get the matrix K
-    K = lasso.coef_
-    assert K.shape == (num_observations, num_parameters)
-
-    K = K / stds_X[np.newaxis, :]
-    K = K * stds_Y[:, np.newaxis]
-
-    return K
-
 
 class LassoES(BaseESMDA):
     """
@@ -156,6 +125,7 @@ def assimilate(
         *,
         alpha: float = 1.0,
     ) -> npt.NDArray[np.double]:
+        # Verify shapes
         _, num_ensemble = X.shape
         num_outputs, num_emsemble2 = Y.shape
 
@@ -171,22 +141,23 @@ def assimilate(
         # Center X and Y
         X_center = X - np.mean(X, axis=1, keepdims=True)
         Y_center = Y - np.mean(Y, axis=1, keepdims=True)
 
-        # We have R @ R.T = C_D
+        # We have R @ R.T / N ~ C_D
        R = np.sqrt(alpha) * sample_mvnormal(
            C_dd_cholesky=self.C_D_L, rng=self.rng, size=num_ensemble
        )
-        # L = sqrt((N - 1) / N)
-        L = np.sqrt((num_ensemble - 1)/ num_ensemble) * R
+        # S = R * sqrt((N - 1) / N), so that S @ S.T / (N - 1) ~ alpha * covariance
+        S = np.sqrt((num_ensemble - 1) / num_ensemble) * R
 
-        # Compute regularized Kalman gain matrix matrix by solving
-        # (Y + L) K = X
-        # The definition of K is
+        # Compute the regularized Kalman gain matrix by solving:
+        # (Y + S) K = X
+        # The definition of K is:
         # (cov(Y, Y) + covariance) K = cov(X, Y), which is approximately
-        # (Y @ Y.T + L @ L.T) K = X @ Y.T, [X and Y are centered] which is approximately
-        # (Y + L) @ (Y + L).T @ K = X @ Y.T, which is approximately
-        # (Y + L) @ K = X
-        K = linear_l1_regression(X=Y_center + L, Y=X_center)
+        # (Y @ Y.T + S @ S.T) K = X @ Y.T, [X and Y are centered] which is approximately
+        # (Y + S) @ (Y + S).T @ K = X @ Y.T, which is approximately
+        # (Y + S) @ K = X
+        # TODO: Write this out properly.
+        K = linear_l1_regression(X=(Y_center + S), Y=X_center)
 
         D = self.perturb_observations(ensemble_size=num_ensemble, alpha=alpha)
         return X + K @ (D - Y)
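
A minimal usage sketch of the new `LassoES` class, following the way `solve_lasso` in `docs/source/Lasso.py` constructs it. It assumes the `covariance` / `observations` / `seed` arguments inherited from `BaseESMDA` and that a 1D diagonal `covariance` is accepted, as in the notebook; the toy forward model, dimensions, and variable names are invented for illustration, so treat this as a smoke test rather than part of the patches above.

```python
import numpy as np

from iterative_ensemble_smoother.localized import LassoES, linear_l1_regression

rng = np.random.default_rng(42)
num_parameters, num_observations, num_ensemble = 10, 6, 20

# Toy Gauss-linear forward model g(x) = A @ x, applied columnwise to the ensemble
A = rng.standard_normal(size=(num_observations, num_parameters))

X = rng.standard_normal(size=(num_parameters, num_ensemble))  # prior ensemble
Y = A @ X                                                     # response ensemble
covariance = np.ones(num_observations)                        # diagonal of C_D (assumed 1D is accepted)
x_true = rng.standard_normal(size=num_parameters)
observations = A @ x_true + rng.standard_normal(size=num_observations)

# One Lasso-regularized update
smoother = LassoES(covariance=covariance, observations=observations, seed=42)
X_posterior = smoother.assimilate(X, Y)
assert X_posterior.shape == X.shape

# The regularized gain can also be inspected directly: K solves K @ Y ~ X row by row
K = linear_l1_regression(X=Y, Y=X)  # first argument is the predictors, second the targets
assert K.shape == (num_parameters, num_observations)
```

The assertions only check shapes, which is enough to confirm that the class wires together; on a Gauss-linear problem like this one, the posterior ensemble should also move toward the least-squares solution as the ensemble grows.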
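
On the `TODO` about searching smarter for the `LassoCV` regularization strength: one possible refinement, sketched below, is to re-center each row's `alphas` grid on the previous row's selected `alpha_`, so that only the first row pays for the broad 12-decade search. The helper name `fit_rows_with_warm_alpha_grid` is hypothetical, the inputs are assumed to be standardized and processed row by row as in `linear_l1_regression`, and the idea is untested here; it only illustrates the narrower-grid strategy.

```python
import numpy as np
from sklearn.linear_model import LassoCV


def fit_rows_with_warm_alpha_grid(X_scaled, Y_scaled, cv=5):
    """One Lasso per row of Y_scaled (num_observations, N) against X_scaled (num_parameters, N).

    Both inputs are assumed standardized. Returns coefficients with shape
    (num_observations, num_parameters), like the loop in linear_l1_regression.
    """
    coefs = []
    alpha_prev = None
    for y_j in Y_scaled:  # loop over response rows
        if alpha_prev is None:
            # Broad first search, same grid as in the patch
            alphas = np.logspace(-6, 6, num=17)
        else:
            # Narrow band: two orders of magnitude around the previous optimum
            alphas = alpha_prev * np.logspace(-2, 2, num=9)
        model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=cv)
        model_cv.fit(X_scaled.T, y_j)
        alpha_prev = model_cv.alpha_
        coefs.append(model_cv.coef_)
    return np.vstack(coefs)
```

Whether this pays off depends on how smoothly the optimal penalty varies across responses; if neighbouring rows have very different sparsity, the broad `np.logspace(-6, 6, num=17)` grid remains the safer default.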