diff --git a/docs/source/Lasso.py b/docs/source/Lasso.py new file mode 100644 index 00000000..7a02df29 --- /dev/null +++ b/docs/source/Lasso.py @@ -0,0 +1,407 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% +# ruff: noqa: E402 + +# %% [markdown] +# # Adaptive Localization +# +# In this example we run adaptive localization +# on a sparse Gauss-linear problem. +# +# - Each response is only affected by $3$ parameters. +# - This is represented by a tridiagonal matrix $A$ in the forward model $g(x) = Ax$. +# - The problem is Gauss-Linear, so in this case ESMDA will sample +# the true posterior when the number of ensemble members (realizations) is large. +# - The sparse correlation structure will lead to spurious correlations, in the sense +# that a parameter and response might appear correlated when in fact they are not. + +# %% [markdown] +# ## Import packages + +# %% +import numpy as np +import scipy as sp +from matplotlib import pyplot as plt + +from iterative_ensemble_smoother import ESMDA +from iterative_ensemble_smoother.esmda_inversion import empirical_cross_covariance +from iterative_ensemble_smoother.experimental import AdaptiveESMDA +from iterative_ensemble_smoother.utils import sample_mvnormal + +# %% [markdown] +# ## Create problem data +# +# Some settings worth experimenting with: +# +# - Decreasing `prior_std=1` will pull the posterior solution toward zero. +# - Increasing `num_ensemble` will increase the quality of the solution. +# - Increasing `num_observations / num_parameters` +# will increase the quality of the solution. + +# %% +rng = np.random.default_rng(42) + +# Dimensionality of the problem +num_parameters = 75 +num_observations = 50 +num_ensemble = 25 +prior_std = 1 + +# Number of iterations to use in ESMDA +alpha = 1 + +# Effect size (coefficients in sparse mapping A) +effect_size = 1 + +# %% [markdown] +# ## Create problem data - sparse tridiagonal matrix $A$ + +# %% +diagonal = effect_size * np.ones(max(num_parameters, num_observations)) + +# Create a tridiagonal matrix (easiest with scipy) +A = sp.sparse.diags( + [diagonal, diagonal, diagonal], + offsets=[-1, 0, 1], + shape=(num_observations, num_observations), # (num_observations, num_parameters), + dtype=float, +).toarray() + +A = np.tile(A, reps=(1, num_parameters // num_observations + 1)) +A = A[:num_observations, :num_parameters] + + +# Create a tridiagonal matrix (easiest with scipy) +A = sp.sparse.diags( + [diagonal, diagonal, diagonal], + offsets=[-1, 0, 1], + shape=(num_observations, num_parameters), + dtype=float, +).toarray() + +# We add some noise that is insignificant compared to the +# actual local structure in the forward model +A = A + rng.standard_normal(size=A.shape) * 0.001 + +plt.title("Linear mapping $A$ in forward model $g(x) = Ax$") +plt.imshow(A) +plt.xlabel("Parameters (inputs)") +plt.ylabel("Responses (outputs)") +plt.tight_layout() +plt.show() + + +# %% [markdown] +# - Below we draw prior realizations $X \sim N(0, \sigma)$. +# - The true parameter values used to generate observations are in the range $[-1, 1]$. +# - As the number of realizations (ensemble members) goes to infinity, +# the correlation between the prior and the true parameter values converges to zero. 
+# - The correlation is zero for a finite number of realizations too,
+#   but statistical noise might induce some spurious correlations between
+#   the prior and the true parameter values.
+#
+# **In summary, the baseline correlation is zero.**
+# Anything we can do to increase the correlation above zero beats the baseline,
+# which is using the prior (no update).
+#
+# The correlation coefficient does not take into account the uncertainty
+# represented in the posterior; only the posterior mean is compared with
+# the true parameter values.
+# To compare distributions we could use the
+# [Kullback–Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence),
+# but we do not pursue this here.
+
+# %%
+def get_prior(num_parameters, num_ensemble, prior_std):
+    """Sample prior from N(0, prior_std)."""
+    return rng.normal(size=(num_parameters, num_ensemble)) * prior_std
+
+
+def g(X):
+    """Apply the forward model."""
+    return A @ X
+
+
+# Create observations: obs = g(x) + N(0, 1)
+x_true = np.linspace(-1, 1, num=num_parameters)
+observation_noise = rng.standard_normal(size=num_observations)  # N(0, 1) noise
+observations = g(x_true) + observation_noise
+
+# Initial ensemble X ~ N(0, prior_std)
+X = get_prior(num_parameters, num_ensemble, prior_std)
+
+# Diagonal observation covariance, matching the noise added to the observations
+covariance = np.ones(num_observations)  # N(0, 1) covariance
+
+# %% [markdown]
+# ## Solve the maximum likelihood problem
+#
+# We can solve $Ax = b$, where $b$ are the observations,
+# for the maximum likelihood estimate.
+#
+# Notice that, unlike a Ridge model, solving $Ax = b$ directly
+# does not use any prior information.
+
+# %%
+x_ml, *_ = np.linalg.lstsq(A, observations, rcond=None)
+
+plt.figure(figsize=(7, 3))
+plt.scatter(np.arange(len(x_true)), x_true, label="True parameter values")
+plt.scatter(np.arange(len(x_true)), x_ml, label="ML estimate (no prior)")
+plt.xlabel("Parameter index")
+plt.ylabel("Parameter value")
+plt.grid(True, ls="--", zorder=0, alpha=0.33)
+plt.legend()
+plt.tight_layout()
+plt.show()
+
+# %%
+from iterative_ensemble_smoother.localized import linear_l1_regression
+
+# Estimate the linear map from parameters to responses with Lasso
+# and visualize its (sparse) structure
+Y = g(X)
+K = linear_l1_regression(X, Y)
+plt.imshow(K)
+plt.show()
+
+
+# %%
+def solve_esmda(X, Y, covariance, observations, seed=1):
+    """Solve using the exact ESMDA computation."""
+
+    # Create a smoother to get D
+    smoother = ESMDA(
+        covariance=covariance,
+        observations=observations,
+        alpha=1,
+        seed=seed,
+    )
+    N = X.shape[1]  # Number of ensemble members (realizations)
+    D = smoother.perturb_observations(ensemble_size=N, alpha=1.0)
+
+    # Cross covariance and covariance
+    cov_XY = empirical_cross_covariance(X, Y)
+    cov_YY = empirical_cross_covariance(Y, Y)
+
+    # Compute the update using the exact ESMDA approach
+    assert covariance.ndim == 1
+    K = cov_XY @ sp.linalg.inv(cov_YY + np.diag(covariance))
+    return X + K @ (D - Y)
+
+
+def solve_projected(X, Y, covariance, observations, seed=1):
+    """Instead of using the covariance matrix, sample S ~ N(0, Cov),
+    then use S @ S.T in place of it. This is a low rank approximation.
+
+    In itself, this method has little going for it. But it leads
+    to other approaches, since Y_noisy = Y + S can be defined.
+ """ + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Instead of using the covariance matrix, sample from it + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + + cov_XY = empirical_cross_covariance(X, Y) + cov_YY = empirical_cross_covariance(Y, Y) + + # covariance ~ S @ S.T / N + + # Compute the update. Divide by N since we know the mean is zero + K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / N) + return X + K @ (D - Y) + + +def solve_lstsq(X, Y, covariance, observations, seed=1): + """Since K =~ E(grad h^-1), we have K @ Y = X. + Here we simply solve Y.T @ K.T = X.T for X.""" + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Center X and Y + X_center = X - np.mean(X, axis=1, keepdims=True) + Y_center = Y - np.mean(Y, axis=1, keepdims=True) + + # Instead of using the covariance matrix, sample from it + # covariance / (N-1) ~ + + N = X.shape[1] + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + S = S * np.sqrt((N - 1) / N) # Divide by N since we know that the mean is zero + + Y_noisy = Y_center + S + + # Compute the update by solving K @ Y = X + K, *_ = sp.linalg.lstsq(Y_noisy.T, X_center.T) + K = K.T + + return X + K @ (D - Y) + + +def solve_lasso_direct(X, Y, covariance, observations, seed=1): + """Use lasso to solve for H, then use the update equation + K := cov_XX @ H.T @ inv(H @ cov_XX @ H.T + Cov) + """ + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Approximate forward model with H + H = linear_l1_regression(X, Y) + cov_XX = empirical_cross_covariance(X, X) + + # Compute the update + K = cov_XX @ H.T @ sp.linalg.inv(H @ cov_XX @ H.T + np.diag(covariance)) + return X + K @ (D - Y) + + +def solve_lasso(X, Y, covariance, observations, seed=1): + """Solve K @ Y = X using Lasso.""" + + from iterative_ensemble_smoother.localized import LassoES + + # Create a smoother to get D + smoother = LassoES( + covariance=covariance, + observations=observations, + seed=seed, + ) + + return smoother.assimilate(X, Y) + + +def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): + """Solving using adaptive ESMDA with correlation threshold.""" + + adaptive_smoother = AdaptiveESMDA( + covariance=covariance, + observations=observations, + seed=seed, + ) + + X_i = np.copy(X) + + for i, alpha_i in enumerate([1], 1): + # Perturb observations + D_i = adaptive_smoother.perturb_observations( + ensemble_size=X_i.shape[1], alpha=alpha_i + ) + + # Assimilate data + X_i = adaptive_smoother.assimilate( + X=X_i, Y=g(X_i), D=D_i, alpha=alpha_i, verbose=False + ) + + return X_i + + +# %% +plt.figure(figsize=(8, 4.5)) + +for function, label in zip( + [ + solve_esmda, + solve_projected, + solve_lstsq, + solve_lasso_direct, + solve_lasso, + solve_adaptive_ESMDA, + ], + [ + "ESMDA", + "projected", + "lstsq", + "Lasso (direct)", + "Lasso", + "AdaptiveESMDA", + ], +): + print(label) + # Loop over seeds and solve + corrs = [] + posterior_means = [] + for seed in range(10): + X_posterior = function(X, g(X), covariance, observations, seed=seed) + 
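+        # Summarize each run by its posterior mean and by the Pearson correlation
+        # between the posterior mean and the true parameter values
+        # (recall that the baseline correlation of the prior is zero).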
+        x_posterior_mean = X_posterior.mean(axis=1)
+        corr = sp.stats.pearsonr(x_true, x_posterior_mean).statistic
+        corrs.append(corr)
+        posterior_means.append(x_posterior_mean)
+
+    posterior_mean = np.vstack(posterior_means).mean(axis=0)
+    corr_mean, corr_std = np.mean(corrs), np.std(corrs)
+
+    plt.scatter(
+        np.arange(len(x_true)),
+        posterior_mean,
+        label=f"{label} (corr: {corr_mean:.2f} +- {corr_std:.2f})",
+        alpha=0.6,
+    )
+
+
+# Correlation between the ML estimate and the true parameter values
+corr = sp.stats.pearsonr(x_true, x_ml).statistic
+plt.scatter(
+    np.arange(len(x_true)),
+    x_ml,
+    label=f"ML solution (knowing h(x) = Ax) (corr: {corr:.2f})",
+)
+plt.scatter(
+    np.arange(len(x_true)),
+    x_true,
+    label="True parameter values",
+)
+
+plt.xlabel("Parameter index")
+plt.ylabel("Parameter value")
+plt.ylim([-3, 3])
+plt.grid(True, ls="--", zorder=0, alpha=0.33)
+plt.legend()
+plt.tight_layout()
+plt.show()
diff --git a/lasso.md b/lasso.md
new file mode 100644
index 00000000..0fb1e108
--- /dev/null
+++ b/lasso.md
@@ -0,0 +1,135 @@
+# Lasso
+
+Notes on the Lasso approach for regularizing the forward model.
+
+## Notation and preliminaries
+
+In this section we establish notation.
+
+- Let $X \in \mathbb{R}^{n \times N}$ be a parameter ensemble matrix.
+  There are $n$ parameters (rows) and $N$ realizations in the ensemble (columns).
+- Let $Y \in \mathbb{R}^{m \times N}$ be a response ensemble matrix.
+- The forward model is $h: \mathbb{R}^n \to \mathbb{R}^m$.
+- The observational noise has distribution $\mathcal{N}(0, \Sigma_\epsilon)$.
+
+To create perturbed observations, we sample $D \sim \mathcal{N}(d_\text{obs}, \Sigma_\epsilon)$, where $D \in \mathbb{R}^{m \times N}$.
+Each column in $D$ is an independent draw from this multivariate normal.
+We take the Cholesky factorization of the covariance and write $\Sigma_\epsilon = L_\epsilon L_\epsilon^T$.
+Note that each column in $D$ can be computed as $d_\text{obs} + L_\epsilon z$, where $z \sim \mathcal{N}(0, I)$.
+
+## Update equation
+
+Our starting point is the update equation from the ESMDA paper.
+
+The ESMDA update equation, with inflation factor $\alpha=1$, is
+
+$$X_\text{posterior} = X + \text{cov}(X, Y) (\text{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y),$$
+
+where the empirical cross covariance matrix is $\text{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$, and $c(X)$ centers each row in $X$ by subtracting its mean.
+
+The term $K := \text{cov}(X, Y) (\text{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain matrix.
+Notice that in this expression both the covariance matrix and the cross covariance matrix are estimated from only $N$ realizations, which is typically a small number.
+
+## The Kalman gain
+
+We will now transform the Kalman gain matrix into a linear regression.
+
+We switch notation and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume each row (variable) has been centered by subtracting its mean.
+
+Writing out the empirical covariances, the Kalman gain becomes
+$$K = \frac{X Y^T}{N - 1} \left(
+\frac{Y Y^T}{N - 1} + \Sigma_\epsilon
+\right)^{-1}.$$
+
+### Adding sampled noise
+
+Let $R \sim \mathcal{N}(0, \Sigma_\epsilon)$ be a matrix whose columns are independent draws from the noise distribution.
+If we assume the mean is known to be zero, Bessel's correction is not needed and we can write $R R^T / N \approx \Sigma_\epsilon$.
+In other words, we sample from $\mathcal{N}(0, \Sigma_\epsilon)$ and then estimate $\Sigma_\epsilon$ using the empirical covariance.
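+
+As a quick numerical sanity check of this approximation (a minimal, self-contained sketch; the diagonal covariance, its values, and the sizes are arbitrary choices for illustration):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+m, N = 50, 100_000  # number of observations, number of samples
+
+# An arbitrary diagonal observation covariance
+Sigma_eps = np.diag(np.linspace(0.5, 2.0, num=m))
+L_eps = np.linalg.cholesky(Sigma_eps)
+
+# Each column of R is one draw from N(0, Sigma_eps)
+R = L_eps @ rng.standard_normal(size=(m, N))
+
+# With the mean known to be zero, R @ R.T / N approximates Sigma_eps
+assert np.allclose(R @ R.T / N, Sigma_eps, atol=0.1)
+```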
+
+Define $S = R \sqrt{(N-1) / N}$, then
+
+$$\Sigma_\epsilon \approx
+\frac{1}{N} R R^T
+=
+\frac{1}{N-1} \frac{N-1}{N} R R^T
+= \frac{S S^T}{N-1}.
+$$
+
+Notice that this is a low rank approximation of the observation covariance matrix:
+$\Sigma_\epsilon$ is typically diagonal with shape $m \times m$, whereas $S S^T$ has rank at most $N \ll m$.
+
+Replacing $\Sigma_\epsilon$ with $S S^T / (N - 1)$,
+we can approximate $K$ as
+$$K \approx X Y^T \left(
+Y Y^T + S S^T
+\right)^{-1}.$$
+
+If we assume that $Y S^T \approx S Y^T \approx 0$, then the middle terms in
+$(Y + S)(Y + S)^T = Y Y^T + Y S^T + S Y^T + S S^T$ vanish.
+This is reasonable in the limit of large $N$, since the rows in $S$ are uncorrelated with the rows in $Y$.
+
+Define $Y_\text{noisy} := Y + S$, then
+
+$$K \approx X Y^T \left(
+Y_\text{noisy} Y_\text{noisy}^T
+\right)^{+}.$$
+
+Finally, assume that $X S^T \approx 0$, so that $X Y^T \approx X Y_\text{noisy}^T$. Then
+
+$$K \approx X Y_\text{noisy}^T \left( Y_\text{noisy} Y_\text{noisy}^T \right)^{+} = X Y_\text{noisy}^{+},$$
+
+and we solve a least squares problem $Y_\text{noisy}^T K^T = X^T$ for $K$.
+If we use Ridge or Lasso, the problem decomposes and we can consider each row of $X$ independently.
+This is because in the equation $Y_\text{noisy}^T K^T = X^T$, each column of $X^T$ (a row of $X$, i.e., one parameter) depends only on the corresponding column of $K^T$ (a row of $K$), while the design matrix $Y_\text{noisy}^T$ is shared.
+We can therefore find $K$ by solving one Ridge or Lasso regression per parameter.
+
+### Comment: Kalman gain and the forward model
+
+Regularizing $K$ using e.g. Ridge or Lasso makes sense if we believe that the forward model is localized, in the sense that each output is determined primarily by a few inputs (sparsity).
+
+A rough sketch of why $K \sim \mathbb{E} \left[ \nabla h \right]^{-1}$:
+since $K \approx X Y^{+}$ holds for centered matrices, changing notation back to uncentered matrices gives
+$$K \approx (X - \mathbb{E} X)(Y - \mathbb{E} Y)^{+}.$$
+Since $Y - \mathbb{E} Y \approx \nabla h \, (X - \mathbb{E} X)$ to first order, this expression behaves like a finite-difference approximation of $(\nabla h)^{-1}$ around the ensemble mean.
+
+Therefore it makes sense to regularize $K$ if we believe in a sparse forward model.
+
+### Comment: Ridge and the Kalman gain
+
+Solving $Y^T K^T = X^T$ with Ridge produces the normal equations that define $K$.
+To be concrete, minimizing $\lVert Y^T K^T - X^T \rVert_2^2 + \alpha \lVert K L_\epsilon \rVert_2^2$ with respect to $K$ yields the equation
+$$K \left( Y Y^T + \alpha L_\epsilon L_\epsilon^T \right) = X Y^T.$$
+With $\alpha = N - 1$ this is exactly the equation for the Kalman gain $K$ that we started with from the ESMDA paper.
+See equations (119) and (132) in The Matrix Cookbook for the matrix calculus needed to differentiate the loss function.
+
+### Comment: Another Lasso approach
+
+The Kalman gain can also be written as
+
+$$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1},$$
+
+which suggests another possible way to compute $K$:
+
+1. Estimate $\hat{H}$ using Lasso or similar.
+2. Estimate $\Sigma_{x}$ as $\text{cov}(X, X)$.
+
+This is likely infeasible when the number of parameters $n$ is large, since $\text{cov}(X, X)$ is an $n \times n$ matrix.
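+
+A minimal sketch of this direct approach (mirroring `solve_lasso_direct` in
+`docs/source/Lasso.py`; `linear_l1_regression` and `empirical_cross_covariance`
+are the helpers from this repository, and the problem sizes and the random
+linear forward model are arbitrary choices for illustration):
+
+```python
+import numpy as np
+import scipy as sp
+
+from iterative_ensemble_smoother.esmda_inversion import empirical_cross_covariance
+from iterative_ensemble_smoother.localized import linear_l1_regression
+
+rng = np.random.default_rng(0)
+num_parameters, num_observations, num_ensemble = 20, 10, 50
+
+A = rng.standard_normal(size=(num_observations, num_parameters))
+X = rng.standard_normal(size=(num_parameters, num_ensemble))  # prior ensemble
+Y = A @ X  # responses
+covariance = np.ones(num_observations)  # diagonal of Sigma_epsilon
+
+H = linear_l1_regression(X, Y)  # step 1: estimate H with Lasso
+cov_XX = empirical_cross_covariance(X, X)  # step 2: estimate Sigma_x
+K = cov_XX @ H.T @ sp.linalg.inv(H @ cov_XX @ H.T + np.diag(covariance))
+```
+
+Even in this small example `cov_XX` is a dense matrix with one row and one column per parameter, which is what makes the approach impractical for large parameter spaces.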
+
+### Comment: Covariances with linear forward model
+
+Here we show two facts that can be proven using the definition of covariance, see e.g. sections 6.2.2 and 8.2.1 in [The Matrix Cookbook](https://www.math.uwaterloo.ca/~hwolkowi/matrixcookbook.pdf).
+
+Assume that $y = h(x) = H x + c$, then
+
+$$\text{cov}(x, y) = \text{cov}(x, x) H^T.$$
+
+Similarly, under the same assumption,
+
+$$\text{cov}(y, y) = H \text{cov}(x, x) H^T.$$
+
+## References
+
+- [Issue 6599 on the Lasso in ERT](https://github.com/equinor/ert/issues/6599)
+- [Presentation: Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf)
+- [Paper: Ensemble Smoother with Multiple Data Assimilation](http://dx.doi.org/10.1016/j.cageo.2012.03.011)
+- [Paper: Ensemble transport smoothing. Part I: Unified framework](https://arxiv.org/pdf/2210.17000.pdf)
\ No newline at end of file
diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py
new file mode 100644
index 00000000..6c09d669
--- /dev/null
+++ b/src/iterative_ensemble_smoother/localized.py
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+
+from typing import Union
+
+import numpy as np
+import numpy.typing as npt
+from sklearn.linear_model import LassoCV
+
+from iterative_ensemble_smoother.esmda import BaseESMDA
+from iterative_ensemble_smoother.utils import sample_mvnormal
+
+
+def linear_l1_regression(X, Y):
+    """
+    Perform LASSO regression for each response in Y against the predictors
+    in X, constructing a sparse matrix of regression coefficients.
+
+    The function scales the rows of X and Y to unit standard deviation before
+    applying LASSO, then re-scales the coefficients back to the original
+    scale. This extracts the effect of each variable in X on each response
+    in Y, ignoring intercepts and constant terms.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        2D array of predictors with shape (num_parameters, ensemble_size).
+    Y : np.ndarray
+        2D array of responses with shape (num_observations, ensemble_size).
+
+    Returns
+    -------
+    K : np.ndarray
+        2D array of coefficients with shape (num_observations, num_parameters),
+        such that K @ X approximates Y.
+
+    Examples
+    --------
+    >>> K = np.array([[-0.2, -0.9],
+    ...               [-1.5, 0.2],
+    ...               [-0.2, 0.1],
+    ...               [ 0.8, -0.4]])
+    >>> X = np.array([[ 0.8, 2. , -0.8],
+    ...               [ 0.4, 1.7, -0.3]])
+    >>> Y = K @ X
+    >>> K_est = linear_l1_regression(X, Y)
+    >>> np.allclose(K, K_est, atol=0.005)
+    True
+
+    """
+    # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py
+    # An option is to use MultiTaskLasso too, but it does not optimize exactly
+    # the same objective:
+    # https://scikit-learn.org/stable/modules/linear_model.html#multi-task-lasso
+
+    # The goal is to solve K @ X = Y for K, or equivalently X.T @ K.T = Y.T
+    num_parameters, ensemble_size1 = X.shape
+    num_observations, ensemble_size2 = Y.shape
+
+    assert ensemble_size1 == ensemble_size2, "Number of cols in X and Y must be equal"
+
+    # Scale each row to unit standard deviation for the benefit of the Lasso numerics
+    stds_X = np.std(X, axis=1)
+    stds_Y = np.std(Y, axis=1)
+    X = X / stds_X[:, np.newaxis]
+    Y = Y / stds_Y[:, np.newaxis]
+
+    # Settings for the cross-validated Lasso fits
+    coefs = []
+    cv = min(5, ensemble_size1)
+    alphas = np.logspace(-6, 6, num=17)
+
+    # The matrix K in K @ X = Y is built up row by row
+    for j, y_j in enumerate(Y):  # Loop over rows in Y
+
+        # Learn an individual regularization strength and fit
+        model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=cv)
+        model_cv.fit(X.T, y_j)  # The chosen penalty is available as model_cv.alpha_
+        # TODO: Consider searching smarter for alphas, e.g. momentum or
+        # searching within 1-2 orders of magnitude of the previous alpha?
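+
+        # Note: model_cv.coef_ relates the standardized rows of X to the
+        # standardized y_j. Multiplying by stds_Y[j] / stds_X below undoes the
+        # scaling, so the appended row maps the original X to the original y_j.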
+        # Scale back the coefficients
+        coef_scale = stds_Y[j] / stds_X
+        coefs.append(model_cv.coef_ * coef_scale)
+
+    K = np.vstack(coefs)
+    assert K.shape == (num_observations, num_parameters)
+    return K
+
+
+class LassoES(BaseESMDA):
+    """
+    Ensemble smoother with a Lasso-estimated Kalman gain.
+
+    The update equation follows the ensemble smoother formulation of
+    :cite:t:`EMERICK2013`, but the Kalman gain is estimated by solving an
+    L1-regularized (Lasso) linear regression problem instead of using
+    empirical covariance matrices directly. See the accompanying notes in
+    `lasso.md` for a derivation.
+
+    Parameters
+    ----------
+    covariance : np.ndarray
+        Either a 1D array of diagonal covariances, or a 2D covariance matrix.
+        The shape is either (num_observations,) or (num_observations, num_observations).
+        This is C_D in Emerick (2013), and represents observation or measurement
+        errors. We observe d from the real world, y from the model g(x), and
+        assume that d = y + e, where the error e is multivariate normal with
+        covariance given by `covariance`.
+    observations : np.ndarray
+        1D array of shape (num_observations,) representing real-world observations.
+        This is d_obs in Emerick (2013).
+    seed : integer or numpy.random._generator.Generator, optional
+        A seed or numpy.random._generator.Generator used for random number
+        generation. The argument is passed to numpy.random.default_rng().
+        The default is None.
+
+    """
+
+    def __init__(
+        self,
+        covariance: npt.NDArray[np.double],
+        observations: npt.NDArray[np.double],
+        seed: Union[np.random._generator.Generator, int, None] = None,
+    ) -> None:
+        """Initialize the instance."""
+
+        super().__init__(covariance=covariance, observations=observations, seed=seed)
+
+    def assimilate(
+        self,
+        X: npt.NDArray[np.double],
+        Y: npt.NDArray[np.double],
+        *,
+        alpha: float = 1.0,
+    ) -> npt.NDArray[np.double]:
+        """Assimilate the observations and return the posterior ensemble.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            2D array of parameters with shape (num_parameters, ensemble_size).
+        Y : np.ndarray
+            2D array of responses with shape (num_observations, ensemble_size).
+        alpha : float, optional
+            Inflation factor for the observation covariance. The default is 1.0.
+
+        Returns
+        -------
+        np.ndarray
+            Posterior ensemble with shape (num_parameters, ensemble_size).
+        """
+
+        # Verify shapes
+        _, num_ensemble = X.shape
+        num_outputs, num_ensemble2 = Y.shape
+        assert (
+            num_ensemble == num_ensemble2
+        ), "Number of ensemble members in X and Y must match"
+        if not np.issubdtype(X.dtype, np.floating):
+            raise TypeError("Argument `X` must contain floats")
+        if not np.issubdtype(Y.dtype, np.floating):
+            raise TypeError("Argument `Y` must contain floats")
+
+        # Center X and Y
+        X_center = X - np.mean(X, axis=1, keepdims=True)
+        Y_center = Y - np.mean(Y, axis=1, keepdims=True)
+
+        # Sample noise R so that R @ R.T / N is approximately C_D
+        R = np.sqrt(alpha) * sample_mvnormal(
+            C_dd_cholesky=self.C_D_L, rng=self.rng, size=num_ensemble
+        )
+
+        # S = R * sqrt((N - 1) / N), so that S @ S.T / (N - 1) approximates C_D
+        S = np.sqrt((num_ensemble - 1) / num_ensemble) * R
+
+        # Compute the regularized Kalman gain matrix K by solving
+        #     K @ (Y + S) = X.
+        # The definition of K is
+        #     K @ (cov(Y, Y) + covariance) = cov(X, Y),
+        # which, for centered X and Y, is approximately
+        #     K @ (Y @ Y.T + S @ S.T) = X @ Y.T,
+        # which (neglecting the cross terms Y @ S.T and S @ Y.T) is approximately
+        #     K @ (Y + S) @ (Y + S).T = X @ Y.T,
+        # which (neglecting X @ S.T) is approximately
+        #     K @ (Y + S) = X.
+        # See lasso.md for a more careful derivation.
+        K = linear_l1_regression(X=(Y_center + S), Y=X_center)
+
+        D = self.perturb_observations(ensemble_size=num_ensemble, alpha=alpha)
+        return X + K @ (D - Y)
+
+
+if __name__ == "__main__":
+    import pytest
+
+    pytest.main(
+        args=[
+            __file__,
+            "--doctest-modules",
+            "-v",
+        ]
+    )
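+
+# Example (sketch, not executed): how LassoES above might be used on a small
+# Gauss-linear problem. The sizes and values below are illustrative only.
+#
+#     rng = np.random.default_rng(42)
+#     A = rng.standard_normal(size=(10, 5))    # forward model g(x) = A @ x
+#     X = rng.standard_normal(size=(5, 100))   # prior ensemble: 5 parameters, 100 realizations
+#     Y = A @ X                                # responses
+#     observations = A @ np.ones(5) + rng.standard_normal(10)
+#     covariance = np.ones(10)                 # diagonal observation covariance
+#
+#     smoother = LassoES(covariance=covariance, observations=observations, seed=42)
+#     X_posterior = smoother.assimilate(X, Y)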