diff --git a/docs/source/Lasso.py b/docs/source/Lasso.py new file mode 100644 index 00000000..7a02df29 --- /dev/null +++ b/docs/source/Lasso.py @@ -0,0 +1,407 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% +# ruff: noqa: E402 + +# %% [markdown] +# # Adaptive Localization +# +# In this example we run adaptive localization +# on a sparse Gauss-linear problem. +# +# - Each response is only affected by $3$ parameters. +# - This is represented by a tridiagonal matrix $A$ in the forward model $g(x) = Ax$. +# - The problem is Gauss-Linear, so in this case ESMDA will sample +# the true posterior when the number of ensemble members (realizations) is large. +# - The sparse correlation structure will lead to spurious correlations, in the sense +# that a parameter and response might appear correlated when in fact they are not. + +# %% [markdown] +# ## Import packages + +# %% +import numpy as np +import scipy as sp +from matplotlib import pyplot as plt + +from iterative_ensemble_smoother import ESMDA +from iterative_ensemble_smoother.esmda_inversion import empirical_cross_covariance +from iterative_ensemble_smoother.experimental import AdaptiveESMDA +from iterative_ensemble_smoother.utils import sample_mvnormal + +# %% [markdown] +# ## Create problem data +# +# Some settings worth experimenting with: +# +# - Decreasing `prior_std=1` will pull the posterior solution toward zero. +# - Increasing `num_ensemble` will increase the quality of the solution. +# - Increasing `num_observations / num_parameters` +# will increase the quality of the solution. + +# %% +rng = np.random.default_rng(42) + +# Dimensionality of the problem +num_parameters = 75 +num_observations = 50 +num_ensemble = 25 +prior_std = 1 + +# Number of iterations to use in ESMDA +alpha = 1 + +# Effect size (coefficients in sparse mapping A) +effect_size = 1 + +# %% [markdown] +# ## Create problem data - sparse tridiagonal matrix $A$ + +# %% +diagonal = effect_size * np.ones(max(num_parameters, num_observations)) + +# Create a tridiagonal matrix (easiest with scipy) +A = sp.sparse.diags( + [diagonal, diagonal, diagonal], + offsets=[-1, 0, 1], + shape=(num_observations, num_observations), # (num_observations, num_parameters), + dtype=float, +).toarray() + +A = np.tile(A, reps=(1, num_parameters // num_observations + 1)) +A = A[:num_observations, :num_parameters] + + +# Create a tridiagonal matrix (easiest with scipy) +A = sp.sparse.diags( + [diagonal, diagonal, diagonal], + offsets=[-1, 0, 1], + shape=(num_observations, num_parameters), + dtype=float, +).toarray() + +# We add some noise that is insignificant compared to the +# actual local structure in the forward model +A = A + rng.standard_normal(size=A.shape) * 0.001 + +plt.title("Linear mapping $A$ in forward model $g(x) = Ax$") +plt.imshow(A) +plt.xlabel("Parameters (inputs)") +plt.ylabel("Responses (outputs)") +plt.tight_layout() +plt.show() + + +# %% [markdown] +# - Below we draw prior realizations $X \sim N(0, \sigma)$. +# - The true parameter values used to generate observations are in the range $[-1, 1]$. +# - As the number of realizations (ensemble members) goes to infinity, +# the correlation between the prior and the true parameter values converges to zero. 
+# - The correlation is zero for a finite number of realizations too,
+#   but statistical noise might induce some spurious correlations between
+#   the prior and the true parameter values.
+#
+# **In summary, the baseline correlation is zero.**
+# Anything we can do to increase the correlation above zero beats the baseline,
+# which is using the prior (no update).
+#
+# The correlation coefficient does not take into account the uncertainty
+# represented in the posterior; only the posterior mean is compared with
+# the true parameter values.
+# To compare distributions we could use the
+# [Kullback–Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence),
+# but we do not pursue this here.
+
+# %%
+def get_prior(num_parameters, num_ensemble, prior_std):
+    """Sample prior from N(0, prior_std)."""
+    return rng.normal(size=(num_parameters, num_ensemble)) * prior_std
+
+
+def g(X):
+    """Apply the forward model."""
+    return A @ X
+
+
+# Create observations: obs = g(x) + N(0, 1)
+x_true = np.linspace(-1, 1, num=num_parameters)
+observation_noise = rng.standard_normal(size=num_observations)  # N(0, 1) noise
+observations = g(x_true) + observation_noise
+
+# Initial ensemble X ~ N(0, prior_std)
+X = get_prior(num_parameters, num_ensemble, prior_std)
+
+# Diagonal observation covariance, matching the noise added to the observations
+covariance = np.ones(num_observations)  # N(0, 1) covariance
+
+# %% [markdown]
+# ## Solve the maximum likelihood problem
+#
+# We can solve $Ax = b$, where $b$ are the observations,
+# for the maximum likelihood estimate.
+#
+# Notice that, unlike a Ridge model, solving $Ax = b$ directly
+# does not use any prior information.
+
+# %%
+x_ml, *_ = np.linalg.lstsq(A, observations, rcond=None)
+
+plt.figure(figsize=(7, 3))
+plt.scatter(np.arange(len(x_true)), x_true, label="True parameter values")
+plt.scatter(np.arange(len(x_true)), x_ml, label="ML estimate (no prior)")
+plt.xlabel("Parameter index")
+plt.ylabel("Parameter value")
+plt.grid(True, ls="--", zorder=0, alpha=0.33)
+plt.legend()
+plt.tight_layout()
+plt.show()
+
+# %%
+from iterative_ensemble_smoother.localized import linear_l1_regression
+
+# Estimate the linear map from parameters to responses with Lasso
+# and visualize its (sparse) structure
+Y = g(X)
+K = linear_l1_regression(X, Y)
+plt.imshow(K)
+plt.show()
+
+
+# %%
+def solve_esmda(X, Y, covariance, observations, seed=1):
+    """Solve using the exact ESMDA computation."""
+
+    # Create a smoother to get D
+    smoother = ESMDA(
+        covariance=covariance,
+        observations=observations,
+        alpha=1,
+        seed=seed,
+    )
+    N = X.shape[1]  # Number of ensemble members (realizations)
+    D = smoother.perturb_observations(ensemble_size=N, alpha=1.0)
+
+    # Cross covariance and covariance
+    cov_XY = empirical_cross_covariance(X, Y)
+    cov_YY = empirical_cross_covariance(Y, Y)
+
+    # Compute the update using the exact ESMDA approach
+    assert covariance.ndim == 1
+    K = cov_XY @ sp.linalg.inv(cov_YY + np.diag(covariance))
+    return X + K @ (D - Y)
+
+
+def solve_projected(X, Y, covariance, observations, seed=1):
+    """Instead of using the covariance matrix, sample S ~ N(0, Cov),
+    then use S @ S.T in place of it. This is a low rank approximation.
+
+    In itself, this method has little going for it. But it leads
+    to other approaches, since Y_noisy = Y + S can be defined.
+ """ + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Instead of using the covariance matrix, sample from it + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + + cov_XY = empirical_cross_covariance(X, Y) + cov_YY = empirical_cross_covariance(Y, Y) + + # covariance ~ S @ S.T / N + + # Compute the update. Divide by N since we know the mean is zero + K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / N) + return X + K @ (D - Y) + + +def solve_lstsq(X, Y, covariance, observations, seed=1): + """Since K =~ E(grad h^-1), we have K @ Y = X. + Here we simply solve Y.T @ K.T = X.T for X.""" + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Center X and Y + X_center = X - np.mean(X, axis=1, keepdims=True) + Y_center = Y - np.mean(Y, axis=1, keepdims=True) + + # Instead of using the covariance matrix, sample from it + # covariance / (N-1) ~ + + N = X.shape[1] + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + S = S * np.sqrt((N - 1) / N) # Divide by N since we know that the mean is zero + + Y_noisy = Y_center + S + + # Compute the update by solving K @ Y = X + K, *_ = sp.linalg.lstsq(Y_noisy.T, X_center.T) + K = K.T + + return X + K @ (D - Y) + + +def solve_lasso_direct(X, Y, covariance, observations, seed=1): + """Use lasso to solve for H, then use the update equation + K := cov_XX @ H.T @ inv(H @ cov_XX @ H.T + Cov) + """ + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Approximate forward model with H + H = linear_l1_regression(X, Y) + cov_XX = empirical_cross_covariance(X, X) + + # Compute the update + K = cov_XX @ H.T @ sp.linalg.inv(H @ cov_XX @ H.T + np.diag(covariance)) + return X + K @ (D - Y) + + +def solve_lasso(X, Y, covariance, observations, seed=1): + """Solve K @ Y = X using Lasso.""" + + from iterative_ensemble_smoother.localized import LassoES + + # Create a smoother to get D + smoother = LassoES( + covariance=covariance, + observations=observations, + seed=seed, + ) + + return smoother.assimilate(X, Y) + + +def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): + """Solving using adaptive ESMDA with correlation threshold.""" + + adaptive_smoother = AdaptiveESMDA( + covariance=covariance, + observations=observations, + seed=seed, + ) + + X_i = np.copy(X) + + for i, alpha_i in enumerate([1], 1): + # Perturb observations + D_i = adaptive_smoother.perturb_observations( + ensemble_size=X_i.shape[1], alpha=alpha_i + ) + + # Assimilate data + X_i = adaptive_smoother.assimilate( + X=X_i, Y=g(X_i), D=D_i, alpha=alpha_i, verbose=False + ) + + return X_i + + +# %% +plt.figure(figsize=(8, 4.5)) + +for function, label in zip( + [ + solve_esmda, + solve_projected, + solve_lstsq, + solve_lasso_direct, + solve_lasso, + solve_adaptive_ESMDA, + ], + [ + "ESMDA", + "projected", + "lstsq", + "Lasso (direct)", + "Lasso", + "AdaptiveESMDA", + ], +): + print(label) + # Loop over seeds and solve + corrs = [] + posterior_means = [] + for seed in range(10): + X_posterior = function(X, g(X), covariance, observations, seed=seed) + 
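+        # Summarize each run by its posterior mean and by the Pearson correlation
+        # between the posterior mean and the true parameter values
+        # (recall that the baseline correlation of the prior is zero).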
+        x_posterior_mean = X_posterior.mean(axis=1)
+        corr = sp.stats.pearsonr(x_true, x_posterior_mean).statistic
+        corrs.append(corr)
+        posterior_means.append(x_posterior_mean)
+
+    posterior_mean = np.vstack(posterior_means).mean(axis=0)
+    corr_mean, corr_std = np.mean(corrs), np.std(corrs)
+
+    plt.scatter(
+        np.arange(len(x_true)),
+        posterior_mean,
+        label=f"{label} (corr: {corr_mean:.2f} +- {corr_std:.2f})",
+        alpha=0.6,
+    )
+
+
+# Correlation between the ML estimate and the true parameter values
+corr = sp.stats.pearsonr(x_true, x_ml).statistic
+plt.scatter(
+    np.arange(len(x_true)),
+    x_ml,
+    label=f"ML solution (knowing h(x) = Ax) (corr: {corr:.2f})",
+)
+plt.scatter(
+    np.arange(len(x_true)),
+    x_true,
+    label="True parameter values",
+)
+
+plt.xlabel("Parameter index")
+plt.ylabel("Parameter value")
+plt.ylim([-3, 3])
+plt.grid(True, ls="--", zorder=0, alpha=0.33)
+plt.legend()
+plt.tight_layout()
+plt.show()
diff --git a/lasso.md b/lasso.md
new file mode 100644
index 00000000..0fb1e108
--- /dev/null
+++ b/lasso.md
@@ -0,0 +1,135 @@
+# Lasso
+
+Notes on the Lasso approach for regularizing the forward model.
+
+## Notation and preliminaries
+
+In this section we establish notation.
+
+- Let $X \in \mathbb{R}^{n \times N}$ be a parameter ensemble matrix.
+  There are $n$ parameters (rows) and $N$ realizations in the ensemble (columns).
+- Let $Y \in \mathbb{R}^{m \times N}$ be a response ensemble matrix.
+- The forward model is $h: \mathbb{R}^n \to \mathbb{R}^m$.
+- The observational noise has distribution $\mathcal{N}(0, \Sigma_\epsilon)$.
+
+To create perturbed observations, we sample $D \sim \mathcal{N}(d_\text{obs}, \Sigma_\epsilon)$, where $D \in \mathbb{R}^{m \times N}$.
+Each column in $D$ is an independent draw from this multivariate normal.
+We take the Cholesky factorization of the covariance and write $\Sigma_\epsilon = L_\epsilon L_\epsilon^T$.
+Note that each column in $D$ can be computed as $d_\text{obs} + L_\epsilon z$, where $z \sim \mathcal{N}(0, I)$.
+
+## Update equation
+
+Our starting point is the update equation from the ESMDA paper.
+
+The ESMDA update equation, with inflation factor $\alpha=1$, is
+
+$$X_\text{posterior} = X + \text{cov}(X, Y) (\text{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y),$$
+
+where the empirical cross covariance matrix is $\text{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$, and $c(X)$ centers each row in $X$ by subtracting its mean.
+
+The term $K := \text{cov}(X, Y) (\text{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain matrix.
+Notice that in this expression both the covariance matrix and the cross covariance matrix are estimated from only $N$ realizations, which is typically a small number.
+
+## The Kalman gain
+
+We will now transform the Kalman gain matrix into a linear regression.
+
+We switch notation and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume each row (variable) has been centered by subtracting its mean.
+
+Writing out the empirical covariances, the Kalman gain becomes
+$$K = \frac{X Y^T}{N - 1} \left(
+\frac{Y Y^T}{N - 1} + \Sigma_\epsilon
+\right)^{-1}.$$
+
+### Adding sampled noise
+
+Let $R \sim \mathcal{N}(0, \Sigma_\epsilon)$ be a matrix whose columns are independent draws from the noise distribution.
+If we assume the mean is known to be zero, Bessel's correction is not needed and we can write $R R^T / N \approx \Sigma_\epsilon$.
+In other words, we sample from $\mathcal{N}(0, \Sigma_\epsilon)$ and then estimate $\Sigma_\epsilon$ using the empirical covariance.
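+
+As a quick numerical sanity check of this approximation (a minimal, self-contained sketch; the diagonal covariance, its values, and the sizes are arbitrary choices for illustration):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+m, N = 50, 100_000  # number of observations, number of samples
+
+# An arbitrary diagonal observation covariance
+Sigma_eps = np.diag(np.linspace(0.5, 2.0, num=m))
+L_eps = np.linalg.cholesky(Sigma_eps)
+
+# Each column of R is one draw from N(0, Sigma_eps)
+R = L_eps @ rng.standard_normal(size=(m, N))
+
+# With the mean known to be zero, R @ R.T / N approximates Sigma_eps
+assert np.allclose(R @ R.T / N, Sigma_eps, atol=0.1)
+```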
+
+Define $S = R \sqrt{(N-1) / N}$, then
+
+$$\Sigma_\epsilon \approx
+\frac{1}{N} R R^T
+=
+\frac{1}{N-1} \frac{N-1}{N} R R^T
+= \frac{S S^T}{N-1}.
+$$
+
+Notice that this is a low rank approximation of the observation covariance matrix:
+$\Sigma_\epsilon$ is typically diagonal with shape $m \times m$, whereas $S S^T$ has rank at most $N \ll m$.
+
+Replacing $\Sigma_\epsilon$ with $S S^T / (N - 1)$,
+we can approximate $K$ as
+$$K \approx X Y^T \left(
+Y Y^T + S S^T
+\right)^{-1}.$$
+
+If we assume that $Y S^T \approx S Y^T \approx 0$, then the middle terms in
+$(Y + S)(Y + S)^T = Y Y^T + Y S^T + S Y^T + S S^T$ vanish.
+This is reasonable in the limit of large $N$, since the rows in $S$ are uncorrelated with the rows in $Y$.
+
+Define $Y_\text{noisy} := Y + S$, then
+
+$$K \approx X Y^T \left(
+Y_\text{noisy} Y_\text{noisy}^T
+\right)^{+}.$$
+
+Finally, assume that $X S^T \approx 0$, so that $X Y^T \approx X Y_\text{noisy}^T$. Then
+
+$$K \approx X Y_\text{noisy}^T \left( Y_\text{noisy} Y_\text{noisy}^T \right)^{+} = X Y_\text{noisy}^{+},$$
+
+and we solve a least squares problem $Y_\text{noisy}^T K^T = X^T$ for $K$.
+If we use Ridge or Lasso, the problem decomposes and we can consider each row of $X$ independently.
+This is because in the equation $Y_\text{noisy}^T K^T = X^T$, each column of $X^T$ (a row of $X$, i.e., one parameter) depends only on the corresponding column of $K^T$ (a row of $K$), while the design matrix $Y_\text{noisy}^T$ is shared.
+We can therefore find $K$ by solving one Ridge or Lasso regression per parameter.
+
+### Comment: Kalman gain and the forward model
+
+Regularizing $K$ using e.g. Ridge or Lasso makes sense if we believe that the forward model is localized, in the sense that each output is determined primarily by a few inputs (sparsity).
+
+A rough sketch of why $K \sim \mathbb{E} \left[ \nabla h \right]^{-1}$:
+since $K \approx X Y^{+}$ holds for centered matrices, changing notation back to uncentered matrices gives
+$$K \approx (X - \mathbb{E} X)(Y - \mathbb{E} Y)^{+}.$$
+Since $Y - \mathbb{E} Y \approx \nabla h \, (X - \mathbb{E} X)$ to first order, this expression behaves like a finite-difference approximation of $(\nabla h)^{-1}$ around the ensemble mean.
+
+Therefore it makes sense to regularize $K$ if we believe in a sparse forward model.
+
+### Comment: Ridge and the Kalman gain
+
+Solving $Y^T K^T = X^T$ with Ridge produces the normal equations that define $K$.
+To be concrete, minimizing $\lVert Y^T K^T - X^T \rVert_2^2 + \alpha \lVert K L_\epsilon \rVert_2^2$ with respect to $K$ yields the equation
+$$K \left( Y Y^T + \alpha L_\epsilon L_\epsilon^T \right) = X Y^T.$$
+With $\alpha = N - 1$ this is exactly the equation for the Kalman gain $K$ that we started with from the ESMDA paper.
+See equations (119) and (132) in The Matrix Cookbook for the matrix calculus needed to differentiate the loss function.
+
+### Comment: Another Lasso approach
+
+The Kalman gain can also be written as
+
+$$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1},$$
+
+which suggests another possible way to compute $K$:
+
+1. Estimate $\hat{H}$ using Lasso or similar.
+2. Estimate $\Sigma_{x}$ as $\text{cov}(X, X)$.
+
+This is likely infeasible when the number of parameters $n$ is large, since $\text{cov}(X, X)$ is an $n \times n$ matrix.
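+
+A minimal sketch of this direct approach (mirroring `solve_lasso_direct` in
+`docs/source/Lasso.py`; `linear_l1_regression` and `empirical_cross_covariance`
+are the helpers from this repository, and the problem sizes and the random
+linear forward model are arbitrary choices for illustration):
+
+```python
+import numpy as np
+import scipy as sp
+
+from iterative_ensemble_smoother.esmda_inversion import empirical_cross_covariance
+from iterative_ensemble_smoother.localized import linear_l1_regression
+
+rng = np.random.default_rng(0)
+num_parameters, num_observations, num_ensemble = 20, 10, 50
+
+A = rng.standard_normal(size=(num_observations, num_parameters))
+X = rng.standard_normal(size=(num_parameters, num_ensemble))  # prior ensemble
+Y = A @ X  # responses
+covariance = np.ones(num_observations)  # diagonal of Sigma_epsilon
+
+H = linear_l1_regression(X, Y)  # step 1: estimate H with Lasso
+cov_XX = empirical_cross_covariance(X, X)  # step 2: estimate Sigma_x
+K = cov_XX @ H.T @ sp.linalg.inv(H @ cov_XX @ H.T + np.diag(covariance))
+```
+
+Even in this small example `cov_XX` is a dense matrix with one row and one column per parameter, which is what makes the approach impractical for large parameter spaces.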
+
+### Comment: Covariances with linear forward model
+
+Here we show two facts that can be proven using the definition of covariance, see e.g. sections 6.2.2 and 8.2.1 in [The Matrix Cookbook](https://www.math.uwaterloo.ca/~hwolkowi/matrixcookbook.pdf).
+
+Assume that $y = h(x) = H x + c$, then
+
+$$\text{cov}(x, y) = \text{cov}(x, x) H^T.$$
+
+Similarly, under the same assumption,
+
+$$\text{cov}(y, y) = H \text{cov}(x, x) H^T.$$
+
+## References
+
+- [Issue 6599 on the Lasso in ERT](https://github.com/equinor/ert/issues/6599)
+- [Presentation: Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf)
+- [Paper: Ensemble Smoother with Multiple Data Assimilation](http://dx.doi.org/10.1016/j.cageo.2012.03.011)
+- [Paper: Ensemble transport smoothing. Part I: Unified framework](https://arxiv.org/pdf/2210.17000.pdf)
\ No newline at end of file
diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py
new file mode 100644
index 00000000..6c09d669
--- /dev/null
+++ b/src/iterative_ensemble_smoother/localized.py
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+
+from typing import Union
+
+import numpy as np
+import numpy.typing as npt
+from sklearn.linear_model import LassoCV
+
+from iterative_ensemble_smoother.esmda import BaseESMDA
+from iterative_ensemble_smoother.utils import sample_mvnormal
+
+
+def linear_l1_regression(X, Y):
+    """
+    Perform LASSO regression for each response in Y against the predictors
+    in X, constructing a sparse matrix of regression coefficients.
+
+    The function scales the rows of X and Y to unit standard deviation before
+    applying LASSO, then re-scales the coefficients back to the original
+    scale. This extracts the effect of each variable in X on each response
+    in Y, ignoring intercepts and constant terms.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        2D array of predictors with shape (num_parameters, ensemble_size).
+    Y : np.ndarray
+        2D array of responses with shape (num_observations, ensemble_size).
+
+    Returns
+    -------
+    K : np.ndarray
+        2D array of coefficients with shape (num_observations, num_parameters),
+        such that K @ X approximates Y.
+
+    Examples
+    --------
+    >>> K = np.array([[-0.2, -0.9],
+    ...               [-1.5, 0.2],
+    ...               [-0.2, 0.1],
+    ...               [ 0.8, -0.4]])
+    >>> X = np.array([[ 0.8, 2. , -0.8],
+    ...               [ 0.4, 1.7, -0.3]])
+    >>> Y = K @ X
+    >>> K_est = linear_l1_regression(X, Y)
+    >>> np.allclose(K, K_est, atol=0.005)
+    True
+
+    """
+    # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py
+    # An option is to use MultiTaskLasso too, but it does not optimize exactly
+    # the same objective:
+    # https://scikit-learn.org/stable/modules/linear_model.html#multi-task-lasso
+
+    # The goal is to solve K @ X = Y for K, or equivalently X.T @ K.T = Y.T
+    num_parameters, ensemble_size1 = X.shape
+    num_observations, ensemble_size2 = Y.shape
+
+    assert ensemble_size1 == ensemble_size2, "Number of cols in X and Y must be equal"
+
+    # Scale each row to unit standard deviation for the benefit of the Lasso numerics
+    stds_X = np.std(X, axis=1)
+    stds_Y = np.std(Y, axis=1)
+    X = X / stds_X[:, np.newaxis]
+    Y = Y / stds_Y[:, np.newaxis]
+
+    # Settings for the cross-validated Lasso fits
+    coefs = []
+    cv = min(5, ensemble_size1)
+    alphas = np.logspace(-6, 6, num=17)
+
+    # The matrix K in K @ X = Y is built up row by row
+    for j, y_j in enumerate(Y):  # Loop over rows in Y
+
+        # Learn an individual regularization strength and fit
+        model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=cv)
+        model_cv.fit(X.T, y_j)  # The chosen penalty is available as model_cv.alpha_
+        # TODO: Consider searching smarter for alphas, e.g. momentum or
+        # searching within 1-2 orders of magnitude of the previous alpha?
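+
+        # Note: model_cv.coef_ relates the standardized rows of X to the
+        # standardized y_j. Multiplying by stds_Y[j] / stds_X below undoes the
+        # scaling, so the appended row maps the original X to the original y_j.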
+        # Scale back the coefficients
+        coef_scale = stds_Y[j] / stds_X
+        coefs.append(model_cv.coef_ * coef_scale)
+
+    K = np.vstack(coefs)
+    assert K.shape == (num_observations, num_parameters)
+    return K
+
+
+class LassoES(BaseESMDA):
+    """
+    Ensemble smoother with a Lasso-estimated Kalman gain.
+
+    The update equation follows the ensemble smoother formulation of
+    :cite:t:`EMERICK2013`, but the Kalman gain is estimated by solving an
+    L1-regularized (Lasso) linear regression problem instead of using
+    empirical covariance matrices directly. See the accompanying notes in
+    `lasso.md` for a derivation.
+
+    Parameters
+    ----------
+    covariance : np.ndarray
+        Either a 1D array of diagonal covariances, or a 2D covariance matrix.
+        The shape is either (num_observations,) or (num_observations, num_observations).
+        This is C_D in Emerick (2013), and represents observation or measurement
+        errors. We observe d from the real world, y from the model g(x), and
+        assume that d = y + e, where the error e is multivariate normal with
+        covariance given by `covariance`.
+    observations : np.ndarray
+        1D array of shape (num_observations,) representing real-world observations.
+        This is d_obs in Emerick (2013).
+    seed : integer or numpy.random._generator.Generator, optional
+        A seed or numpy.random._generator.Generator used for random number
+        generation. The argument is passed to numpy.random.default_rng().
+        The default is None.
+
+    """
+
+    def __init__(
+        self,
+        covariance: npt.NDArray[np.double],
+        observations: npt.NDArray[np.double],
+        seed: Union[np.random._generator.Generator, int, None] = None,
+    ) -> None:
+        """Initialize the instance."""
+
+        super().__init__(covariance=covariance, observations=observations, seed=seed)
+
+    def assimilate(
+        self,
+        X: npt.NDArray[np.double],
+        Y: npt.NDArray[np.double],
+        *,
+        alpha: float = 1.0,
+    ) -> npt.NDArray[np.double]:
+        """Assimilate the observations and return the posterior ensemble.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            2D array of parameters with shape (num_parameters, ensemble_size).
+        Y : np.ndarray
+            2D array of responses with shape (num_observations, ensemble_size).
+        alpha : float, optional
+            Inflation factor for the observation covariance. The default is 1.0.
+
+        Returns
+        -------
+        np.ndarray
+            Posterior ensemble with shape (num_parameters, ensemble_size).
+        """
+
+        # Verify shapes
+        _, num_ensemble = X.shape
+        num_outputs, num_ensemble2 = Y.shape
+        assert (
+            num_ensemble == num_ensemble2
+        ), "Number of ensemble members in X and Y must match"
+        if not np.issubdtype(X.dtype, np.floating):
+            raise TypeError("Argument `X` must contain floats")
+        if not np.issubdtype(Y.dtype, np.floating):
+            raise TypeError("Argument `Y` must contain floats")
+
+        # Center X and Y
+        X_center = X - np.mean(X, axis=1, keepdims=True)
+        Y_center = Y - np.mean(Y, axis=1, keepdims=True)
+
+        # Sample noise R so that R @ R.T / N is approximately C_D
+        R = np.sqrt(alpha) * sample_mvnormal(
+            C_dd_cholesky=self.C_D_L, rng=self.rng, size=num_ensemble
+        )
+
+        # S = R * sqrt((N - 1) / N), so that S @ S.T / (N - 1) approximates C_D
+        S = np.sqrt((num_ensemble - 1) / num_ensemble) * R
+
+        # Compute the regularized Kalman gain matrix K by solving
+        #     K @ (Y + S) = X.
+        # The definition of K is
+        #     K @ (cov(Y, Y) + covariance) = cov(X, Y),
+        # which, for centered X and Y, is approximately
+        #     K @ (Y @ Y.T + S @ S.T) = X @ Y.T,
+        # which (neglecting the cross terms Y @ S.T and S @ Y.T) is approximately
+        #     K @ (Y + S) @ (Y + S).T = X @ Y.T,
+        # which (neglecting X @ S.T) is approximately
+        #     K @ (Y + S) = X.
+        # See lasso.md for a more careful derivation.
+        K = linear_l1_regression(X=(Y_center + S), Y=X_center)
+
+        D = self.perturb_observations(ensemble_size=num_ensemble, alpha=alpha)
+        return X + K @ (D - Y)
+
+
+if __name__ == "__main__":
+    import pytest
+
+    pytest.main(
+        args=[
+            __file__,
+            "--doctest-modules",
+            "-v",
+        ]
+    )
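+
+# Example (sketch, not executed): how LassoES above might be used on a small
+# Gauss-linear problem. The sizes and values below are illustrative only.
+#
+#     rng = np.random.default_rng(42)
+#     A = rng.standard_normal(size=(10, 5))    # forward model g(x) = A @ x
+#     X = rng.standard_normal(size=(5, 100))   # prior ensemble: 5 parameters, 100 realizations
+#     Y = A @ X                                # responses
+#     observations = A @ np.ones(5) + rng.standard_normal(10)
+#     covariance = np.ones(10)                 # diagonal observation covariance
+#
+#     smoother = LassoES(covariance=covariance, observations=observations, seed=42)
+#     X_posterior = smoother.assimilate(X, Y)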