From 1390f05f2facf27dc27f1cccb0cab5611c06e18c Mon Sep 17 00:00:00 2001
From: Tommy Odland
Date: Thu, 28 Dec 2023 13:24:21 +0100
Subject: [PATCH 1/9] added lasso.md

---
 lasso.md | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 lasso.md

diff --git a/lasso.md b/lasso.md
new file mode 100644
index 00000000..69d9ad52
--- /dev/null
+++ b/lasso.md
@@ -0,0 +1,126 @@
+# Lasso
+
+## Notation and preliminaries
+
+Let $X \in \mathbb{R}^{n \times N}$ be a parameter ensemble matrix.
+There are $n$ parameters (rows) and $N$ realizations in the ensemble (columns).
+
+Let $Y \in \mathbb{R}^{m \times N}$ be a response ensemble matrix.
+The forward model is $h: \mathbb{R}^n \to \mathbb{R}^m$.
+
+Assume that observational noise is generated as $D \sim \mathcal{N}(d_\text{obs}, \Sigma_\epsilon)$, where $D \in \mathbb{R}^{m \times N}$.
+
+## Update equation
+
+We will start with the ESMDA update equation, and work our way to the linear regression update equation.
+
+The ESMDA update equation, with inflation factor $\alpha=1$, is
+
+$$X_\text{posterior} = X + \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y)$$
+where $\operatorname{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$ and $c(X)$ centers each row in X by subtracting the mean.
+
+The term $K := \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain.
+
+## The Kalman gain
+
+We will now transform the Kalman gain matrix into a linear regression.
+We switch notations and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume that each row (variable) has been centered.
+
+Then we have
+$$
+K = \frac{X Y^T}{N - 1} \left(
+\frac{Y Y^T}{N - 1} + \Sigma_\epsilon
+\right)^{-1}.
+$$
+
+Since $D D^T / (N - 1) \approx \Sigma_\epsilon$, we can approximate $K$ as
+$$
+K = X Y^T \left(
+Y Y^T + D D^T
+\right)^{-1}.
+$$
+
+If we assume that $Y D^T \approx D Y^T \approx 0$, then the middle terms in $(Y + D)(Y + D)^T = Y Y^T + Y D^T + D Y^T + D D^T$ vanish.
+Define $Y_\text{noisy} := Y + D$, then
+$$
+K = X Y^T \left(
+Y_\text{noisy} Y_\text{noisy}^T
+\right)^{+}.
+$$
+Finally, assume that $X D^T \approx 0$, then
+$$
+K = X Y_\text{noisy}^{+}.
+$$
+We can find $K$ by solving the optimization problem
+$$
+\lVert Y^T K^T - X^T \rVert^2
+$$
+using e.g. Ridge or Lasso.
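+
+As a rough illustration (a minimal sketch, not a prescription): the least squares problem
+$Y_\text{noisy}^T K^T = X^T$ decomposes over the rows of $X$, so each row of $K$ can be
+fit with an off-the-shelf penalized regression. The helper name `estimate_K` and the
+fixed penalty `reg` below are placeholders; cross-validated penalties are used in
+`docs/source/Lasso.py` later in this patch series.
+
+```python
+import numpy as np
+from sklearn.linear_model import Lasso, Ridge
+
+
+def estimate_K(Y_noisy, X, method="lasso", reg=0.1):
+    """Solve Y_noisy.T @ K.T = X.T for K, one row of X at a time."""
+    Model = {"lasso": Lasso, "ridge": Ridge}[method]
+    rows = []
+    for x_row in X:  # each parameter (row of X) is an independent regression
+        model = Model(alpha=reg, fit_intercept=False)
+        model.fit(Y_noisy.T, x_row)  # ensemble members act as samples
+        rows.append(model.coef_)
+    return np.vstack(rows)  # shape (num_parameters, num_observations)
+```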
+ + +## Solution approaches + +### SVD + +Since $K (D - Y)$ + + + + + + + + + + + + + + +The ES update equation is + +$$X_\text{posterior} = X + K (D - Y)$$ + +where + +$$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1}$$ +$$K = \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_{\epsilon})^{-1}$$ + +## Covariances with linear forward model + +Assume that $y = h(x) = H x + c$, then + +$$\operatorname{cov}(x, y ) = \operatorname{cov}(x, x) H^T.$$ + +Similarity, if $y = h(x) = H x + c$, then + +$$\operatorname{cov}(y, y ) = H \operatorname{cov}(x, x) H^T.$$ + +## Variations on the update equation + + +## References + +- [Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf) + + + + + +------------------ + + + + +- test +- test + +$`a=b`$ + +`$$a=b$$` + +$$`a=b`$$ + +```math +\left( \sum_{k=1}^n a_k b_k \right)^2 \leq \left( \sum_{k=1}^n a_k^2 \right) \left( \sum_{k=1}^n b_k^2 \right) +``` From af1afe5a44ff03653cff4931585537eb472f292e Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 29 Dec 2023 09:51:05 +0100 Subject: [PATCH 2/9] notebook --- docs/source/Lasso.py | 460 +++++++++++++++++++++++++++++++++++++++++++ lasso.md | 131 ++++++------ 2 files changed, 517 insertions(+), 74 deletions(-) create mode 100644 docs/source/Lasso.py diff --git a/docs/source/Lasso.py b/docs/source/Lasso.py new file mode 100644 index 00000000..75a911c3 --- /dev/null +++ b/docs/source/Lasso.py @@ -0,0 +1,460 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% +# ruff: noqa: E402 + +# %% [markdown] +# # Adaptive Localization +# +# In this example we run adaptive localization +# on a sparse Gauss-linear problem. +# +# - Each response is only affected by $3$ parameters. +# - This is represented by a tridiagonal matrix $A$ in the forward model $g(x) = Ax$. +# - The problem is Gauss-Linear, so in this case ESMDA will sample +# the true posterior when the number of ensemble members (realizations) is large. +# - The sparse correlation structure will lead to spurious correlations, in the sense +# that a parameter and response might appear correlated when in fact they are not. + +# %% [markdown] +# ## Import packages + +# %% +import numpy as np +import scipy as sp +from matplotlib import pyplot as plt + +from iterative_ensemble_smoother import ESMDA +from iterative_ensemble_smoother.esmda_inversion import empirical_cross_covariance +from iterative_ensemble_smoother.experimental import AdaptiveESMDA +from iterative_ensemble_smoother.utils import sample_mvnormal + +# %% [markdown] +# ## Create problem data +# +# Some settings worth experimenting with: +# +# - Decreasing `prior_std=1` will pull the posterior solution toward zero. +# - Increasing `num_ensemble` will increase the quality of the solution. +# - Increasing `num_observations / num_parameters` +# will increase the quality of the solution. 
+ +# %% +rng = np.random.default_rng(42) + +# Dimensionality of the problem +num_parameters = 100 +num_observations = 50 +num_ensemble = 25 +prior_std = 1 + +# Number of iterations to use in ESMDA +alpha = 1 + +# Effect size (coefficients in sparse mapping A) +effect_size = 5 + +# %% [markdown] +# ## Create problem data - sparse tridiagonal matrix $A$ + +# %% +diagonal = effect_size * np.ones(min(num_parameters, num_observations)) + +# Create a tridiagonal matrix (easiest with scipy) +A = sp.sparse.diags( + [diagonal, diagonal, diagonal], + offsets=[-1, 0, 1], + shape=(num_observations, num_observations), # (num_observations, num_parameters), + dtype=float, +).toarray() + +A = np.tile(A, reps=(1, num_parameters // num_observations + 1)) +A = A[:num_observations, :num_parameters] + + +# Create a tridiagonal matrix (easiest with scipy) +A = sp.sparse.diags( + [diagonal, diagonal, diagonal], + offsets=[-1, 0, 1], + shape=(num_observations, num_parameters), + dtype=float, +).toarray() + +# We add some noise that is insignificant compared to the +# actual local structure in the forward model +A = A + rng.standard_normal(size=A.shape) * 0.001 + +plt.title("Linear mapping $A$ in forward model $g(x) = Ax$") +plt.imshow(A) +plt.xlabel("Parameters (inputs)") +plt.ylabel("Responses (outputs)") +plt.tight_layout() +plt.show() + + +# %% [markdown] +# - Below we draw prior realizations $X \sim N(0, \sigma)$. +# - The true parameter values used to generate observations are in the range $[-1, 1]$. +# - As the number of realizations (ensemble members) goes to infinity, +# the correlation between the prior and the true parameter values converges to zero. +# - The correlation is zero for a finite number of realizations too, +# but statistical noise might induce some spurious correlations between +# the prior and the true parameter values. +# +# **In summary the baseline correlation is zero.** +# Anything we can do to increase the correlation above zero beats the baseline, +# which is using the prior (no update). +# +# The correlation coefficient does not take into account the uncertainty +# represeted in the posterior, only the mean posterior value is compared with +# the true parameter values. +# To compare distributions we could use the +# [Kullback–Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence), +# but we do not pursue this here. + +# %% +def get_prior(num_parameters, num_ensemble, prior_std): + """Sample prior from N(0, prior_std).""" + return rng.normal(size=(num_parameters, num_ensemble)) * prior_std + + +def g(X): + """Apply the forward model.""" + return A @ X + + +# Create observations: obs = g(x) + N(0, 1) +x_true = np.linspace(-1, 1, num=num_parameters) +observation_noise = rng.standard_normal(size=num_observations) # N(0, 1) noise +observations = g(x_true) + observation_noise + +# Initial ensemble X ~ N(0, prior_std) and diagonal covariance with ones +X = get_prior(num_parameters, num_ensemble, prior_std) + +# Covariance matches the noise added to observations above +covariance = np.ones(num_observations) # N(0, 1) covariance + +# %% [markdown] +# ## Solve the maximum likelihood problem +# +# We can solve $Ax = b$, where $b$ are the observations, +# for the maximum likelihood estimate. +# +# Notice that unlike using a Ridge model, +# solving $Ax = b$ directly does not use any prior information. 
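+
+# %% [markdown]
+# As an aside, here is a hedged sketch (not used elsewhere in this notebook) of the
+# alternative mentioned above: with the Gaussian prior x ~ N(0, prior_std^2 I) and
+# unit observation noise, the MAP estimate is a ridge solution with penalty
+# 1 / prior_std^2.
+
+# %%
+# MAP / ridge estimate that does use the prior (illustrative sketch)
+x_map = np.linalg.solve(
+    A.T @ A + (1 / prior_std**2) * np.eye(num_parameters),
+    A.T @ observations,
+)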
+ +# %% +x_ml, *_ = np.linalg.lstsq(A, observations, rcond=None) + +plt.figure(figsize=(7, 3)) +plt.scatter(np.arange(len(x_true)), x_true, label="True parameter values") +plt.scatter(np.arange(len(x_true)), x_ml, label="ML estimate (no prior)") +plt.xlabel("Parameter index") +plt.ylabel("Parameter value") +plt.grid(True, ls="--", zorder=0, alpha=0.33) +plt.legend() +plt.tight_layout() +plt.show() + +# %% +import numpy as np +from sklearn.linear_model import LassoCV +from sklearn.preprocessing import StandardScaler + + +def linear_l1_regression(U, Y): + """ + Performs LASSO regression for each response in Y against predictors in U, + constructing a sparse matrix of regression coefficients. + + The function scales features in U using standard scaling before applying + LASSO, then re-scales the coefficients to the original scale of U. This + extracts the effect of each feature in U on each response in Y, ignoring + intercepts and constant terms. + + Parameters + ---------- + U : np.ndarray + 2D array of predictors with shape (n, p). + Y : np.ndarray + 2D array of responses with shape (n, m). + + Returns + ------- + H_sparse : scipy.sparse.csc_matrix + Sparse matrix (m, p) with re-scaled LASSO regression coefficients for + each response in Y. + + Raises + ------ + AssertionError + If the number of samples in U and Y do not match, or if the shape of + H_sparse is not (m, p). + """ + # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py + + n, p = U.shape # p: number of features + n_y, m = Y.shape # m: number of y responses + + # Assert that the first dimension of U and Y are the same + assert n == n_y, "Number of samples in U and Y must be the same" + + scaler_u = StandardScaler() + U_scaled = scaler_u.fit_transform(U) + + scaler_y = StandardScaler() + Y_scaled = scaler_y.fit_transform(Y) + + # Loop over features + coefs = [] + for j in range(m): + y_j = Y_scaled[:, j] + + # Learn individual regularization and fit + alphas = np.logspace(-8, 8, num=32) + model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=5) + model_cv.fit(U_scaled, y_j) + + coef_scale = scaler_y.scale_[j] / scaler_u.scale_ + coefs.append(model_cv.coef_ * coef_scale) + + K = np.vstack(coefs) + assert K.shape == (m, p) + + return K + + +Y = g(X) +K = linear_l1_regression(X.T, Y.T) +plt.imshow(K) +plt.show() + + +# %% +def solve_esmda(X, Y, covariance, observations, seed=1): + """Solving using exact ESMDA computation.""" + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] # Number of ensemble members (realizations) + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Cross covariance and covariance + cov_XY = empirical_cross_covariance(X, Y) + cov_YY = empirical_cross_covariance(Y, Y) + + # Compute the update using the exact ESMDA approach + K = cov_XY @ sp.linalg.inv(cov_YY + np.diag(covariance)) + return X + K @ (D - Y) + + +def solve_projected(X, Y, covariance, observations, seed=1): + """Instead of using the covariance matrix, sample S ~ N(0, Cov), + then use S @ S.T in place of it. This is a low rank approximation. + + In itsself, this method has little going for it. But it leads + to other approaches since Y_noisy = Y + S can be defined. 
+ """ + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Instead of using the covariance matrix, sample from it + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + + cov_XY = empirical_cross_covariance(X, Y) + cov_YY = empirical_cross_covariance(Y, Y) + + # Compute the update + K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / (N - 1)) + return X + K @ (D - Y) + + +def solve_lstsq(X, Y, covariance, observations, seed=1): + """Since K =~ E(grad h^-1), we have K @ Y = X. + Here we simply solve Y.T @ K.T = X.T for X.""" + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Instead of using the covariance matrix, sample from it + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + + Y_noisy = Y + S + + # Compute the update by solving K @ Y = X + K, *_ = sp.linalg.lstsq(Y_noisy.T, X.T) + K = K.T + + return X + K @ (D - Y) + + +def solve_lasso_direct(X, Y, covariance, observations, seed=1): + """Use lasso to solve for H, then use the update equation + K := cov_XX @ H.T @ inv(H @ cov_XX @ H.T + Cov) + """ + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Approximate forward model with H + H = linear_l1_regression(X.T, Y.T) + cov_XX = empirical_cross_covariance(X, X) + + # Compute the update + K = cov_XX @ H.T @ sp.linalg.inv(H @ cov_XX @ H.T + np.diag(covariance)) + return X + K @ (D - Y) + + +def solve_lasso(X, Y, covariance, observations, seed=1): + """Solve K @ Y = X using Lasso.""" + + # Create a smoother to get D + smoother = ESMDA( + covariance=covariance, + observations=observations, + alpha=1, + seed=seed, + ) + N = X.shape[1] + D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + + # Instead of using the covariance matrix, sample from it + S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + + Y_noisy = Y + S # Seems to work better with Y instead of Y_noisy + + # Compute the update + K = linear_l1_regression(Y_noisy.T, X.T) + + return X + K @ (D - Y) + + +def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): + """Solving using adaptive ESMDA with correlation threshold.""" + + adaptive_smoother = AdaptiveESMDA( + covariance=covariance, + observations=observations, + seed=seed, + ) + + X_i = np.copy(X) + + for i, alpha_i in enumerate([1], 1): + # Perturb observations + D_i = adaptive_smoother.perturb_observations( + ensemble_size=X_i.shape[1], alpha=alpha_i + ) + + # Assimilate data + X_i = adaptive_smoother.assimilate( + X=X_i, Y=g(X_i), D=D_i, alpha=alpha_i, verbose=False + ) + + return X_i + + +# %% +plt.figure(figsize=(7, 3)) + +for function, label in zip( + [ + solve_esmda, + solve_projected, + solve_lstsq, + solve_lasso_direct, + solve_lasso, + solve_adaptive_ESMDA, + ], + [ + "ESMDA", + "lstsq", + "Lasso (direct)", + "Lasso", + "AdaptiveESMDA", + ], +): + print(label) + # Loop over seeds and solve + corrs = [] + posterior_means = [] + for seed in range(10): + X_posterior = function(X, g(X), covariance, observations, seed=seed) + x_posterior_mean = X_posterior.mean(axis=1) + corr = 
sp.stats.pearsonr(x_true, x_posterior_mean).statistic + corrs.append(corr) + posterior_means.append(x_posterior_mean) + + posterior_mean = np.vstack(posterior_means).mean(axis=0) + corr_mean, corr_std = np.mean(corrs), np.std(corrs) + + plt.scatter( + np.arange(len(x_true)), + posterior_mean, + label=f"{label} (corr: {corr_mean:.2f} +- {corr_std:.2f})", + alpha=0.6, + ) + + +corr = sp.stats.pearsonr(x_true, x_posterior_mean).statistic +plt.scatter( + np.arange(len(x_true)), + x_ml, + label=f"ML solution (knowing h(x) = Ax) (corr: {corr:.2f})", +) +plt.scatter( + np.arange(len(x_true)), + x_true, + label="True parameter values", +) + +# plt.scatter(np.arange(len(x_true)), x_proj, label="Projected") +plt.xlabel("Parameter index") +plt.ylabel("Parameter value") +plt.ylim([-3, 3]) +plt.grid(True, ls="--", zorder=0, alpha=0.33) +plt.legend() +plt.tight_layout() +plt.show() + +# %% diff --git a/lasso.md b/lasso.md index 69d9ad52..ae6526b9 100644 --- a/lasso.md +++ b/lasso.md @@ -1,92 +1,97 @@ # Lasso +Notes on the Lasso approach for + ## Notation and preliminaries -Let $X \in \mathbb{R}^{n \times N}$ be a parameter ensemble matrix. -There are $n$ parameters (rows) and $N$ realizations in the ensemble (columns). +In this section we establish notation. -Let $Y \in \mathbb{R}^{m \times N}$ be a response ensemble matrix. -The forward model is $h: \mathbb{R}^n \to \mathbb{R}^n$. +- Let $X \in \mathbb{R}^{n \times N}$ be a parameter ensemble matrix. +There are $n$ parameters (rows) and $N$ realizations in the ensemble (columns). +- Let $Y \in \mathbb{R}^{m \times N}$ be a response ensemble matrix. +- The forward model is $h: \mathbb{R}^n \to \mathbb{R}^n$. +- The observational noise has distribution $\mathcal{N}(0, \Sigma_\epsilon)$. -Assume that observational noise is generated as $D \sim \mathcal{N}(d_\text{obs}, \Sigma_\epsilon)$, where $D \in \mathbb{R}^{m \times N}$. +To add observational noise to the observations, we sample $D \sim \mathcal{N}(d_\text{obs}, \Sigma_\epsilon)$, where $D \in \mathbb{R}^{m \times N}$. +Each column in D is drawn from the multivariate normal. +We take the Cholesky factorization of the covariance and write $\Sigma_\epsilon = L_\epsilon L_\epsilon^T$. +Note that each column in $D$ can be computed as $d_\text{obs} + L_\epsilon z$, where $z \sim \mathcal{N}(0, 1)$. ## Update equation -We will start with the ESMDA update equation, and work our way to the linear regression update equation. +Our starting point for the update equation will be from the ESMDA paper. The ESMDA update equation, with inflation factor $\alpha=1$, is -$$X_\text{posterior} = X + \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y)$$ -where $\operatorname{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$ and $c(X)$ centers each row in X by subtracting the mean. +$$X_\text{posterior} = X + \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y),$$ -The term $K := \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain. +where the empirical cross covariance matrix $\operatorname{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$, and $c(X)$ centers each row in X by subtracting the mean. + +The term $K := \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain matrix. +Notice that in this expression we estimate both the covariance matrix and the cross covariance matrix using few realizations $N$. ## The Kalman gain -We will now transform the kalman gain matrix into a linear regression. 
-We switch notations and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume that each row (variable) has been centered. +We will now transform the Kalman gain matrix into a linear regression. + +We switch notations and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume that each row (variable) has been centered by subtracting the mean. -Then we have +Writing out the empirical covariances, the Kalman gain becomes $$ K = \frac{X Y^T}{N - 1} \left( \frac{Y Y^T}{N - 1} + \Sigma_\epsilon \right)^{-1}. $$ -Since $D D^T / (N - 1) \approx \Sigma_\epsilon$, we can approximate $K$ as +### Adding sampled noise + +Let $S \sim \mathcal{N}(0, \Sigma_\epsilon)$. +We can write $S S^T / (N - 1) \approx \Sigma_\epsilon$, which is a low rank approximation to the observation covariance matrix. +Notice that the observation covariance matrix is typically diagonal, has shape $m \times m$, whereas $S S^T$ has rank $N \ll m$. + +Replacing $\Sigma_\epsilon$ with $S S^T / (N - 1)$, +we can approximate $K$ as $$ -K = X Y^T \left( -Y Y^T + D D^T +K \approx X Y^T \left( +Y Y^T + S S^T \right)^{-1}. $$ -If we assume that $YD^T \approx DY^T \approx 0$, then the middle terms in $(Y +D)( Y + D)^T = Y Y^T + Y D^T + DY^T + DD^T$ vanish. -Define $Y_\text{noisy} := Y + D$, then +If we assume that $YS^T \approx SY^T \approx 0$, then the middle terms in $(Y +S)( Y + S)^T = Y Y^T + Y S^T + SY^T + SS^T$ vanish. +This is reasonable in the limit of large $N$ since the rows in $S$ are uncorrelated with the rows in $Y$. + +Define $Y_\text{noisy} := Y + S$, then $$ -K = X Y^T \left( +K \approx X Y^T \left( Y_\text{noisy} Y_\text{noisy}^T \right)^{+}. $$ -Finally, assume that $X D^T \approx 0$, then -$$ -K = X Y_\text{noisy}^{+}. +Finally, assume that $X S^T \approx 0$ so that $Y \approx Y_\text{noisy}$, then $$ -We can find $K$ by solving the optimization problem +K = X Y_\text{noisy}^{+} $$ -\lVert Y^T K^T - X^T \rVert^2 -$$ -using e.g. Ridge og Lasso. - - -## Solution approaches - -### SVD - -Since $K (D - Y)$ - - - - - - - +and we solve a least squares problem $Y_\text{noisy}^T K^T = X^T$ for $K$. +If we use Ridge or Lasso, the solution decomposes since in general $A[x_1 | x_2] = [Ax_1 | Ax_2] = [y_1 | y_2]$ and we can consider each row of $X$ independently. +We can find $K$ by solving a Ridge or Lasso problem. +### Comment: Ridge and the Kalman gain +Solving $Y^T K^T = X^T$ with Ridge produces the Normal equations that define $K$. +## Comment: Another Lasso approach +The Kalman gain can also be written as +$$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1}$$ +which suggests another possible way to compute $K$: +1. Estimate $\hat{H}$ using Lasso or similar. +2. Estimate $\Sigma_{x}$ as $\operatorname{cov}(X, X)$. +This is likely unfeasible since $\operatorname{cov}(X, X)$ becomes huge. -The ES update equation is - -$$X_\text{posterior} = X + K (D - Y)$$ - -where - -$$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1}$$ -$$K = \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_{\epsilon})^{-1}$$ +## Comment: Covariances with linear forward model -## Covariances with linear forward model +Here we show two facts that can be proven using the definition of covariance, see e.g. sections 6.2.2 and 8.2.1 in [The Matrix Cookbook](https://www.math.uwaterloo.ca/~hwolkowi/matrixcookbook.pdf). 
Assume that $y = h(x) = H x + c$, then @@ -96,31 +101,9 @@ Similarity, if $y = h(x) = H x + c$, then $$\operatorname{cov}(y, y ) = H \operatorname{cov}(x, x) H^T.$$ -## Variations on the update equation - - ## References -- [Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf) - - - - - ------------------- - - - - -- test -- test - -$`a=b`$ - -`$$a=b$$` - -$$`a=b`$$ - -```math -\left( \sum_{k=1}^n a_k b_k \right)^2 \leq \left( \sum_{k=1}^n a_k^2 \right) \left( \sum_{k=1}^n b_k^2 \right) -``` +- [Issue 6599 on the Lasso in ERT](https://github.com/equinor/ert/issues/6599) +- [Presentation: Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf) +- [Paper: Ensemble Smoother with Multiple Data Assimilation](http://dx.doi.org/10.1016/j.cageo.2012.03.011) +- [Paper: Ensemble transport smoothing. Part I: Unified framework](https://arxiv.org/pdf/2210.17000.pdf) From fb175edff44ac9f53b98a843d08c55c26da6bcce Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 29 Dec 2023 09:52:41 +0100 Subject: [PATCH 3/9] operatorname to text --- lasso.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lasso.md b/lasso.md index ae6526b9..8541271a 100644 --- a/lasso.md +++ b/lasso.md @@ -23,11 +23,11 @@ Our starting point for the update equation will be from the ESMDA paper. The ESMDA update equation, with inflation factor $\alpha=1$, is -$$X_\text{posterior} = X + \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y),$$ +$$X_\text{posterior} = X + \text{cov}(X, Y) (\text{cov}(Y, Y) + \Sigma_\epsilon)^{-1} (D - Y),$$ -where the empirical cross covariance matrix $\operatorname{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$, and $c(X)$ centers each row in X by subtracting the mean. +where the empirical cross covariance matrix $\text{cov}(X, Y) = c(X) c(Y)^T / (N - 1)$, and $c(X)$ centers each row in X by subtracting the mean. -The term $K := \operatorname{cov}(X, Y) (\operatorname{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain matrix. +The term $K := \text{cov}(X, Y) (\text{cov}(Y, Y) + \Sigma_\epsilon)^{-1}$ is the Kalman gain matrix. Notice that in this expression we estimate both the covariance matrix and the cross covariance matrix using few realizations $N$. ## The Kalman gain @@ -85,9 +85,9 @@ $$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1} which suggests another possible way to compute $K$: 1. Estimate $\hat{H}$ using Lasso or similar. -2. Estimate $\Sigma_{x}$ as $\operatorname{cov}(X, X)$. +2. Estimate $\Sigma_{x}$ as $\text{cov}(X, X)$. -This is likely unfeasible since $\operatorname{cov}(X, X)$ becomes huge. +This is likely unfeasible since $\text{cov}(X, X)$ becomes huge. 
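+
+For a small problem, a minimal sketch of this direct route looks as follows
+(mirroring `solve_lasso_direct` in `docs/source/Lasso.py`; it assumes the
+variables-by-ensemble signature of the later `linear_l1_regression`, and that
+`covariance` holds the diagonal of $\Sigma_\epsilon$ as in that notebook):
+
+```python
+import numpy as np
+import scipy as sp
+from iterative_ensemble_smoother.esmda_inversion import empirical_cross_covariance
+
+H = linear_l1_regression(X, Y)  # Lasso estimate of the linearized forward model
+cov_XX = empirical_cross_covariance(X, X)  # shape (n, n), huge for large n
+K = cov_XX @ H.T @ sp.linalg.inv(H @ cov_XX @ H.T + np.diag(covariance))
+```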
## Comment: Covariances with linear forward model @@ -95,11 +95,11 @@ Here we show two facts that can be proven using the definition of covariance, se Assume that $y = h(x) = H x + c$, then -$$\operatorname{cov}(x, y ) = \operatorname{cov}(x, x) H^T.$$ +$$\text{cov}(x, y ) = \text{cov}(x, x) H^T.$$ Similarity, if $y = h(x) = H x + c$, then -$$\operatorname{cov}(y, y ) = H \operatorname{cov}(x, x) H^T.$$ +$$\text{cov}(y, y ) = H \text{cov}(x, x) H^T.$$ ## References From 51d9659afe1292080ecb6fec7309298fb9c83d89 Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 29 Dec 2023 09:53:51 +0100 Subject: [PATCH 4/9] spacing eqns --- lasso.md | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/lasso.md b/lasso.md index 8541271a..3407963f 100644 --- a/lasso.md +++ b/lasso.md @@ -37,11 +37,9 @@ We will now transform the Kalman gain matrix into a linear regression. We switch notations and set $X := c(X)$ and $Y := c(Y)$, meaning that we assume that each row (variable) has been centered by subtracting the mean. Writing out the empirical covariances, the Kalman gain becomes -$$ -K = \frac{X Y^T}{N - 1} \left( +$$K = \frac{X Y^T}{N - 1} \left( \frac{Y Y^T}{N - 1} + \Sigma_\epsilon -\right)^{-1}. -$$ +\right)^{-1}.$$ ### Adding sampled noise @@ -51,25 +49,23 @@ Notice that the observation covariance matrix is typically diagonal, has shape $ Replacing $\Sigma_\epsilon$ with $S S^T / (N - 1)$, we can approximate $K$ as -$$ -K \approx X Y^T \left( +$$K \approx X Y^T \left( Y Y^T + S S^T -\right)^{-1}. -$$ +\right)^{-1}.$$ If we assume that $YS^T \approx SY^T \approx 0$, then the middle terms in $(Y +S)( Y + S)^T = Y Y^T + Y S^T + SY^T + SS^T$ vanish. This is reasonable in the limit of large $N$ since the rows in $S$ are uncorrelated with the rows in $Y$. Define $Y_\text{noisy} := Y + S$, then -$$ -K \approx X Y^T \left( + +$$K \approx X Y^T \left( Y_\text{noisy} Y_\text{noisy}^T -\right)^{+}. -$$ +\right)^{+}.$$ + Finally, assume that $X S^T \approx 0$ so that $Y \approx Y_\text{noisy}$, then -$$ -K = X Y_\text{noisy}^{+} -$$ + +$$K = X Y_\text{noisy}^{+}$$ + and we solve a least squares problem $Y_\text{noisy}^T K^T = X^T$ for $K$. If we use Ridge or Lasso, the solution decomposes since in general $A[x_1 | x_2] = [Ax_1 | Ax_2] = [y_1 | y_2]$ and we can consider each row of $X$ independently. We can find $K$ by solving a Ridge or Lasso problem. @@ -81,7 +77,9 @@ Solving $Y^T K^T = X^T$ with Ridge produces the Normal equations that define $K$ ## Comment: Another Lasso approach The Kalman gain can also be written as + $$K = \Sigma_{x} \hat{H}^T (\hat{H}\Sigma_{x}\hat{H}^T + \Sigma_{\epsilon})^{-1}$$ + which suggests another possible way to compute $K$: 1. Estimate $\hat{H}$ using Lasso or similar. 
From c839e161786f918aa4589fde794555e11e140e2b Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 29 Dec 2023 14:19:36 +0100 Subject: [PATCH 5/9] more on lasso --- docs/source/Lasso.py | 32 +++++++++++++++++++++++++++++--- lasso.md | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/docs/source/Lasso.py b/docs/source/Lasso.py index 75a911c3..b09173c4 100644 --- a/docs/source/Lasso.py +++ b/docs/source/Lasso.py @@ -65,7 +65,7 @@ alpha = 1 # Effect size (coefficients in sparse mapping A) -effect_size = 5 +effect_size = 1000 # %% [markdown] # ## Create problem data - sparse tridiagonal matrix $A$ @@ -291,7 +291,8 @@ def solve_projected(X, Y, covariance, observations, seed=1): cov_YY = empirical_cross_covariance(Y, Y) # Compute the update - K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / (N - 1)) + # TODO: N-1 vs N here. + K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / (N-1)) return X + K @ (D - Y) @@ -395,7 +396,7 @@ def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): # %% -plt.figure(figsize=(7, 3)) +plt.figure(figsize=(8, 4.5)) for function, label in zip( [ @@ -408,6 +409,7 @@ def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): ], [ "ESMDA", + "projected", "lstsq", "Lasso (direct)", "Lasso", @@ -458,3 +460,27 @@ def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): plt.show() # %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% + +# %% diff --git a/lasso.md b/lasso.md index 3407963f..82f416f2 100644 --- a/lasso.md +++ b/lasso.md @@ -1,6 +1,6 @@ # Lasso -Notes on the Lasso approach for +Notes on the Lasso approach for regularizing the forward model. ## Notation and preliminaries @@ -43,9 +43,20 @@ $$K = \frac{X Y^T}{N - 1} \left( ### Adding sampled noise -Let $S \sim \mathcal{N}(0, \Sigma_\epsilon)$. -We can write $S S^T / (N - 1) \approx \Sigma_\epsilon$, which is a low rank approximation to the observation covariance matrix. -Notice that the observation covariance matrix is typically diagonal, has shape $m \times m$, whereas $S S^T$ has rank $N \ll m$. +Let $R \sim \mathcal{N}(0, \Sigma_\epsilon)$. +If we assume the mean is known, Bessel's correction is not needed and we can write $R R^T / N \approx \Sigma_\epsilon$. +We sample from $\mathcal{N}(0, \Sigma_\epsilon)$ then estimate the covariance $\Sigma_\epsilon$ using the empirical covariance. + +Define $S = R \sqrt{(N-1) / N}$, then + +$$\Sigma_\epsilon \approx +\frac{1}{N} R R^T += +\frac{1}{N-1} \frac{N-1}{N} R R^T += \frac{S S^T}{N-1} +$$ +Notice that this is a low rank approximation to the observation covariance matrix. +The observation covariance matrix $\Sigma_\epsilon$ is typically diagonal, has shape $m \times m$, whereas $S S^T$ has rank $N \ll m$. Replacing $\Sigma_\epsilon$ with $S S^T / (N - 1)$, we can approximate $K$ as @@ -67,14 +78,31 @@ Finally, assume that $X S^T \approx 0$ so that $Y \approx Y_\text{noisy}$, then $$K = X Y_\text{noisy}^{+}$$ and we solve a least squares problem $Y_\text{noisy}^T K^T = X^T$ for $K$. -If we use Ridge or Lasso, the solution decomposes since in general $A[x_1 | x_2] = [Ax_1 | Ax_2] = [y_1 | y_2]$ and we can consider each row of $X$ independently. +If we use Ridge or Lasso, the solution decomposes and we can consider each row of $X$ independently. +This is because in the equation $Y_\text{noisy}^T K^T = X^T$ the matrix $K^T$ acts on each row vector in $Y_\text{noisy}^T$ independently to produce each row vector in $X^T$. 
We can find $K$ by solving a Ridge or Lasso problem. +### Comment: Kalman gain and the forward model + +Regularizing $K$ using e.g. Ridge or Lasso makes sense if we believe that forward model is localized, in the sense that each output is determined primarily by a few inputs (sparsity). + +A rough sketch of why $K \sim \mathbb{E} \nabla h^{-1}$ + +Since $K \approx X Y^{+}$ holds for centered matrices, if we change notation to uncentered matrices we have +$$K \approx \frac{X - \mathbb{E} X}{Y - \mathbb{E} Y}$$ +rearranging we see that this is a second order Taylor expansion. + +Therefore it makes sense to regularize $K$ if we believe in a sparse forward model. + ### Comment: Ridge and the Kalman gain Solving $Y^T K^T = X^T$ with Ridge produces the Normal equations that define $K$. +To be concrete, minimizing $\lVert Y^T K^T - X^T \rVert_2^2 + \alpha \lVert D K \rVert_2^2$ with respect to $K$ yields the equation +$$\left( Y Y^T + \alpha D D^T\right) K = X Y^T.$$ +This is, up to scaling, exactly the equation for the kalman gain $K$ that we started with from the ESMDA paper. +See equations (119) and (132) in the matrix cookbook for the matrix calculus needed to differentiate the loss function. -## Comment: Another Lasso approach +### Comment: Another Lasso approach The Kalman gain can also be written as @@ -87,7 +115,7 @@ which suggests another possible way to compute $K$: This is likely unfeasible since $\text{cov}(X, X)$ becomes huge. -## Comment: Covariances with linear forward model +### Comment: Covariances with linear forward model Here we show two facts that can be proven using the definition of covariance, see e.g. sections 6.2.2 and 8.2.1 in [The Matrix Cookbook](https://www.math.uwaterloo.ca/~hwolkowi/matrixcookbook.pdf). From c442d9336b7ae2fa3c9acd159d596218045b99d4 Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 5 Jan 2024 09:29:44 +0100 Subject: [PATCH 6/9] added localized.py --- src/iterative_ensemble_smoother/localized.py | 268 +++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 src/iterative_ensemble_smoother/localized.py diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py new file mode 100644 index 00000000..8eb8fb95 --- /dev/null +++ b/src/iterative_ensemble_smoother/localized.py @@ -0,0 +1,268 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jan 5 09:27:34 2024 + +@author: TODL +""" + +from iterative_ensemble_smoother.esmda import BaseESMDA +import numbers +from abc import ABC +from typing import Union + +import numpy as np +import numpy.typing as npt +import scipy as sp # type: ignore + +from iterative_ensemble_smoother.esmda_inversion import ( + inversion_exact_cholesky, + inversion_subspace, + normalize_alpha, +) +from iterative_ensemble_smoother.utils import sample_mvnormal + + +class LassoESMDA(BaseESMDA): + """ + Implement an Ensemble Smoother with Multiple Data Assimilation (ES-MDA). + + The implementation follows :cite:t:`EMERICK2013`. + + Parameters + ---------- + covariance : np.ndarray + Either a 1D array of diagonal covariances, or a 2D covariance matrix. + The shape is either (num_observations,) or (num_observations, num_observations). + This is C_D in Emerick (2013), and represents observation or measurement + errors. We observe d from the real world, y from the model g(x), and + assume that d = y + e, where the error e is multivariate normal with + covariance given by `covariance`. 
+ observations : np.ndarray + 1D array of shape (num_observations,) representing real-world observations. + This is d_obs in Emerick (2013). + alpha : int or 1D np.ndarray, optional + Multiplicative factor for the covariance. + If an integer `alpha` is given, an array with length `alpha` and + elements `alpha` is constructed. If an 1D array is given, it is + normalized so sum_i 1/alpha_i = 1 and used. The default is 5, which + corresponds to np.array([5, 5, 5, 5, 5]). + seed : integer or numpy.random._generator.Generator, optional + A seed or numpy.random._generator.Generator used for random number + generation. The argument is passed to numpy.random.default_rng(). + The default is None. + inversion : str, optional + Which inversion method to use. The default is "exact". + See the dictionary ESMDA._inversion_methods for more information. + + Examples + -------- + >>> covariance = np.diag([1, 1, 1]) + >>> observations = np.array([1, 2, 3]) + >>> esmda = ESMDA(covariance, observations) + + """ + + # Available inversion methods. The inversion methods all compute + # C_MD @ (C_DD + alpha * C_D)^(-1) @ (D - Y) + _inversion_methods = { + "exact": inversion_exact_cholesky, + "subspace": inversion_subspace, + } + + def __init__( + self, + covariance: npt.NDArray[np.double], + observations: npt.NDArray[np.double], + alpha: Union[int, npt.NDArray[np.double]] = 5, + seed: Union[np.random._generator.Generator, int, None] = None, + inversion: str = "exact", + ) -> None: + """Initialize the instance.""" + + super().__init__(covariance=covariance, observations=observations, seed=seed) + + if not ( + (isinstance(alpha, np.ndarray) and alpha.ndim == 1) + or isinstance(alpha, numbers.Integral) + ): + raise TypeError("Argument `alpha` must be an integer or a 1D NumPy array.") + + if not isinstance(inversion, str): + raise TypeError( + "Argument `inversion` must be a string in " + f"{tuple(self._inversion_methods.keys())}, but got {inversion}" + ) + if inversion not in self._inversion_methods.keys(): + raise ValueError( + "Argument `inversion` must be a string in " + f"{tuple(self._inversion_methods.keys())}, but got {inversion}" + ) + + # Store data + self.inversion = inversion + + # Alpha can either be an integer (num iterations) or a list of weights + if isinstance(alpha, np.ndarray) and alpha.ndim == 1: + self.alpha = normalize_alpha(alpha) + elif isinstance(alpha, numbers.Integral): + self.alpha = normalize_alpha(np.ones(alpha)) + assert np.allclose(self.alpha, normalize_alpha(self.alpha)) + else: + raise TypeError("Alpha must be integer or 1D array.") + + def num_assimilations(self) -> int: + return len(self.alpha) + + def assimilate( + self, + X: npt.NDArray[np.double], + Y: npt.NDArray[np.double], + *, + overwrite: bool = False, + truncation: float = 1.0, + ) -> npt.NDArray[np.double]: + """Assimilate data and return an updated ensemble X_posterior. + + X_posterior = smoother.assimilate(X, Y) + + Parameters + ---------- + X : np.ndarray + A 2D array of shape (num_parameters, ensemble_size). Each row + corresponds to a parameter in the model, and each column corresponds + to an ensemble member (realization). + Y : np.ndarray + 2D array of shape (num_observations, ensemble_size), containing + responses when evaluating the model at X. In other words, Y = g(X), + where g is the forward model. + overwrite : bool + If True, then arguments X and Y may be overwritten and mutated. + If False, then the method will not mutate inputs in any way. + Setting this to True might save memory. 
+ truncation : float + How large a fraction of the singular values to keep in the inversion + routine. Must be a float in the range (0, 1]. A lower number means + a more approximate answer and a slightly faster computation. + + Returns + ------- + X_posterior : np.ndarray + 2D array of shape (num_parameters, ensemble_size). + + """ + if self.iteration >= self.num_assimilations(): + raise Exception("No more assimilation steps to run.") + + # Verify shapes + _, num_ensemble = X.shape + num_outputs, num_emsemble2 = Y.shape + assert ( + num_ensemble == num_emsemble2 + ), "Number of ensemble members in X and Y must match" + if not np.issubdtype(X.dtype, np.floating): + raise TypeError("Argument `X` must contain floats") + if not np.issubdtype(Y.dtype, np.floating): + raise TypeError("Argument `Y` must contain floats") + + assert 0 < truncation <= 1.0 + + # Do not overwrite input arguments + if not overwrite: + X, Y = np.copy(X), np.copy(Y) + + # Line 2 (b) in the description of ES-MDA in the 2013 Emerick paper + + # Draw samples from zero-centered multivariate normal with cov=alpha * C_D, + # and add them to the observations. Notice that + # if C_D = L L.T by the cholesky factorization, then drawing y from + # a zero cented normal means that y := L @ z, where z ~ norm(0, 1) + # Therefore, scaling C_D by alpha is equivalent to scaling L with sqrt(alpha) + D = self.perturb_observations( + ensemble_size=num_ensemble, alpha=self.alpha[self.iteration] + ) + assert D.shape == (num_outputs, num_ensemble) + + # Line 2 (c) in the description of ES-MDA in the 2013 Emerick paper + # Choose inversion method, e.g. 'exact'. The inversion method computes + # C_MD @ sp.linalg.inv(C_DD + C_D_alpha) @ (D - Y) + inversion_func = self._inversion_methods[self.inversion] + + # Update and return + X += inversion_func( + alpha=self.alpha[self.iteration], + C_D=self.C_D, + D=D, + Y=Y, + X=X, + truncation=truncation, + ) + + self.iteration += 1 + return X + + def compute_transition_matrix( + self, + Y: npt.NDArray[np.double], + *, + alpha: float, + truncation: float = 1.0, + ) -> npt.NDArray[np.double]: + """Return a matrix T such that X_posterior = X_prior + X_prior @ T. + + The purpose of this method is to facilitate row-by-row, or batch-by-batch, + updates of X. This is useful if X is too large to fit in memory. + + Parameters + ---------- + Y : np.ndarray + 2D array of shape (num_observations, ensemble_size), containing + responses when evaluating the model at X. In other words, Y = g(X), + where g is the forward model. + alpha : float + The covariance inflation factor. The sequence of alphas should + obey the equation sum_i (1/alpha_i) = 1. However, this is NOT enforced + in this method call. The user/caller is responsible for this. + truncation : float + How large a fraction of the singular values to keep in the inversion + routine. Must be a float in the range (0, 1]. A lower number means + a more approximate answer and a slightly faster computation. + + Returns + ------- + T : np.ndarray + A matrix T such that X_posterior = X_prior + X_prior @ T. + It has shape (num_ensemble_members, num_ensemble_members). 
+ """ + + # Recall the update equation: + # X += C_MD @ (C_DD + alpha * C_D)^(-1) @ (D - Y) + # X += X @ center(Y).T / (N-1) @ (C_DD + alpha * C_D)^(-1) @ (D - Y) + # We form T := center(Y).T / (N-1) @ (C_DD + alpha * C_D)^(-1) @ (D - Y), + # so that + # X_new = X_old + X_old @ T + # or + # X += X @ T + + D = self.perturb_observations(ensemble_size=Y.shape[1], alpha=alpha) + inversion_func = self._inversion_methods[self.inversion] + return inversion_func( + alpha=alpha, + C_D=self.C_D, + D=D, + Y=Y, + X=None, # We don't need X to compute the factor T + truncation=truncation, + return_T=True, # Ensures that we don't need X + ) + + +if __name__ == "__main__": + import pytest + + pytest.main( + args=[ + __file__, + "--doctest-modules", + "-v", + ] + ) From 86121b2f1eb5e9787ca8c6920a39970c60e784bc Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 5 Jan 2024 11:22:16 +0100 Subject: [PATCH 7/9] draft of lasso --- src/iterative_ensemble_smoother/localized.py | 281 +++++++------------ 1 file changed, 103 insertions(+), 178 deletions(-) diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py index 8eb8fb95..1b267c95 100644 --- a/src/iterative_ensemble_smoother/localized.py +++ b/src/iterative_ensemble_smoother/localized.py @@ -22,7 +22,99 @@ from iterative_ensemble_smoother.utils import sample_mvnormal -class LassoESMDA(BaseESMDA): +import numpy as np +from sklearn.linear_model import LassoCV +from sklearn.linear_model import MultiTaskLassoCV + + +def linear_l1_regression(X, Y): + """ + Performs LASSO regression for each response in Y against predictors in U, + constructing a sparse matrix of regression coefficients. + + The function scales features in U using standard scaling before applying + LASSO, then re-scales the coefficients to the original scale of U. This + extracts the effect of each feature in U on each response in Y, ignoring + intercepts and constant terms. + + Parameters + ---------- + U : np.ndarray + 2D array of predictors with shape (num_variables, ensemble_size). + Y : np.ndarray + 2D array of responses with shape (num_variables, ensemble_size). + + + Examples + -------- + >>> K = np.array([[-0.2, -0.9], + ... [-1.5, 0.2], + ... [-0.2, 0.1], + ... [ 0.8, -0.4]]) + >>> X = np.array([[ 0.8, 2. , -0.8], + ... [ 0.4, 1.7, -0.3]]) + >>> Y = K @ X + >>> K_est = linear_l1_regression(X, Y) + >>> np.allclose(K, K_est, atol=0.005) + True + + """ + # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py + + # The goal is to solve K @ X = Y for K, or equivalently X.T @ K.T = Y.T + num_parameters, ensemble_size1 = X.shape + num_observations, ensemble_size2 = Y.shape + + assert ensemble_size1 == ensemble_size2, "Number of cols in X and Y must be equal" + + # Scale to unit standard deviation. 
We do this because of numerics in Lasso + stds_X = np.std(X, axis=1) + stds_Y = np.std(Y, axis=1) + X = X / stds_X[:, np.newaxis] + Y = Y / stds_Y[:, np.newaxis] + + # Loop over features + coefs = [] + cv = min(5, ensemble_size1) + alphas = np.logspace(-8, 8, num=17) + for j in range(num_observations): + y_j = Y[j, :] + + # Learn individual regularization and fit + model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=cv) + model_cv.fit(X.T, y_j) # model_cv.alpha_ + + # Alphas for the next iteration + # alpha = model_cv.alpha_ * 0.5 * alphas[len(alphas)//2] * 0.5 + # alphas = np.logspace(-2, 2, num=5) * model_cv.alpha_ + + # Scale back the coefficients + coef_scale = stds_Y[j] / stds_X + coefs.append(model_cv.coef_ * coef_scale) + + K = np.vstack(coefs) + assert K.shape == (num_observations, num_parameters) + return K + + # ======================================================= + + # Alternative computation using MultiTaskLasso + + # Create a lasso model + lasso = MultiTaskLassoCV(fit_intercept=False, cv=cv) + lasso.fit(X.T, Y.T) + + # Get the matrix K + K = lasso.coef_ + assert K.shape == (num_observations, num_parameters) + + K = K / stds_X[np.newaxis, :] + K = K * stds_Y[:, np.newaxis] + + return K + + +class LassoES(BaseESMDA): """ Implement an Ensemble Smoother with Multiple Data Assimilation (ES-MDA). @@ -40,119 +132,30 @@ class LassoESMDA(BaseESMDA): observations : np.ndarray 1D array of shape (num_observations,) representing real-world observations. This is d_obs in Emerick (2013). - alpha : int or 1D np.ndarray, optional - Multiplicative factor for the covariance. - If an integer `alpha` is given, an array with length `alpha` and - elements `alpha` is constructed. If an 1D array is given, it is - normalized so sum_i 1/alpha_i = 1 and used. The default is 5, which - corresponds to np.array([5, 5, 5, 5, 5]). seed : integer or numpy.random._generator.Generator, optional A seed or numpy.random._generator.Generator used for random number generation. The argument is passed to numpy.random.default_rng(). The default is None. - inversion : str, optional - Which inversion method to use. The default is "exact". - See the dictionary ESMDA._inversion_methods for more information. - - Examples - -------- - >>> covariance = np.diag([1, 1, 1]) - >>> observations = np.array([1, 2, 3]) - >>> esmda = ESMDA(covariance, observations) """ - # Available inversion methods. 
The inversion methods all compute - # C_MD @ (C_DD + alpha * C_D)^(-1) @ (D - Y) - _inversion_methods = { - "exact": inversion_exact_cholesky, - "subspace": inversion_subspace, - } - def __init__( self, covariance: npt.NDArray[np.double], observations: npt.NDArray[np.double], - alpha: Union[int, npt.NDArray[np.double]] = 5, seed: Union[np.random._generator.Generator, int, None] = None, - inversion: str = "exact", ) -> None: """Initialize the instance.""" super().__init__(covariance=covariance, observations=observations, seed=seed) - if not ( - (isinstance(alpha, np.ndarray) and alpha.ndim == 1) - or isinstance(alpha, numbers.Integral) - ): - raise TypeError("Argument `alpha` must be an integer or a 1D NumPy array.") - - if not isinstance(inversion, str): - raise TypeError( - "Argument `inversion` must be a string in " - f"{tuple(self._inversion_methods.keys())}, but got {inversion}" - ) - if inversion not in self._inversion_methods.keys(): - raise ValueError( - "Argument `inversion` must be a string in " - f"{tuple(self._inversion_methods.keys())}, but got {inversion}" - ) - - # Store data - self.inversion = inversion - - # Alpha can either be an integer (num iterations) or a list of weights - if isinstance(alpha, np.ndarray) and alpha.ndim == 1: - self.alpha = normalize_alpha(alpha) - elif isinstance(alpha, numbers.Integral): - self.alpha = normalize_alpha(np.ones(alpha)) - assert np.allclose(self.alpha, normalize_alpha(self.alpha)) - else: - raise TypeError("Alpha must be integer or 1D array.") - - def num_assimilations(self) -> int: - return len(self.alpha) - def assimilate( self, X: npt.NDArray[np.double], Y: npt.NDArray[np.double], *, - overwrite: bool = False, - truncation: float = 1.0, + alpha: float = 1.0, ) -> npt.NDArray[np.double]: - """Assimilate data and return an updated ensemble X_posterior. - - X_posterior = smoother.assimilate(X, Y) - - Parameters - ---------- - X : np.ndarray - A 2D array of shape (num_parameters, ensemble_size). Each row - corresponds to a parameter in the model, and each column corresponds - to an ensemble member (realization). - Y : np.ndarray - 2D array of shape (num_observations, ensemble_size), containing - responses when evaluating the model at X. In other words, Y = g(X), - where g is the forward model. - overwrite : bool - If True, then arguments X and Y may be overwritten and mutated. - If False, then the method will not mutate inputs in any way. - Setting this to True might save memory. - truncation : float - How large a fraction of the singular values to keep in the inversion - routine. Must be a float in the range (0, 1]. A lower number means - a more approximate answer and a slightly faster computation. - - Returns - ------- - X_posterior : np.ndarray - 2D array of shape (num_parameters, ensemble_size). - - """ - if self.iteration >= self.num_assimilations(): - raise Exception("No more assimilation steps to run.") - # Verify shapes _, num_ensemble = X.shape num_outputs, num_emsemble2 = Y.shape @@ -164,96 +167,18 @@ def assimilate( if not np.issubdtype(Y.dtype, np.floating): raise TypeError("Argument `Y` must contain floats") - assert 0 < truncation <= 1.0 - - # Do not overwrite input arguments - if not overwrite: - X, Y = np.copy(X), np.copy(Y) - - # Line 2 (b) in the description of ES-MDA in the 2013 Emerick paper - - # Draw samples from zero-centered multivariate normal with cov=alpha * C_D, - # and add them to the observations. 
Notice that - # if C_D = L L.T by the cholesky factorization, then drawing y from - # a zero cented normal means that y := L @ z, where z ~ norm(0, 1) - # Therefore, scaling C_D by alpha is equivalent to scaling L with sqrt(alpha) - D = self.perturb_observations( - ensemble_size=num_ensemble, alpha=self.alpha[self.iteration] - ) - assert D.shape == (num_outputs, num_ensemble) - - # Line 2 (c) in the description of ES-MDA in the 2013 Emerick paper - # Choose inversion method, e.g. 'exact'. The inversion method computes - # C_MD @ sp.linalg.inv(C_DD + C_D_alpha) @ (D - Y) - inversion_func = self._inversion_methods[self.inversion] - - # Update and return - X += inversion_func( - alpha=self.alpha[self.iteration], - C_D=self.C_D, - D=D, - Y=Y, - X=X, - truncation=truncation, + # Center X and Y + X_center = X - np.mean(X, axis=1, keepdims=True) + Y_center = Y - np.mean(Y, axis=1, keepdims=True) + Y_noise = np.sqrt(alpha) * sample_mvnormal( + C_dd_cholesky=self.C_D_L, rng=self.rng, size=num_ensemble ) - self.iteration += 1 - return X + # Compute regularized Kalman gain matrix + K = linear_l1_regression(X=Y_center + Y_noise, Y=X_center) - def compute_transition_matrix( - self, - Y: npt.NDArray[np.double], - *, - alpha: float, - truncation: float = 1.0, - ) -> npt.NDArray[np.double]: - """Return a matrix T such that X_posterior = X_prior + X_prior @ T. - - The purpose of this method is to facilitate row-by-row, or batch-by-batch, - updates of X. This is useful if X is too large to fit in memory. - - Parameters - ---------- - Y : np.ndarray - 2D array of shape (num_observations, ensemble_size), containing - responses when evaluating the model at X. In other words, Y = g(X), - where g is the forward model. - alpha : float - The covariance inflation factor. The sequence of alphas should - obey the equation sum_i (1/alpha_i) = 1. However, this is NOT enforced - in this method call. The user/caller is responsible for this. - truncation : float - How large a fraction of the singular values to keep in the inversion - routine. Must be a float in the range (0, 1]. A lower number means - a more approximate answer and a slightly faster computation. - - Returns - ------- - T : np.ndarray - A matrix T such that X_posterior = X_prior + X_prior @ T. - It has shape (num_ensemble_members, num_ensemble_members). 
- """ - - # Recall the update equation: - # X += C_MD @ (C_DD + alpha * C_D)^(-1) @ (D - Y) - # X += X @ center(Y).T / (N-1) @ (C_DD + alpha * C_D)^(-1) @ (D - Y) - # We form T := center(Y).T / (N-1) @ (C_DD + alpha * C_D)^(-1) @ (D - Y), - # so that - # X_new = X_old + X_old @ T - # or - # X += X @ T - - D = self.perturb_observations(ensemble_size=Y.shape[1], alpha=alpha) - inversion_func = self._inversion_methods[self.inversion] - return inversion_func( - alpha=alpha, - C_D=self.C_D, - D=D, - Y=Y, - X=None, # We don't need X to compute the factor T - truncation=truncation, - return_T=True, # Ensures that we don't need X - ) + D = self.perturb_observations(ensemble_size=num_ensemble, alpha=alpha) + return X + K @ (D - Y) if __name__ == "__main__": From 01f761cba1016cbffe4bf7fb87a7282264c70b7b Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 5 Jan 2024 11:31:05 +0100 Subject: [PATCH 8/9] comments --- src/iterative_ensemble_smoother/localized.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py index 1b267c95..e5767533 100644 --- a/src/iterative_ensemble_smoother/localized.py +++ b/src/iterative_ensemble_smoother/localized.py @@ -170,12 +170,23 @@ def assimilate( # Center X and Y X_center = X - np.mean(X, axis=1, keepdims=True) Y_center = Y - np.mean(Y, axis=1, keepdims=True) - Y_noise = np.sqrt(alpha) * sample_mvnormal( + + # We have R @ R.T = C_D + R = np.sqrt(alpha) * sample_mvnormal( C_dd_cholesky=self.C_D_L, rng=self.rng, size=num_ensemble ) - - # Compute regularized Kalman gain matrix - K = linear_l1_regression(X=Y_center + Y_noise, Y=X_center) + + # L = sqrt((N - 1) / N) + L = np.sqrt((num_ensemble - 1)/ num_ensemble) * R + + # Compute regularized Kalman gain matrix matrix by solving + # (Y + L) K = X + # The definition of K is + # (cov(Y, Y) + covariance) K = cov(X, Y), which is approximately + # (Y @ Y.T + L @ L.T) K = X @ Y.T, [X and Y are centered] which is approximately + # (Y + L) @ (Y + L).T @ K = X @ Y.T, which is approximately + # (Y + L) @ K = X + K = linear_l1_regression(X=Y_center + L, Y=X_center) D = self.perturb_observations(ensemble_size=num_ensemble, alpha=alpha) return X + K @ (D - Y) From c54603772f67e8911d249c33678a2d487eca6109 Mon Sep 17 00:00:00 2001 From: Tommy Odland Date: Fri, 5 Jan 2024 13:57:06 +0100 Subject: [PATCH 9/9] finished initial lasso implementation --- docs/source/Lasso.py | 129 ++++--------------- lasso.md | 2 +- src/iterative_ensemble_smoother/localized.py | 69 +++------- 3 files changed, 46 insertions(+), 154 deletions(-) diff --git a/docs/source/Lasso.py b/docs/source/Lasso.py index b09173c4..7a02df29 100644 --- a/docs/source/Lasso.py +++ b/docs/source/Lasso.py @@ -56,7 +56,7 @@ rng = np.random.default_rng(42) # Dimensionality of the problem -num_parameters = 100 +num_parameters = 75 num_observations = 50 num_ensemble = 25 prior_std = 1 @@ -65,13 +65,13 @@ alpha = 1 # Effect size (coefficients in sparse mapping A) -effect_size = 1000 +effect_size = 1 # %% [markdown] # ## Create problem data - sparse tridiagonal matrix $A$ # %% -diagonal = effect_size * np.ones(min(num_parameters, num_observations)) +diagonal = effect_size * np.ones(max(num_parameters, num_observations)) # Create a tridiagonal matrix (easiest with scipy) A = sp.sparse.diags( @@ -174,71 +174,11 @@ def g(X): from sklearn.linear_model import LassoCV from sklearn.preprocessing import StandardScaler - -def linear_l1_regression(U, Y): - """ - Performs 
LASSO regression for each response in Y against predictors in U, - constructing a sparse matrix of regression coefficients. - - The function scales features in U using standard scaling before applying - LASSO, then re-scales the coefficients to the original scale of U. This - extracts the effect of each feature in U on each response in Y, ignoring - intercepts and constant terms. - - Parameters - ---------- - U : np.ndarray - 2D array of predictors with shape (n, p). - Y : np.ndarray - 2D array of responses with shape (n, m). - - Returns - ------- - H_sparse : scipy.sparse.csc_matrix - Sparse matrix (m, p) with re-scaled LASSO regression coefficients for - each response in Y. - - Raises - ------ - AssertionError - If the number of samples in U and Y do not match, or if the shape of - H_sparse is not (m, p). - """ - # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py - - n, p = U.shape # p: number of features - n_y, m = Y.shape # m: number of y responses - - # Assert that the first dimension of U and Y are the same - assert n == n_y, "Number of samples in U and Y must be the same" - - scaler_u = StandardScaler() - U_scaled = scaler_u.fit_transform(U) - - scaler_y = StandardScaler() - Y_scaled = scaler_y.fit_transform(Y) - - # Loop over features - coefs = [] - for j in range(m): - y_j = Y_scaled[:, j] - - # Learn individual regularization and fit - alphas = np.logspace(-8, 8, num=32) - model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=5) - model_cv.fit(U_scaled, y_j) - - coef_scale = scaler_y.scale_[j] / scaler_u.scale_ - coefs.append(model_cv.coef_ * coef_scale) - - K = np.vstack(coefs) - assert K.shape == (m, p) - - return K +from iterative_ensemble_smoother.localized import linear_l1_regression Y = g(X) -K = linear_l1_regression(X.T, Y.T) +K = linear_l1_regression(X, Y) plt.imshow(K) plt.show() @@ -262,6 +202,7 @@ def solve_esmda(X, Y, covariance, observations, seed=1): cov_YY = empirical_cross_covariance(Y, Y) # Compute the update using the exact ESMDA approach + assert covariance.ndim == 1 K = cov_XY @ sp.linalg.inv(cov_YY + np.diag(covariance)) return X + K @ (D - Y) @@ -290,9 +231,10 @@ def solve_projected(X, Y, covariance, observations, seed=1): cov_XY = empirical_cross_covariance(X, Y) cov_YY = empirical_cross_covariance(Y, Y) - # Compute the update - # TODO: N-1 vs N here. - K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / (N-1)) + # covariance ~ S @ S.T / N + + # Compute the update. 
Divide by N since we know the mean is zero + K = cov_XY @ sp.linalg.inv(cov_YY + S @ S.T / N) return X + K @ (D - Y) @@ -310,13 +252,21 @@ def solve_lstsq(X, Y, covariance, observations, seed=1): N = X.shape[1] D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) + # Center X and Y + X_center = X - np.mean(X, axis=1, keepdims=True) + Y_center = Y - np.mean(Y, axis=1, keepdims=True) + # Instead of using the covariance matrix, sample from it + # covariance / (N-1) ~ + + N = X.shape[1] S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) + S = S * np.sqrt((N - 1) / N) # Divide by N since we know that the mean is zero - Y_noisy = Y + S + Y_noisy = Y_center + S # Compute the update by solving K @ Y = X - K, *_ = sp.linalg.lstsq(Y_noisy.T, X.T) + K, *_ = sp.linalg.lstsq(Y_noisy.T, X_center.T) K = K.T return X + K @ (D - Y) @@ -338,7 +288,7 @@ def solve_lasso_direct(X, Y, covariance, observations, seed=1): D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) # Approximate forward model with H - H = linear_l1_regression(X.T, Y.T) + H = linear_l1_regression(X, Y) cov_XX = empirical_cross_covariance(X, X) # Compute the update @@ -349,25 +299,16 @@ def solve_lasso_direct(X, Y, covariance, observations, seed=1): def solve_lasso(X, Y, covariance, observations, seed=1): """Solve K @ Y = X using Lasso.""" + from iterative_ensemble_smoother.localized import LassoES + # Create a smoother to get D - smoother = ESMDA( + smoother = LassoES( covariance=covariance, observations=observations, - alpha=1, seed=seed, ) - N = X.shape[1] - D = smoother.perturb_observations(ensemble_size=N, alpha=1.0) - # Instead of using the covariance matrix, sample from it - S = sample_mvnormal(C_dd_cholesky=smoother.C_D_L, rng=smoother.rng, size=N) - - Y_noisy = Y + S # Seems to work better with Y instead of Y_noisy - - # Compute the update - K = linear_l1_regression(Y_noisy.T, X.T) - - return X + K @ (D - Y) + return smoother.assimilate(X, Y) def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): @@ -464,23 +405,3 @@ def solve_adaptive_ESMDA(X, Y, covariance, observations, seed=1): # %% # %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% - -# %% diff --git a/lasso.md b/lasso.md index 82f416f2..0fb1e108 100644 --- a/lasso.md +++ b/lasso.md @@ -132,4 +132,4 @@ $$\text{cov}(y, y ) = H \text{cov}(x, x) H^T.$$ - [Issue 6599 on the Lasso in ERT](https://github.com/equinor/ert/issues/6599) - [Presentation: Ensemblized linear least squares (LLS)](https://ncda-fs.web.norce.cloud/WS2023/raanes.pdf) - [Paper: Ensemble Smoother with Multiple Data Assimilation](http://dx.doi.org/10.1016/j.cageo.2012.03.011) -- [Paper: Ensemble transport smoothing. Part I: Unified framework](https://arxiv.org/pdf/2210.17000.pdf) +- [Paper: Ensemble transport smoothing. 
Part I: Unified framework](https://arxiv.org/pdf/2210.17000.pdf) \ No newline at end of file diff --git a/src/iterative_ensemble_smoother/localized.py b/src/iterative_ensemble_smoother/localized.py index e5767533..6c09d669 100644 --- a/src/iterative_ensemble_smoother/localized.py +++ b/src/iterative_ensemble_smoother/localized.py @@ -1,28 +1,13 @@ # -*- coding: utf-8 -*- -""" -Created on Fri Jan 5 09:27:34 2024 - -@author: TODL -""" from iterative_ensemble_smoother.esmda import BaseESMDA -import numbers -from abc import ABC from typing import Union import numpy as np import numpy.typing as npt -import scipy as sp # type: ignore -from iterative_ensemble_smoother.esmda_inversion import ( - inversion_exact_cholesky, - inversion_subspace, - normalize_alpha, -) from iterative_ensemble_smoother.utils import sample_mvnormal - -import numpy as np from sklearn.linear_model import LassoCV from sklearn.linear_model import MultiTaskLassoCV @@ -60,6 +45,8 @@ def linear_l1_regression(X, Y): """ # https://github.com/equinor/graphite-maps/blob/main/graphite_maps/linear_regression.py + # An option is to use MultiTaskLasso too, but not exactly the same objective + # https://scikit-learn.org/stable/modules/linear_model.html#multi-task-lasso # The goal is to solve K @ X = Y for K, or equivalently X.T @ K.T = Y.T num_parameters, ensemble_size1 = X.shape @@ -76,17 +63,16 @@ def linear_l1_regression(X, Y): # Loop over features coefs = [] cv = min(5, ensemble_size1) - alphas = np.logspace(-8, 8, num=17) - for j in range(num_observations): - y_j = Y[j, :] + alphas = np.logspace(-6, 6, num=17) + + # The matrix K in K @ X = Y is built up row by row + for j, y_j in enumerate(Y): # Loop over rows in Y # Learn individual regularization and fit model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=cv) model_cv.fit(X.T, y_j) # model_cv.alpha_ - - # Alphas for the next iteration - # alpha = model_cv.alpha_ * 0.5 * alphas[len(alphas)//2] * 0.5 - # alphas = np.logspace(-2, 2, num=5) * model_cv.alpha_ + # TODO: Consider searching smarter for alphas, e.g. momentum or + # searching within 1-2 orders of magnitude of previous alpha? 
         # Scale back the coefficients
         coef_scale = stds_Y[j] / stds_X
         coefs.append(model_cv.coef_ * coef_scale)
@@ -96,23 +82,6 @@ def linear_l1_regression(X, Y):
     assert K.shape == (num_observations, num_parameters)
     return K
 
-    # =======================================================
-
-    # Alternative computation using MultiTaskLasso
-
-    # Create a lasso model
-    lasso = MultiTaskLassoCV(fit_intercept=False, cv=cv)
-    lasso.fit(X.T, Y.T)
-
-    # Get the matrix K
-    K = lasso.coef_
-    assert K.shape == (num_observations, num_parameters)
-
-    K = K / stds_X[np.newaxis, :]
-    K = K * stds_Y[:, np.newaxis]
-
-    return K
-
 
 class LassoES(BaseESMDA):
     """
@@ -156,6 +125,7 @@ def assimilate(
         *,
         alpha: float = 1.0,
     ) -> npt.NDArray[np.double]:
+        # Verify shapes
         _, num_ensemble = X.shape
         num_outputs, num_emsemble2 = Y.shape
 
@@ -171,22 +141,23 @@ def assimilate(
         # Center X and Y
         X_center = X - np.mean(X, axis=1, keepdims=True)
         Y_center = Y - np.mean(Y, axis=1, keepdims=True)
 
-        # We have R @ R.T = C_D
+        # We have R @ R.T / N ~ C_D
        R = np.sqrt(alpha) * sample_mvnormal(
            C_dd_cholesky=self.C_D_L, rng=self.rng, size=num_ensemble
        )
-        # L = sqrt((N - 1) / N)
-        L = np.sqrt((num_ensemble - 1)/ num_ensemble) * R
+        # S = R * sqrt((N - 1) / N), so that S @ S.T / (N - 1) ~ alpha * covariance
+        S = np.sqrt((num_ensemble - 1) / num_ensemble) * R
 
-        # Compute regularized Kalman gain matrix matrix by solving
-        # (Y + L) K = X
-        # The definition of K is
+        # Compute the regularized Kalman gain matrix by solving:
+        # (Y + S) K = X
+        # The definition of K is:
         # (cov(Y, Y) + covariance) K = cov(X, Y), which is approximately
-        # (Y @ Y.T + L @ L.T) K = X @ Y.T, [X and Y are centered] which is approximately
-        # (Y + L) @ (Y + L).T @ K = X @ Y.T, which is approximately
-        # (Y + L) @ K = X
-        K = linear_l1_regression(X=Y_center + L, Y=X_center)
+        # (Y @ Y.T + S @ S.T) K = X @ Y.T, [X and Y are centered] which is approximately
+        # (Y + S) @ (Y + S).T @ K = X @ Y.T, which is approximately
+        # (Y + S) @ K = X
+        # TODO: Write this out properly.
+        K = linear_l1_regression(X=(Y_center + S), Y=X_center)
 
         D = self.perturb_observations(ensemble_size=num_ensemble, alpha=alpha)
         return X + K @ (D - Y)
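
A minimal usage sketch of the new `LassoES` class, following the way `solve_lasso` in `docs/source/Lasso.py` constructs it. It assumes the `covariance` / `observations` / `seed` arguments inherited from `BaseESMDA` and that a 1D diagonal `covariance` is accepted, as in the notebook; the toy forward model, dimensions, and variable names are invented for illustration, so treat this as a smoke test rather than part of the patches above.

```python
import numpy as np

from iterative_ensemble_smoother.localized import LassoES, linear_l1_regression

rng = np.random.default_rng(42)
num_parameters, num_observations, num_ensemble = 10, 6, 20

# Toy Gauss-linear forward model g(x) = A @ x, applied columnwise to the ensemble
A = rng.standard_normal(size=(num_observations, num_parameters))

X = rng.standard_normal(size=(num_parameters, num_ensemble))  # prior ensemble
Y = A @ X                                                     # response ensemble
covariance = np.ones(num_observations)                        # diagonal of C_D (assumed 1D is accepted)
x_true = rng.standard_normal(size=num_parameters)
observations = A @ x_true + rng.standard_normal(size=num_observations)

# One Lasso-regularized update
smoother = LassoES(covariance=covariance, observations=observations, seed=42)
X_posterior = smoother.assimilate(X, Y)
assert X_posterior.shape == X.shape

# The regularized gain can also be inspected directly: K solves K @ Y ~ X row by row
K = linear_l1_regression(X=Y, Y=X)  # first argument is the predictors, second the targets
assert K.shape == (num_parameters, num_observations)
```

The assertions only check shapes, which is enough to confirm that the class wires together; on a Gauss-linear problem like this one, the posterior ensemble should also move toward the least-squares solution as the ensemble grows.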
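
On the `TODO` about searching smarter for the `LassoCV` regularization strength: one possible refinement, sketched below, is to re-center each row's `alphas` grid on the previous row's selected `alpha_`, so that only the first row pays for the broad 12-decade search. The helper name `fit_rows_with_warm_alpha_grid` is hypothetical, the inputs are assumed to be standardized and processed row by row as in `linear_l1_regression`, and the idea is untested here; it only illustrates the narrower-grid strategy.

```python
import numpy as np
from sklearn.linear_model import LassoCV


def fit_rows_with_warm_alpha_grid(X_scaled, Y_scaled, cv=5):
    """One Lasso per row of Y_scaled (num_observations, N) against X_scaled (num_parameters, N).

    Both inputs are assumed standardized. Returns coefficients with shape
    (num_observations, num_parameters), like the loop in linear_l1_regression.
    """
    coefs = []
    alpha_prev = None
    for y_j in Y_scaled:  # loop over response rows
        if alpha_prev is None:
            # Broad first search, same grid as in the patch
            alphas = np.logspace(-6, 6, num=17)
        else:
            # Narrow band: two orders of magnitude around the previous optimum
            alphas = alpha_prev * np.logspace(-2, 2, num=9)
        model_cv = LassoCV(alphas=alphas, fit_intercept=False, max_iter=10_000, cv=cv)
        model_cv.fit(X_scaled.T, y_j)
        alpha_prev = model_cv.alpha_
        coefs.append(model_cv.coef_)
    return np.vstack(coefs)
```

Whether this pays off depends on how smoothly the optimal penalty varies across responses; if neighbouring rows have very different sparsity, the broad `np.logspace(-6, 6, num=17)` grid remains the safer default.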