From 08ebbc6299c8c4f68ba7a87689c3e75b06482a7a Mon Sep 17 00:00:00 2001
From: Oliver Dunbar <47412152+odunbar@users.noreply.github.com>
Date: Fri, 12 Apr 2024 18:05:26 -0700
Subject: [PATCH] Changing how we deal with Regularization (#55)

* change Matrix inversions for linear system
* correct docs to new inversion
* updated deprecated examples
* works well, but why
* allow users to invert input regularization
* format
* nicer formatting
* test pass, resolve typo, and format
* remove comment blocks
---
 docs/src/parallelism.md                       |   6 +-
 docs/src/setting_up_scalar.md                 |   4 +-
 docs/src/setting_up_vector.md                 |  18 +-
 .../1d_to_1d_regression_direct.jl             |  11 +-
 .../1d_to_1d_regression_direct_withcov.jl     |   6 +-
 .../nd_to_1d_regression_direct.jl             |  11 +-
 .../nd_to_1d_regression_direct_matchingcov.jl |   6 +-
 .../nd_to_1d_regression_direct_withcov.jl     |  11 +-
 .../nd_to_md_regression_direct_withcov.jl     | 180 +++++++---------
 src/Methods.jl                                | 195 +++++++-----------
 src/Utilities.jl                              |  27 ++-
 test/Methods/runtests.jl                      |  27 ++-
 test/Utilities/runtests.jl                    |   1 -
 13 files changed, 215 insertions(+), 288 deletions(-)

diff --git a/docs/src/parallelism.md b/docs/src/parallelism.md
index fde60cf..d5bb5a2 100644
--- a/docs/src/parallelism.md
+++ b/docs/src/parallelism.md
@@ -3,16 +3,16 @@
 ### Explicit bottlenecks
 - By far the highest computational demand for high-dimensional and/or large data systems is the building of the system matrix, particularly the multiplication ``\Phi^T\Phi`` (for scalar) and ``\Phi_{n,i,m}\Phi_{n,j,m}``. These can be accelerated by multithreading (see below)
 - For large number of features, the inversion `inv(factorization(system_matrix))` is noticeable, though typically still small when in the regime of `n_features << n_samples * output_dim`.
-- For the vector case, the use of non-diagonal ``\Lambda_{i,j}`` for regularization cannot take advantage of certain matrix-tensor identities and may also cause substantial slow-down. (currently)
+- For the vector case, the square output-dimensional regularization matrix ``\Lambda`` must be inverted. For high-dimensional output spaces, a diagonal approximation avoids this cost.
 - Prediction bottlenecks are largely due to allocations and matrix multiplications. Please see our `predict!()` methods which allow for users to pass in preallocated of arrays. This is very beneficial for repeated predictions.
 - For systems without enough regularization, positive definiteness may need to be enforced. If done too often, it has non-negligible cost, as it involves calculating eigenvalues of the non p.d. matrix (particularly `abs(min(eigval)`) that is then added to the diagonal. It is better to add more regularization into ``\Lambda``

 ### Implicit bottlenecks
-- The optimization of hyperparameters is a costly operation that may require construction and evaluation of thousands of `RandomFeatureMethod`s. The dimensionality (i.e. complexity) of this task will depend on how many free parameters are taken to be within a distribution though. ``\mathcal{O}(1000s)`` parameters may take several hours to optimize (on multiple threads).
+- The optimization of hyperparameters is a costly operation that may require construction and evaluation of thousands of `RandomFeatureMethod`s. The dimensionality (i.e. complexity) of this task depends on how many free parameters are taken to lie within a distribution. ``\mathcal{O}(1000s)`` parameters may take several hours to optimize (on multiple threads).
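The `predict!()` bullet above mentions preallocated storage; the following sketch illustrates that calling pattern, mirroring the buffer shapes used in the `nd_to_md` example later in this patch. It assumes `rfm`, `fitted_features`, `output_dim`, `n_features` and an iterable `input_batches` of `input_dim x n_test` matrices already exist, so treat it as a sketch of the storage reuse rather than a complete script.

```julia
# minimal sketch of buffer reuse for repeated predictions (surrounding variables assumed to exist)
n_test     = size(first(input_batches), 2)
mean_store = zeros(output_dim, n_test)              # overwritten in-place with predictive means
cov_store  = zeros(output_dim, output_dim, n_test)  # overwritten in-place with predictive covariances
buffer     = zeros(n_test, output_dim, n_features)  # scratch space reused on every call

for inputs in input_batches
    predict!(rfm, fitted_features, DataContainer(inputs), mean_store, cov_store, buffer)
    # consume mean_store / cov_store here, before the next iteration overwrites them
end
```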
# Parallelism/memory
 - We make use of [`Tullio.jl`](https://github.com/mcabbott/Tullio.jl) which comes with in-built memory management. We are phasing out our own batches in favour of using this for now.
-- [`Tullio.jl`(https://github.com/mcabbott/Tullio.jl) comes with multithreading routines, Simply call the code with `julia --project -t n_threads` to take advantage of this. Depending on problem size you may wish to use your own external threading, Tullio will greedily steal threads in this case. To prevent this interference we provide a keyword argument:
+- [`Tullio.jl`](https://github.com/mcabbott/Tullio.jl) comes with multithreading routines. Simply call the code with `julia --project -t n_threads` to take advantage of this. Depending on problem size you may wish to use your own external threading; Tullio will greedily steal threads in this case. To prevent this interference we provide a keyword argument:
 ```julia
 RandomFeatureMethod(... ; tullio_threading=false) # serial threading during the build and fit! methods
 predict(...; tullio_threading=false) # serial threading for prediction
diff --git a/docs/src/setting_up_scalar.md b/docs/src/setting_up_scalar.md
index dd09909..bd16f80 100644
--- a/docs/src/setting_up_scalar.md
+++ b/docs/src/setting_up_scalar.md
@@ -111,9 +111,9 @@ The keyword `feature_parameters = Dict("sigma" => a)`, can be included to set th
 The `RandomFeatureMethod` sets up the training problem to learn coefficients ``\beta\in\mathbb{R}^m`` from input-output training data ``(x,y)=\{(x_i,y_i)\}_{i=1}^n``, ``y_i \in \mathbb{R}`` and parameters ``\theta = \{\theta_j\}_{j=1}^m``:
 ```math
-(\frac{1}{m}\Phi^T(x;\theta) \Phi(x;\theta) + \lambda I) \beta = \Phi^T(x;\theta)y
+(\frac{1}{\lambda m}\Phi^T(x;\theta) \Phi(x;\theta) + I) \beta = \frac{1}{\lambda}\Phi^T(x;\theta)y
 ```
-Where ``\lambda I`` is a regularization.
+Where ``\lambda>0`` is a regularization parameter; dividing the system through by ``\lambda`` produces the `+I` term. The equation is written in this way to be consistent with the vector-output case.
 ```julia
 rfm = RandomFeatureMethod(
     sf;
diff --git a/docs/src/setting_up_vector.md b/docs/src/setting_up_vector.md
index 23a01d9..b7e3dea 100644
--- a/docs/src/setting_up_vector.md
+++ b/docs/src/setting_up_vector.md
@@ -115,20 +115,11 @@ The keyword `feature_parameters = Dict("sigma" => a)`, can be included to set th

 ## Method

-The `RandomFeatureMethod` sets up the training problem to learn coefficients ``\beta\in\mathbb{R}^m`` from input-output training data ``(x,y)=\{(x_i,y_i)\}_{i=1}^n``, ``y_i \in \mathbb{R}^p`` and parameters ``\theta = \{\theta_j\}_{j=1}^m``. Regularization is provided through ``\Lambda = \lambda \otimes I_{n\times n}`` from a user-provided `p-by-p` positive-definite regularization matrix ``\lambda``. In Einstein summation notation the method solves the following system
+The `RandomFeatureMethod` sets up the training problem to learn coefficients ``\beta\in\mathbb{R}^m`` from input-output training data ``(x,y)=\{(x_i,y_i)\}_{i=1}^n``, ``y_i \in \mathbb{R}^p`` and parameters ``\theta = \{\theta_j\}_{j=1}^m``. Regularization is provided through ``\Lambda``, a user-provided `p-by-p` positive-definite regularization matrix (or scaled identity).
In Einstein summation notation the method solves the following system
 ```math
-(\frac{1}{m}\Phi_{n,i,p}(x;\theta) \Phi_{n,j,p}(x;\theta) + R_{i,j}) \beta_j = \Phi(x;\theta)_{n,i,p}y_{n,p}
+(\frac{1}{m}\Phi_{n,i,p}(x;\theta) \, [I_{n,m} \otimes \Lambda^{-1}_{p,q}]\, \Phi_{n,j,q}(x;\theta) + I_{i,j}) \beta_j = \Phi(x;\theta)_{n,i,p}\,[ I_{n,m} \otimes \Lambda^{-1}_{p,q}]\, y_{n,p}
 ```
-and where the regularization ``R`` is built from ``\Lambda`` by solving the non-square system
-```math
- \Phi_{m,j,q} R_{i,j} = \Phi_{n,i,p} \Lambda_{p,q,n,m}
-```
-In the case the ``\lambda`` is provided as a `Real` or `UniformScaling` then ``R`` is (consistently) replaced with ``\lambda I_{m\times m}``. The authors typically recommend replacing non-diagonal ``\lambda`` with ``\mathrm{det}(\lambda)^{\frac{1}{p}}I``, which often provides a reasonable approximation, as there is additional computational expense through solving this un-batched system of equations.
-
-!!! note "The nonsquare system"
-    If one chooses to solve for ``R``, note that it is also only defined for dimensions ``m < np``. For stability we also perform the following: we use a truncated SVD for when the rank of the system is less than ``m``, and we also symmetrize the resulting matrix.
-
-
+So long as ``\Lambda`` is easily invertible, this system is solved efficiently.
 ```julia
 rfm = RandomFeatureMethod(
@@ -141,7 +132,8 @@ One can select batch sizes to balance the space-time (memory-process) trade-off.

 !!! warning "Conditioning"
     The problem is ill-conditioned without regularization.
-    If you encounters a Singular or Positive-definite exceptions, try increasing the constant scaling `regularization`
+    If you encounter Singular or Positive-definite exceptions or warnings, try increasing the constant scaling `regularization`.
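To make the index notation concrete, here is a small dense sketch of the system being solved, written with plain loops rather than the package's Tullio kernels; the array shapes follow the conventions of `src/Methods.jl` below (`Phi` is `n_data x output_dim x n_features`, `y` is `output_dim x n_data`), and the function name is illustrative only. The scalar system of `setting_up_scalar.md` is recovered with `output_dim = 1` and ``\Lambda = \lambda``.

```julia
using LinearAlgebra

# dense illustration of ((1/m) Φᵀ (I ⊗ Λ⁻¹) Φ + I) β = Φᵀ (I ⊗ Λ⁻¹) y
# assumed shapes: Phi is n × p × m, y is p × n, Λ is p × p and positive definite
function solve_beta(Phi::AbstractArray{<:Real, 3}, y::AbstractMatrix, Λ::AbstractMatrix)
    n, p, m = size(Phi)
    Λinv = inv(Λ)
    A = zeros(m, m)                      # accumulates Φᵀ (I ⊗ Λ⁻¹) Φ
    b = zeros(m)                         # accumulates Φᵀ (I ⊗ Λ⁻¹) y
    for k in 1:n
        Pk = Phi[k, :, :]                # p × m feature slice for data point k
        A .+= Pk' * Λinv * Pk
        b .+= Pk' * (Λinv * y[:, k])
    end
    A ./= m
    return (A + I) \ b                   # β ∈ ℝᵐ
end
```
In the package itself this assembly is batched with `Tullio` and the factorization of the system matrix is stored for reuse (see the `fit` changes in `src/Methods.jl` below).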
+ diff --git a/examples/Learn_hyperparameters/1d_to_1d_regression_direct.jl b/examples/Learn_hyperparameters/1d_to_1d_regression_direct.jl index 073ee5f..40de210 100644 --- a/examples/Learn_hyperparameters/1d_to_1d_regression_direct.jl +++ b/examples/Learn_hyperparameters/1d_to_1d_regression_direct.jl @@ -86,7 +86,7 @@ function calculate_mean_and_coeffs( batch_inputs = batch_generator(itest, test_batch_size, dims = 2) # input_dim x batch_size #we want to calc lambda/m * coeffs^2 in the end - pred_mean = predictive_mean(rfm, fitted_features, DataContainer(itest)) + pred_mean, features = predictive_mean(rfm, fitted_features, DataContainer(itest)) scaled_coeffs = sqrt(1 / n_features) * get_coeffs(fitted_features) return pred_mean, scaled_coeffs @@ -152,13 +152,13 @@ end ## Begin Script, define problem setting println("starting script") -date_of_run = Date(2022, 9, 14) +date_of_run = Date(2024, 4, 10) # Target function ftest(x::AbstractVecOrMat) = exp.(-0.5 * x .^ 2) .* (x .^ 4 - x .^ 3 - x .^ 2 + x .- 1) -n_data = 20 * 4 +n_data = 20 * 2 noise_sd = 0.1 x = rand(rng, Uniform(-3, 3), n_data) @@ -285,7 +285,7 @@ for (idx, sd, feature_type) in zip(collect(1:length(σ_c)), σ_c, feature_types) end push!(rfms, RandomFeatureMethod(sf, batch_sizes = batch_sizes, regularization = regularizer)) - push!(fits, fit(rfms[end], io_pairs_test, decomposition_type = "qr")) + push!(fits, fit(rfms[end], io_pairs_test)) end if PLOT_FLAG @@ -311,7 +311,8 @@ if PLOT_FLAG for (idx, rfm, fit, feature_type, clr) in zip(collect(1:length(σ_c)), rfms, fits, feature_types, clrs) pred_mean, pred_cov = predict(rfm, fit, DataContainer(xplt)) - pred_cov = max.(pred_cov, 0.0) + pred_cov = pred_cov[1, 1, :] #just variances + pred_cov = max.(pred_cov, 0.0) #not normally needed.. plot!( xplt', pred_mean', diff --git a/examples/Learn_hyperparameters/1d_to_1d_regression_direct_withcov.jl b/examples/Learn_hyperparameters/1d_to_1d_regression_direct_withcov.jl index 3775a90..faeff8c 100644 --- a/examples/Learn_hyperparameters/1d_to_1d_regression_direct_withcov.jl +++ b/examples/Learn_hyperparameters/1d_to_1d_regression_direct_withcov.jl @@ -157,7 +157,7 @@ end ## Begin Script, define problem setting println("starting script") -date_of_run = Date(2022, 9, 14) +date_of_run = Date(2024, 4, 10) # Target function @@ -254,7 +254,7 @@ for (idx, type) in enumerate(feature_types) string(i) * ", Error: " * string(err[i]) * - ", with parameter mean" * + ", with parameter mean " * string(mean(transform_unconstrained_to_constrained(priors, get_u_final(ekiobj[1])), dims = 2)[:, 1]), " and sd ", string(sqrt.(var(transform_unconstrained_to_constrained(priors, get_u_final(ekiobj[1])), dims = 2))[:, 1]), @@ -298,7 +298,7 @@ for (idx, sd, feature_type) in zip(collect(1:length(σ_c)), σ_c, feature_types) end push!(rfms, RandomFeatureMethod(sf, batch_sizes = batch_sizes, regularization = regularizer)) - push!(fits, fit(rfms[end], io_pairs_test, decomposition_type = "qr")) + push!(fits, fit(rfms[end], io_pairs_test)) end if PLOT_FLAG diff --git a/examples/Learn_hyperparameters/nd_to_1d_regression_direct.jl b/examples/Learn_hyperparameters/nd_to_1d_regression_direct.jl index 448724a..bb8b393 100644 --- a/examples/Learn_hyperparameters/nd_to_1d_regression_direct.jl +++ b/examples/Learn_hyperparameters/nd_to_1d_regression_direct.jl @@ -93,13 +93,13 @@ function calculate_mean_and_coeffs( # build and fit the RF rfm = RFM_from_hyperparameters(rng, l, regularizer, n_features, batch_sizes, input_dim) - fitted_features = fit(rfm, io_train_cost, decomposition_type = 
"qr") + fitted_features = fit(rfm, io_train_cost) test_batch_size = get_batch_size(rfm, "test") batch_inputs = batch_generator(itest, test_batch_size, dims = 2) # input_dim x batch_size #we want to calc 1/var(y-mean)^2 + lambda/m * coeffs^2 in the end - pred_mean = predictive_mean(rfm, fitted_features, DataContainer(itest)) + pred_mean, features = predictive_mean(rfm, fitted_features, DataContainer(itest)) scaled_coeffs = sqrt(1 / n_features) * get_coeffs(fitted_features) return pred_mean, scaled_coeffs @@ -169,7 +169,7 @@ end ## Begin Script, define problem setting println("Begin script") -date_of_run = Date(2022, 9, 14) +date_of_run = Date(2024, 4, 10) input_dim = 6 println("Number of input dimensions: ", input_dim) @@ -344,7 +344,7 @@ sff = ScalarFourierFeature(n_features_test, feature_sampler) #second case with batching rfm_batch = RandomFeatureMethod(sff, batch_sizes = batch_sizes, regularization = regularizer) -fitted_batched_features = fit(rfm_batch, io_pairs_test, decomposition_type = "qr") +fitted_batched_features = fit(rfm_batch, io_pairs_test) if PLOT_FLAG #plot slice through one dimensions, others fixed to 0 @@ -362,7 +362,8 @@ if PLOT_FLAG yslice = ftest_nd_to_1d(xslicenew) pred_mean_slice, pred_cov_slice = predict(rfm_batch, fitted_batched_features, DataContainer(xslicenew)) - pred_cov_slice = max.(pred_cov_slice, 0.0) + pred_cov_slice = max.(pred_cov_slice[1, 1, :], 0.0) + plt = plot( xrange, yslice', diff --git a/examples/Learn_hyperparameters/nd_to_1d_regression_direct_matchingcov.jl b/examples/Learn_hyperparameters/nd_to_1d_regression_direct_matchingcov.jl index 07a9494..62447d3 100644 --- a/examples/Learn_hyperparameters/nd_to_1d_regression_direct_matchingcov.jl +++ b/examples/Learn_hyperparameters/nd_to_1d_regression_direct_matchingcov.jl @@ -93,7 +93,7 @@ function calculate_mean_cov_and_coeffs( # build and fit the RF rfm = RFM_from_hyperparameters(rng, l, regularizer, n_features, batch_sizes, input_dim) - fitted_features = fit(rfm, io_train_cost, decomposition_type = "svd") + fitted_features = fit(rfm, io_train_cost) test_batch_size = get_batch_size(rfm, "test") batch_inputs = batch_generator(itest, test_batch_size, dims = 2) # input_dim x batch_size @@ -174,7 +174,7 @@ end ## Begin Script, define problem setting println("Begin script") -date_of_run = Date(2022, 9, 22) +date_of_run = Date(2024, 4, 10) input_dim_list = [8] for input_dim in input_dim_list @@ -368,7 +368,7 @@ for input_dim in input_dim_list #second case with batching rfm_batch = RandomFeatureMethod(sff, batch_sizes = batch_sizes, regularization = regularizer) - fitted_batched_features = fit(rfm_batch, io_pairs_test, decomposition_type = "svd") + fitted_batched_features = fit(rfm_batch, io_pairs_test) if PLOT_FLAG #plot slice through one dimensions, others fixed to 0 diff --git a/examples/Learn_hyperparameters/nd_to_1d_regression_direct_withcov.jl b/examples/Learn_hyperparameters/nd_to_1d_regression_direct_withcov.jl index 67fa0a7..1ac497b 100644 --- a/examples/Learn_hyperparameters/nd_to_1d_regression_direct_withcov.jl +++ b/examples/Learn_hyperparameters/nd_to_1d_regression_direct_withcov.jl @@ -93,7 +93,7 @@ function calculate_mean_cov_and_coeffs( # build and fit the RF rfm = RFM_from_hyperparameters(rng, l, regularizer, n_features, batch_sizes, input_dim) - fitted_features = fit(rfm, io_train_cost, decomposition_type = "qr") + fitted_features = fit(rfm, io_train_cost) test_batch_size = get_batch_size(rfm, "test") batch_inputs = batch_generator(itest, test_batch_size, dims = 2) # input_dim x batch_size 
@@ -173,7 +173,7 @@ end ## Begin Script, define problem setting println("Begin script") -date_of_run = Date(2022, 9, 14) +date_of_run = Date(2024, 4, 10) input_dim = 8 println("Number of input dimensions: ", input_dim) @@ -352,7 +352,7 @@ sff = ScalarFourierFeature(n_features_test, feature_sampler) #second case with batching rfm_batch = RandomFeatureMethod(sff, batch_sizes = batch_sizes, regularization = regularizer) -fitted_batched_features = fit(rfm_batch, io_pairs_test, decomposition_type = "svd") +fitted_batched_features = fit(rfm_batch, io_pairs_test) if PLOT_FLAG #plot slice through one dimensions, others fixed to 0 @@ -368,9 +368,8 @@ if PLOT_FLAG xslicenew[direction, :] = xrange yslice = ftest_nd_to_1d(xslicenew) - pred_mean_slice, pred_cov_slice = predict(rfm_batch, fitted_batched_features, DataContainer(xslicenew)) - pred_cov_slice[1, 1, :] = max.(pred_cov_slice[1, 1, :], 0.0) + pred_cov_slice = max.(pred_cov_slice[1, 1, :], 0.0) plt = plot( xrange, yslice', @@ -384,7 +383,7 @@ if PLOT_FLAG plot!( xrange, pred_mean_slice', - ribbon = [2 * sqrt.(pred_cov_slice[1, 1, :]); 2 * sqrt.(pred_cov_slice[1, 1, :])]', + ribbon = [2 * sqrt.(pred_cov_slice); 2 * sqrt.(pred_cov_slice)]', label = "Fourier", color = "blue", ) diff --git a/examples/Learn_hyperparameters/nd_to_md_regression_direct_withcov.jl b/examples/Learn_hyperparameters/nd_to_md_regression_direct_withcov.jl index 69687e6..d49e53d 100644 --- a/examples/Learn_hyperparameters/nd_to_md_regression_direct_withcov.jl +++ b/examples/Learn_hyperparameters/nd_to_md_regression_direct_withcov.jl @@ -26,7 +26,6 @@ using RandomFeatures.Samplers using RandomFeatures.Features using RandomFeatures.Methods using RandomFeatures.Utilities -using ProfileView seed = 2024 ekp_seed = 99999 @@ -124,6 +123,7 @@ function RFM_from_hyperparameters( "name" => "xi", ), ) + feature_sampler = FeatureSampler(pd, output_dim, rng = rng) vff = VectorFourierFeature(n_features, output_dim, feature_sampler) @@ -139,7 +139,8 @@ function calculate_mean_cov_and_coeffs( batch_sizes::Dict{S, Int}, io_pairs::PairedDataContainer, mean_store::Matrix{FT}, - cov_store::Array{FT, 3}; + cov_store::Array{FT, 3}, + buffer::Array{FT, 3}; decomp_type::S = "chol", ) where { RNG <: AbstractRNG, @@ -173,24 +174,22 @@ function calculate_mean_cov_and_coeffs( #we want to calc 1/var(y-mean)^2 + lambda/m * coeffs^2 in the end # pred_mean, pred_cov = predict(rfm, fitted_features, DataContainer(itest)) - predict!(rfm, fitted_features, DataContainer(itest), mean_store, cov_store) + predict!(rfm, fitted_features, DataContainer(itest), mean_store, cov_store, buffer) # sizes (output_dim x n_test), (output_dim x output_dim x n_test) - scaled_coeffs = sqrt(1 / n_features) * get_coeffs(fitted_features) - reg = get_regularization(fitted_features) - # if !isa(reg, UniformScaling) - # println("diag of reg: ",diag(reg)) - # println("det of reg: ", det(reg)) - # end + + + scaled_coeffs = 1 / sqrt(n_features) * norm(get_coeffs(fitted_features)) + if decomp_type == "chol" chol_fac = get_decomposition(get_feature_factors(fitted_features)).L - complexity = 2 * sum(log.(diag(chol_fac))) + complexity = 2 * sum(log(chol_fac[i, i]) for i in 1:size(chol_fac, 1)) else svd_singval = get_decomposition(get_feature_factors(fitted_features)).S - complexity = sum(log.(svd_singval)) # note this is log(abs(det)) + complexity = sum(log, svd_singval) # note this is log(abs(det)) end - println("sample_complexity", complexity) + complexity = sqrt(complexity) # complexity must be positive - # return pred_mean, pred_cov, 
scaled_coeffs, complexity + println("sample_complexity", complexity) return scaled_coeffs, complexity end @@ -203,7 +202,8 @@ function estimate_mean_and_coeffnorm_covariance( n_features::Int, batch_sizes::Dict{S, Int}, io_pairs::PairedDataContainer, - n_samples::Int; + n_samples::Int, + y; repeats::Int = 1, ) where { RNG <: AbstractRNG, @@ -214,59 +214,58 @@ function estimate_mean_and_coeffnorm_covariance( n_train = Int(floor(0.8 * size(get_inputs(io_pairs), 2))) # 80:20 train test n_test = size(get_inputs(io_pairs), 2) - n_train output_dim = size(get_outputs(io_pairs), 1) - # means = zeros(n_test, n_samples, output_dim) - # covs = zeros(n_test, n_samples, output_dim, output_dim) + means = zeros(output_dim, n_samples, n_test) - covs = zeros(output_dim, n_samples, output_dim, n_test) + mean_of_covs = zeros(output_dim, output_dim, n_test) + moc_tmp = similar(mean_of_covs) + mtmp = zeros(output_dim, n_test) + buffer = zeros(n_test, output_dim, n_features) complexity = zeros(1, n_samples) coeffl2norm = zeros(1, n_samples) - # mtmp = zeros(output_dim,n_test) - # vtmp = zeros(output_dim,output_dim,n_test) - ctmp = zeros(n_features) for i in 1:n_samples for j in 1:repeats - ctmp[:], cplxtytmp = calculate_mean_cov_and_coeffs( - rng, - l, - lambda, - n_features, - batch_sizes, - io_pairs, - means[:, i, :], - covs[:, i, :, :], - ) + c, cplxty = + calculate_mean_cov_and_coeffs(rng, l, lambda, n_features, batch_sizes, io_pairs, mtmp, moc_tmp, buffer) # m output_dim x n_test # v output_dim x output_dim x n_test # c n_features - # means[:, i, :] += mtmp' ./ repeats - # covs[:, i, :, :] += vtmp ./ repeats - coeffl2norm[1, i] += sqrt(sum(ctmp .^ 2)) / repeats - complexity[1, i] += cplxtytmp / repeats + # cplxty 1 + + # update vbles needed for cov + means[:, i, :] .+= (y - mtmp) ./ repeats + coeffl2norm[1, i] += sqrt(sum(abs2, c)) / repeats + complexity[1, i] += cplxty / repeats + + # update vbles needed for mean + @. mean_of_covs += moc_tmp / (repeats * n_samples) end end means = permutedims(means, (3, 2, 1)) - covs = permutedims(covs, (4, 2, 1, 3)) + mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) - - blockcovmat = zeros(n_samples, n_test * output_dim, n_test * output_dim) + approx_σ2 = zeros(n_test * output_dim, n_test * output_dim) blockmeans = zeros(n_test * output_dim, n_samples) for i in 1:n_test id = ((i - 1) * output_dim + 1):(i * output_dim) - blockcovmat[:, id, id] = covs[i, :, :, :] # this ordering, so we can take a mean/cov in dims = 2. + approx_σ2[id, id] = mean_of_covs[i, :, :] # this ordering, so we can take a mean/cov in dims = 2. 
blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) end - Γ = cov(vcat(blockmeans, coeffl2norm, complexity), dims = 2) - approx_σ2 = mean(blockcovmat, dims = 1)[1, :, :] # approx of \sigma^2I +rf var + sample_mat = vcat(blockmeans, coeffl2norm, complexity) + Γ = cov(sample_mat, dims = 2) + if !isposdef(approx_σ2) - println("approx_σ2 not posdef - correcting") + println("approx_σ2 not posdef") approx_σ2 = posdef_correct(approx_σ2) end return Γ, approx_σ2 + + return Γ, approx_σ2 + end function calculate_ensemble_mean_and_coeffnorm( @@ -275,7 +274,8 @@ function calculate_ensemble_mean_and_coeffnorm( lambda::L, n_features::Int, batch_sizes::Dict{S, Int}, - io_pairs::PairedDataContainer; + io_pairs::PairedDataContainer, + y; repeats::Int = 1, ) where { RNG <: AbstractRNG, @@ -293,65 +293,50 @@ function calculate_ensemble_mean_and_coeffnorm( n_test = size(get_inputs(io_pairs), 2) - n_train output_dim = size(get_outputs(io_pairs), 1) - # means = zeros(n_test, N_ens, output_dim) - # covs = zeros(n_test, N_ens, output_dim, output_dim) means = zeros(output_dim, N_ens, n_test) - covs = zeros(output_dim, N_ens, output_dim, n_test) + mean_of_covs = zeros(output_dim, output_dim, n_test) + buffer = zeros(n_test, output_dim, n_features) complexity = zeros(1, N_ens) coeffl2norm = zeros(1, N_ens) - # mtmp = zeros(output_dim,n_test) - # vtmp = zeros(output_dim,output_dim,n_test) - ctmp = zeros(n_features) - - rfm_store = [] - ff_store = [] - + moc_tmp = similar(mean_of_covs) + mtmp = zeros(output_dim, n_test) for i in collect(1:N_ens) for j in collect(1:repeats) l = lmat[:, i] - ctmp[:], cplxtytmp = calculate_mean_cov_and_coeffs( - rng, - l, - lambda, - n_features, - batch_sizes, - io_pairs, - means[:, i, :], - covs[:, i, :, :], - ) + c, cplxty = + calculate_mean_cov_and_coeffs(rng, l, lambda, n_features, batch_sizes, io_pairs, mtmp, moc_tmp, buffer) # m output_dim x n_test # v output_dim x output_dim x n_test # c n_features - # means[:, i, :] += mtmp' ./ repeats - # covs[:, i, :, :] += vtmp ./ repeats - coeffl2norm[1, i] += sqrt(sum(ctmp .^ 2)) / repeats - complexity[1, i] += cplxtytmp / repeats + means[:, i, :] += (y - mtmp) ./ repeats + @. 
mean_of_covs += moc_tmp / (repeats * N_ens) + coeffl2norm[1, i] += sqrt(sum(c .^ 2)) / repeats + complexity[1, i] += cplxty / repeats end end - means = permutedims(means, (3, 2, 1)) # n_test x - covs = permutedims(covs, (4, 2, 1, 3)) #n_test x N_ens x output_dim x output_dim - - blockcovmat = zeros(N_ens, n_test * output_dim, n_test * output_dim) + means = permutedims(means, (3, 2, 1)) + mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) + blockcovmat = zeros(n_test * output_dim, n_test * output_dim) blockmeans = zeros(n_test * output_dim, N_ens) for i in 1:n_test id = ((i - 1) * output_dim + 1):(i * output_dim) - blockcovmat[:, id, id] = covs[i, :, :, :] + blockcovmat[id, id] = mean_of_covs[i, :, :] blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) end - blockcovmat = mean(blockcovmat, dims = 1)[1, :, :] # np x np if !isposdef(blockcovmat) - println("blockcovmat not posdef - correcting") + println("blockcovmat not posdef") blockcovmat = posdef_correct(blockcovmat) end + return vcat(blockmeans, coeffl2norm, complexity), blockcovmat end @time begin ## Begin Script, define problem setting println("Begin script") - date_of_run = Date(2022, 11, 10) + date_of_run = Date(2024, 4, 10) input_dim = 1 output_dim = 3 @@ -377,18 +362,18 @@ end x = rand(rng, MvNormal(zeros(input_dim), I), n_data) # diagonal noise - cov_mat = Diagonal((5e-2)^2 * ones(output_dim)) - #cov_mat = (5e-2)^2*I + # cov_mat = Diagonal((5e-2)^2 * ones(output_dim)) + #cov_mat = (5e-2)^2*I(output_dim) # correlated noise - #cov_mat = convert(Matrix,Tridiagonal((5e-3) * ones(2), (2e-2) * ones(3), (5e-3) * ones(2))) + cov_mat = convert(Matrix, Tridiagonal((5e-3) * ones(2), (2e-2) * ones(3), (5e-3) * ones(2))) noise_dist = MvNormal(zeros(output_dim), cov_mat) noise = rand(rng, noise_dist, n_data) # simple regularization - lambda = exp((1 / output_dim) * sum(log.(eigvals(cov_mat)))) * I + #lambda = exp((1 / output_dim) * sum(log.(eigvals(cov_mat)))) * I(output_dim) # more complex - #lambda = cov_mat + lambda = cov_mat y = ftest_1d_to_3d(x) + noise @@ -435,18 +420,10 @@ end batch_sizes, io_pairs, n_samples, + y[:, (n_train + 1):end], repeats = repeats, ) - #= @profview estimate_mean_and_coeffnorm_covariance( - rng, - repeat([μ_l], n_l), # take mean values - lambda, - n_features_opt, - batch_sizes, - io_pairs, - n_samples, - repeats = repeats, - )=# + save("calculated_truth_cov.jld2", "internal_Γ", internal_Γ, "approx_σ2", approx_σ2) else println("Loading truth covariance from file...") @@ -455,7 +432,9 @@ end end Γ = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + for i in 1:(n_test - 1) + Γ[((i - 1) * output_dim + 1):(i * output_dim), ((i - 1) * output_dim + 1):(i * output_dim)] += lambda[:, :] + end Γ[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I println( "Estimated variance. 
Tr(cov) = ", @@ -475,21 +454,8 @@ end initial_params = construct_initial_ensemble(priors, N_ens; rng_seed = ekp_seed) params_init = transform_unconstrained_to_constrained(priors, initial_params)#[1, :] println("Prior gives parameters between: [$(minimum(params_init)),$(maximum(params_init))]") + data = zeros(size(Γ, 1)) - if isa(lambda, Real) - min_complexity = n_features_opt * log(lambda) - data = vcat(reshape(y[:, (n_train + 1):end], :, 1), 0.0, min_complexity) #flatten data - elseif isa(lambda, UniformScaling) - min_complexity = n_features_opt * log(lambda.λ) - data = vcat(reshape(y[:, (n_train + 1):end], :, 1), 0.0, min_complexity) #flatten data - else - min_complexity = (n_features_opt / output_dim) * 2 * sum(log.(diag(cholesky(lambda).L))) - # TODO: SOLVE EVAL PROBLEM TO FIT WITH THIS DATA. the following is just a fudge to get reasonable scaling and not a global truth - min_complexity *= 1 - - data = vcat(reshape(y[:, (n_train + 1):end], :, 1), 0.0, min_complexity) #flatten data - end - println("min_complexity: ", min_complexity) ekiobj = [EKP.EnsembleKalmanProcess(initial_params, data[:], Γ, Inversion())] err = zeros(N_iter) @@ -507,6 +473,7 @@ end n_features_opt, batch_sizes, io_pairs, + y[:, (n_train + 1):end], repeats = repeats, ) @@ -522,11 +489,12 @@ end batch_sizes, io_pairs, n_samples, + y[:, (n_train + 1):end], repeats = repeats, ) Γ_new = internal_Γ_new - Γ_new[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2_new - Γ_new[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I + # Γ_new[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2_new + # Γ_new[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I println( "Estimated variance. Tr(cov) = ", tr(Γ_new[1:n_test, 1:n_test]), diff --git a/src/Methods.jl b/src/Methods.jl index 74813dd..0e5f9b5 100644 --- a/src/Methods.jl +++ b/src/Methods.jl @@ -43,7 +43,7 @@ struct RandomFeatureMethod{S <: AbstractString, USorM <: Union{UniformScaling, A random_feature::RandomFeature "A dictionary specifying the batch sizes. 
Must contain \"train\", \"test\", and \"feature\" keys"
     batch_sizes::Dict{S, Int}
-    "A positive definite matrix used during the fit method to regularize the linear solve"
+    "A positive definite matrix used during the fit method to regularize the linear solve, interpreted as the inverse of the observational noise covariance"
     regularization::USorM
     "Use multithreading provided by Tullio"
     tullio_threading::Bool
@@ -59,30 +59,42 @@ function RandomFeatureMethod(
     regularization::USorMorR = 1e12 * eps() * I,
     batch_sizes::Dict{S, Int} = Dict("train" => 0, "test" => 0, "feature" => 0),
     tullio_threading = true,
+    regularization_inverted::Bool = false,
 ) where {S <: AbstractString, USorMorR <: Union{<:Real, AbstractMatrix{<:Real}, UniformScaling}}
     if !all([key ∈ keys(batch_sizes) for key in ["train", "test", "feature"]])
         throw(ArgumentError("batch_sizes keys must contain all of \"train\", \"test\", and \"feature\""))
     end
+
+    # TODO: store cholesky factors
     if isa(regularization, Real)
-        if regularization < 0
-            @info "input regularization < 0 is invalid, using regularization = 1e12*eps()"
-            lambda = 1e12 * eps() * I
+        if regularization <= 0
+            @info "input regularization <=0 is invalid, using regularization = 1e12*eps()"
+            λ = 1e12 * eps() * I
         else
-            lambda = regularization * I
+            λ = regularization * I
         end
     else
         if !isposdef(regularization) #check positive definiteness
             tol = 1e12 * eps() #MAGIC NUMBER
-            lambda = posdef_correct(regularization, tol = tol)
-            @warn "input regularization matrix is not positive definite, replacing with nearby positive definite matrix with minimum eigenvalue $tol"
+            λ = posdef_correct(regularization, tol = tol)
+            @warn "input regularization matrix is not positive definite, replacing with nearby positive definite matrix"
         else
+            λ = regularization
+        end
+    end
-
-            lambda = regularization
+    # we work with the inverted regularization matrix internally
+    if regularization_inverted == false
+        if cond(regularization) > 10^8
+            @warn "The provided regularization is poorly conditioned: κ(reg) = $(cond(regularization)). Imprecision or SingularException during inversion may occur."
end + λinv = inv(λ) + else + λinv = λ end - return RandomFeatureMethod{S, typeof(lambda)}(random_feature, batch_sizes, lambda, tullio_threading) + return RandomFeatureMethod{S, typeof(λ)}(random_feature, batch_sizes, λinv, tullio_threading) end """ @@ -102,7 +114,7 @@ get_batch_sizes(rfm::RandomFeatureMethod) = rfm.batch_sizes """ $(TYPEDSIGNATURES) -gets the `regularization` field +gets the `regularization` field, this is the inverse of the provided matrix if keyword `regularization_inverted = false` """ get_regularization(rfm::RandomFeatureMethod) = rfm.regularization @@ -137,11 +149,11 @@ Holds the coefficients and matrix decomposition that describe a set of fitted ra $(TYPEDFIELDS) """ struct Fit{V <: AbstractVector, USorM <: Union{UniformScaling, AbstractMatrix}} - "The `LinearAlgreba` matrix decomposition of `(1 / m) * Feature^T * Feature + regularization`" + "The `LinearAlgreba` matrix decomposition of `(1 / m) * Feature^T * regularization^-1 * Feature + I`" feature_factors::Decomposition "Coefficients of the fit to data" coeffs::V - "feature-space regularization used during fit" + "output-dim regularization used during fit" regularization::USorM end @@ -163,7 +175,7 @@ get_coeffs(f::Fit) = f.coeffs """ $(TYPEDSIGNATURES) -gets the `regularization` field (note this is the feature-space regularization) +gets the `regularization` field (note this is the outputdim regularization) """ get_regularization(f::Fit) = f.regularization @@ -191,97 +203,49 @@ function fit( n_features = get_n_features(rf) #data are columns, batch over samples - lambda = get_regularization(rfm) - build_regularization = !isa(lambda, UniformScaling) # build if lambda is not a uniform scaling - if build_regularization - lambda_new = zeros(n_features, n_features) - else - lambda_new = lambda - end + λinv = get_regularization(rfm) Phi = build_features(rf, input) FT = eltype(Phi) - # regularization build needed with p.d matrix lambda - if build_regularization - if output_dim * n_data < n_features - lambda_new = exp(1.0 / output_dim * log(det(lambda))) * I #det(X)^{1/m} - @info( - "pos-def regularization formulation ill-defined for output_dim ($output_dim) * n_data ($n_data) < n_feature ($n_features). 
\n Treating it as if regularization was a uniform scaling size det(regularization_matrix)^{1 / output_dim}: $(lambda_new.λ)" - ) + PhiTλinv = zeros(size(Phi)) + PhiTλinvY = zeros(n_features) + PhiTλinvPhi = zeros(n_features, n_features) + if !tullio_threading + if isa(λinv, UniformScaling) + PhiTλinv = Phi * λinv.λ else - - if !isa(lambda, Diagonal) - if !tullio_threading - @tullio threads = 10^9 lambdaT_times_phi[n, p, i] := lambda[q, p] * Phi[n, q, i] # (I ⊗ Λᵀ) Φ - else - @tullio lambdaT_times_phi[n, p, i] := lambda[q, p] * Phi[n, q, i] # (I ⊗ Λᵀ) Φ - end - else - lam_diag = [lambda[i, i] for i in 1:size(lambda, 1)] - - if !tullio_threading - @tullio threads = 10^9 lambdaT_times_phi[n, p, i] := lam_diag[p] * Phi[n, p, i] # (I ⊗ Λᵀ) Φ - else - @tullio lambdaT_times_phi[n, p, i] := lam_diag[p] * Phi[n, p, i] # (I ⊗ Λᵀ) Φ - end - end - #reshape to stacks columns, i.e n,p -> np does (n,...,n) p times - rhs = reshape(permutedims(lambdaT_times_phi, (2, 1, 3)), (n_data * output_dim, n_features)) - lhs = reshape(permutedims(Phi, (2, 1, 3)), (n_data * output_dim, n_features)) - - # Solve Φ Bᵀ = (I ⊗ Λᵀ) Φ and transpose for B (=lambda_new) - lhs_svd = svd(lhs) #stable solve of rank deficient systems with SVD - th_idx = 1:sum(lhs_svd.S .> 1e-2 * maximum(lhs_svd.S)) - lambda_new = - lhs_svd.V[:, th_idx] * - Diagonal(1 ./ lhs_svd.S[th_idx]) * - permutedims(lhs_svd.U[:, th_idx], (2, 1)) * - rhs - #make positive definite - lambda_new = posdef_correct(lambda_new) - + @tullio threads = 10^9 PhiTλinv[n, q, i] = Phi[n, p, i] * λinv[p, q] end - end - - PhiTY = zeros(n_features) # - PhiTPhi = zeros(n_features, n_features) - if !tullio_threading - @tullio threads = 10^9 PhiTY[j] = Phi[n, p, j] * output[p, n] - @tullio threads = 10^9 PhiTPhi[i, j] = Phi[n, p, i] * Phi[n, p, j] # BOTTLENECK + @tullio threads = 10^9 PhiTλinvY[j] = PhiTλinv[n, p, j] * output[p, n] + @tullio threads = 10^9 PhiTλinvPhi[i, j] = PhiTλinv[n, p, i] * Phi[n, p, j] # BOTTLENECK else - @tullio PhiTY[j] = Phi[n, p, j] * output[p, n] - @tullio PhiTPhi[i, j] = Phi[n, p, i] * Phi[n, p, j] # BOTTLENECK + if isa(λinv, UniformScaling) + PhiTλinv = Phi * λinv.λ + else + @tullio PhiTλinv[n, q, i] = Phi[n, p, i] * λinv[p, q] + end + @tullio PhiTλinvY[j] = PhiTλinv[n, p, j] * output[p, n] + @tullio PhiTλinvPhi[i, j] = PhiTλinv[n, p, i] * Phi[n, p, j] # BOTTLENECK end # alternative using svd - turns out to be slower and more mem intensive - @. PhiTPhi /= FT(n_features) + @. PhiTλinvPhi /= FT(n_features) # solve the linear system - # (PhiTPhi + lambda_new ) * beta = PhiTY - - if lambda_new == 0 * I - feature_factors = Decomposition(PhiTPhi, "pinv") - else - # in-place add lambda (as we don't use PhiTPhi again after this) - if isa(lambda_new, UniformScaling) - # much quicker than just adding lambda_new... - for i in 1:size(PhiTPhi, 1) - PhiTPhi[i, i] += lambda_new.λ - end - else - @. 
PhiTPhi += lambda_new - end - feature_factors = Decomposition(PhiTPhi, decomposition_type) - # bottleneck for small problems only (much quicker than PhiTPhi for big problems) + # (PhiTλinvPhi + I) * beta = PhiTλinvY + # in-place add I (as we don't use PhiTPhi again after this) + for i in 1:size(PhiTλinvPhi, 1) + PhiTλinvPhi[i, i] += 1.0 end + feature_factors = Decomposition(PhiTλinvPhi, decomposition_type) + # bottleneck for small problems only (much quicker than PhiTPhi for big problems) - coeffs = linear_solve(feature_factors, PhiTY, tullio_threading = tullio_threading) #n_features x n_samples x dim_output + coeffs = linear_solve(feature_factors, PhiTλinvY, tullio_threading = tullio_threading) #n_features x n_samples x dim_output - return Fit{typeof(coeffs), typeof(lambda_new)}(feature_factors, coeffs, lambda_new) + return Fit{typeof(coeffs), typeof(λinv)}(feature_factors, coeffs, λinv) end - """ $(TYPEDSIGNATURES) @@ -571,12 +535,11 @@ function predictive_cov!( rf = get_random_feature(rfm) tullio_threading = get_tullio_threading(rfm) n_features = get_n_features(rf) - lambda = get_regularization(fit) output_dim = get_output_dim(rf) coeffs = get_coeffs(fit) - PhiTPhi_reg_factors = get_feature_factors(fit) - inv_decomp = get_inv_decomposition(PhiTPhi_reg_factors) + PhiTλinvPhi_factors = get_feature_factors(fit) + inv_decomp = get_inv_decomposition(PhiTλinvPhi_factors) # (get the inverse of this) if !(size(cov_store) == (output_dim, output_dim, size(inputs, 2))) throw( DimensionMismatch( @@ -586,45 +549,25 @@ function predictive_cov!( end FT = eltype(prebuilt_features) - if isa(lambda, UniformScaling) - if !(size(buffer) == (size(inputs, 2), output_dim, n_features)) - throw( - DimensionMismatch( - "provided storage for tmp buffer expected to be size ($(size(inputs,2)),$(output_dim),$(n_features)), got $(size(buffer))", - ), - ) - end - if !tullio_threading - @tullio threads = 10^9 buffer[n, p, o] = prebuilt_features[n, p, m] * inv_decomp[m, o] - @tullio threads = 10^9 cov_store[p, q, n] = buffer[n, p, o] * prebuilt_features[n, q, o] - else - @tullio buffer[n, p, o] = prebuilt_features[n, p, m] * inv_decomp[m, o] - @tullio cov_store[p, q, n] = buffer[n, p, o] * prebuilt_features[n, q, o] - end - @. cov_store /= (FT(n_features) / lambda.λ) #i.e. * lambda / n_features - - else - PhiTPhi_reg = get_full_matrix(PhiTPhi_reg_factors) - @. PhiTPhi_reg -= lambda - if !tullio_threading - @tullio threads = 10^9 buffer[n, p, i] = PhiTPhi_reg[i, j] * prebuilt_features[n, p, j] # BOTTLENECK OF PREDICTION - coeff_outputs = linear_solve(PhiTPhi_reg_factors, buffer, tullio_threading = tullio_threading) # n_features x bsize x output_dim - # make sure we don't use := in following line, or it wont modify the input argument. - @tullio threads = 10^9 cov_store[p, q, n] = - prebuilt_features[n, p, m] * (prebuilt_features[n, q, m] - coeff_outputs[n, q, m]) - else - @tullio buffer[n, p, i] = PhiTPhi_reg[i, j] * prebuilt_features[n, p, j] # BOTTLENECK OF PREDICTION - coeff_outputs = linear_solve(PhiTPhi_reg_factors, buffer) # n_features x bsize x output_dim - # make sure we don't use := in following line, or it wont modify the input argument. - @tullio cov_store[p, q, n] = - prebuilt_features[n, p, m] * (prebuilt_features[n, q, m] - coeff_outputs[n, q, m]) - end - @. cov_store /= FT(n_features) - # IMPORTANT, we remove lambda in-place for calculation only, so must add it back - @. 
PhiTPhi_reg += lambda + # Bishop Nasrabadi 2006 - efficient computation of cov: + if !(size(buffer) == (size(inputs, 2), output_dim, n_features)) + throw( + DimensionMismatch( + "provided storage for tmp buffer expected to be size ($(size(inputs,2)),$(output_dim),$(n_features)), got $(size(buffer))", + ), + ) + end + if !tullio_threading + @tullio threads = 10^9 buffer[n, p, o] = prebuilt_features[n, p, m] * inv_decomp[m, o] # = betahat(x') + @tullio threads = 10^9 cov_store[p, q, n] = buffer[n, p, o] * prebuilt_features[n, q, o] + else + @tullio buffer[n, p, o] = prebuilt_features[n, p, m] * inv_decomp[m, o] # = betahat(x') + @tullio cov_store[p, q, n] = buffer[n, p, o] * prebuilt_features[n, q, o] end + @. cov_store /= FT(n_features) + nothing end diff --git a/src/Utilities.jl b/src/Utilities.jl index 1c471cd..b5f4857 100644 --- a/src/Utilities.jl +++ b/src/Utilities.jl @@ -45,16 +45,27 @@ end Makes square matrix `mat` positive definite, by symmetrizing and bounding the minimum eigenvalue below by `tol` """ -function posdef_correct(mat::M; tol::R = 1e12 * eps()) where {M <: AbstractMatrix, R <: Float64} - out = 0.5 * (mat + permutedims(mat, (2, 1))) #symmetrize - abs_min_ev = abs(minimum(eigvals(out))) - for i in 1:size(out, 1) - out[i, i] += abs_min_ev + tol #add to diag +function posdef_correct(mat::AbstractMatrix; tol::Real = 1e12 * eps()) + mat = deepcopy(mat) + if !issymmetric(mat) + out = 0.5 * (mat + permutedims(mat, (2, 1))) #symmetrize + if isposdef(out) + # very often, small numerical errors cause asymmetry, so cheaper to add this branch + return out + end + else + out = mat + end + + if !isposdef(out) + nugget = abs(minimum(eigvals(out))) + for i in 1:size(out, 1) + out[i, i] += nugget + tol # add to diag + end end return out end - """ $(TYPEDEF) @@ -97,7 +108,7 @@ function Decomposition( # TODOs # 1. Originally I used f = getfield(LinearAlgebra, Symbol(method)) but this is slow for evaluation so defining svd and cholesky is all we have now. I could maybe do dispatch here to make this a bit more slick. # 2. I have tried using the in-place methods, but so far these have not made enough difference to be worthwhile, I think at some-point they would be, but the original matrix would be needed for matrix regularization. They are not the bottleneck in the end - # 3. The + if method == "pinv" invmat = pinv(mat) return Decomposition{PseInv, M, M}(mat, invmat, invmat) @@ -106,7 +117,7 @@ function Decomposition( return Decomposition{Factor, typeof(mat), Base.return_types(svd, (typeof(mat),))[1]}(mat, fmat, inv(fmat)) elseif method == "cholesky" if !isposdef(mat) - @info "Random Feature system not positive definite. Performing cholesky factorization with a close positive definite matrix" + # @info "Random Feature system not positive definite. 
Performing cholesky factorization with a close positive definite matrix" mat = posdef_correct(mat, tol = nugget) end fmat = cholesky(mat) diff --git a/test/Methods/runtests.jl b/test/Methods/runtests.jl index 1242121..36a5194 100644 --- a/test/Methods/runtests.jl +++ b/test/Methods/runtests.jl @@ -44,16 +44,29 @@ tol = 1e3 * eps() @test_throws ArgumentError RandomFeatureMethod(sff, regularization = lambda, batch_sizes = batch_sizes_err) rfm_warn = RandomFeatureMethod(sff, regularization = lambda_warn, batch_sizes = batch_sizes) - @test get_regularization(rfm_warn) ≈ 1e12 * eps() * I - - rfm_warn2 = RandomFeatureMethod(sff, regularization = lambdamat_warn, batch_sizes = batch_sizes) + @test get_regularization(rfm_warn) ≈ inv(1e12 * eps() * I) # inverted internally + + rfm_warn2 = RandomFeatureMethod( + sff, + regularization = lambdamat_warn, + batch_sizes = batch_sizes, + regularization_inverted = true, + ) #don't invert, just make PD reg_new = get_regularization(rfm_warn2) @test isposdef(reg_new) @test minimum(eigvals(reg_new)) > 1e12 * eps() - rfm = RandomFeatureMethod(sff, regularization = lambdamat, batch_sizes = batch_sizes) + rfm = RandomFeatureMethod( + sff, + regularization = lambdamat, + batch_sizes = batch_sizes, + regularization_inverted = true, + ) @test get_regularization(rfm) ≈ lambdamat + rfm = RandomFeatureMethod(sff, regularization = lambdamat, batch_sizes = batch_sizes) + @test get_regularization(rfm) ≈ inv(lambdamat) + rfm = RandomFeatureMethod(sff, regularization = lambda, batch_sizes = batch_sizes) @test get_batch_sizes(rfm) == batch_sizes @@ -68,7 +81,7 @@ tol = 1e3 * eps() rfm_default = RandomFeatureMethod(sff) @test get_batch_sizes(rfm_default) == Dict("train" => 0, "test" => 0, "feature" => 0) - @test get_regularization(rfm_default) ≈ 1e12 * eps() * I + @test get_regularization(rfm_default) ≈ inv(1e12 * eps() * I) end @testset "Fit and predict: 1-D -> 1-D" begin @@ -544,9 +557,9 @@ tol = 1e3 * eps() if exp_name == "diagonal-lambdamat" vff_tmp = VectorFourierFeature(n_test * output_dim + 1, output_dim, feature_sampler) #m > np rfm_tmp = RandomFeatureMethod(vff_tmp, regularization = lambda) - @test_logs (:info,) fit(rfm_tmp, io_pairs) + # @test_logs (:info,) fit(rfm_tmp, io_pairs) loop vec throws some warning - messing this test up fit_tmp = fit(rfm_tmp, io_pairs) - @test fit_tmp.regularization ≈ exp(1.0 / output_dim * log(det(lambda))) * I + @test fit_tmp.regularization ≈ inv(lambda) # exp(1.0 / output_dim * log(det(lambda))) * I end # test prediction L^2 error of mean diff --git a/test/Utilities/runtests.jl b/test/Utilities/runtests.jl index 4747272..2ed63ce 100644 --- a/test/Utilities/runtests.jl +++ b/test/Utilities/runtests.jl @@ -56,7 +56,6 @@ using RandomFeatures.Utilities @test_throws ArgumentError Decomposition(x, "qr") xbad = [1.0 1.0; 1.0 0.0] # not pos def - @test_logs (:info,) Decomposition(xbad, "cholesky") xbadchol = Decomposition(xbad, "cholesky") @test isposdef(get_full_matrix(xbadchol))
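Taken together, the user-facing change in this patch is that `regularization` is now interpreted as the output-space noise covariance and is inverted inside the constructor unless the caller states that it is already an inverse. A hedged usage sketch, with `vff` and `io_pairs` standing in for an existing vector feature object (e.g. from `VectorFourierFeature`) and a `PairedDataContainer` of training data:

```julia
using LinearAlgebra

# a p × p observational-noise covariance used as regularization (p = 3 here, values illustrative)
Λ = convert(Matrix, Tridiagonal(5e-3 * ones(2), 2e-2 * ones(3), 5e-3 * ones(2)))

# default behaviour: Λ is checked for conditioning and inverted internally
rfm = RandomFeatureMethod(vff, regularization = Λ)
# get_regularization(rfm) ≈ inv(Λ)

# if the inverse is already available, pass it directly and skip the internal inversion
rfm_pre = RandomFeatureMethod(vff, regularization = inv(Λ), regularization_inverted = true)

# fitting now solves ((1/m) Φᵀ(I ⊗ Λ⁻¹)Φ + I) β = Φᵀ(I ⊗ Λ⁻¹) y; the previous
# decomposition_type = "qr"/"svd" overrides in the examples are no longer needed
fitted = fit(rfm, io_pairs)
```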