From 1df3dc09187cac12ed660f4a4013d657c0096cc7 Mon Sep 17 00:00:00 2001 From: Chandan Singh Date: Wed, 4 Oct 2023 19:20:29 -0700 Subject: [PATCH] add AutoIntepretableRegressor --- .../marginal_shrinkage_linear_model.html | 618 ++++++++++++++++-- docs/algebraic/tree_gam.html | 220 +++++-- docs/index.html | 2 +- docs/util/automl.html | 365 ++++++++--- imodels/util/automl.py | 121 +++- readme.md | 2 +- setup.py | 2 +- 7 files changed, 1122 insertions(+), 208 deletions(-) diff --git a/docs/algebraic/marginal_shrinkage_linear_model.html b/docs/algebraic/marginal_shrinkage_linear_model.html index c3318214..73ce0678 100644 --- a/docs/algebraic/marginal_shrinkage_linear_model.html +++ b/docs/algebraic/marginal_shrinkage_linear_model.html @@ -25,11 +25,10 @@ import numpy as np import pandas as pd from sklearn.base import BaseEstimator -from sklearn.linear_model import LinearRegression, RidgeCV, Ridge, ElasticNetCV +from sklearn.linear_model import LinearRegression, RidgeCV, Ridge, ElasticNet, ElasticNetCV from sklearn.tree import DecisionTreeRegressor from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_X_y -from sklearn.utils.validation import _check_sample_weight +from sklearn.utils.validation import check_X_y, check_array, _check_sample_weight from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, roc_auc_score from tqdm import tqdm @@ -114,7 +113,8 @@ self.coef_marginal_ = self._fit_marginal(X, y, sample_weight) # fit main - self.est_main_ = self._fit_main(X, y, sample_weight, self.coef_marginal_) + self.est_main_ = self._fit_main( + X, y, sample_weight, self.coef_marginal_) return self @@ -133,7 +133,8 @@ else: coef_marginal_ = [] for i in range(X.shape[1]): - est_marginal.fit(X[:, i].reshape(-1, 1), y, sample_weight=sample_weight) + est_marginal.fit(X[:, i].reshape(-1, 1), y, + sample_weight=sample_weight) coef_marginal_.append(deepcopy(est_marginal.coef_)) coef_marginal_ = np.vstack(coef_marginal_).squeeze() @@ -223,12 +224,82 @@ # ... +class MarginalLinearModel(BaseEstimator): + """Linear model that only fits marginal effects of each feature. 
+ """ + + def __init__(self, alpha=1.0, l1_ratio=0.5, max_iter=1000, random_state=None): + '''Arguments are passed to sklearn.linear_model.ElasticNet + ''' + self.alpha = alpha + self.l1_ratio = l1_ratio + self.max_iter = max_iter + self.random_state = random_state + + def fit(self, X, y, sample_weight=None): + # checks + X, y = check_X_y(X, y, accept_sparse=False, multi_output=False) + sample_weight = _check_sample_weight(sample_weight, X, dtype=None) + if isinstance(self, ClassifierMixin): + check_classification_targets(y) + self.classes_, y = np.unique(y, return_inverse=True) + + # fit marginal estimator to each feature + coef_marginal_ = [] + for i in range(X.shape[1]): + est_marginal = ElasticNet(alpha=self.alpha, l1_ratio=self.l1_ratio, + max_iter=self.max_iter, random_state=self.random_state) + est_marginal.fit(X[:, i].reshape(-1, 1), y, + sample_weight=sample_weight) + coef_marginal_.append(deepcopy(est_marginal.coef_)) + coef_marginal_ = np.vstack(coef_marginal_).squeeze() + + self.coef_ = coef_marginal_ / X.shape[1] + self.alpha_ = self.alpha + + return self + + def predict_proba(self, X): + X = check_array(X, accept_sparse=False, dtype=None) + return X @ self.coef_ + + def predict(self, X): + probs = self.predict_proba(X) + if isinstance(self, ClassifierMixin): + return np.argmax(probs, axis=1) + else: + return probs + + +class MarginalLinearRegressor(MarginalLinearModel, RegressorMixin): + ... + + +class MarginalLinearClassifier(MarginalLinearModel, ClassifierMixin): + ... + + +# if __name__ == '__main__': +# X, y = imodels.get_clean_dataset('heart') +# X_train, X_test, y_train, y_test = train_test_split( +# X, y, random_state=42, test_size=0.2) +# m = MarginalLinearModelRegressor() + +# m.fit(X_train, y_train) +# print(m.coef_) +# print(m.predict(X_test)) +# print(m.score(X_test, y_test)) + if __name__ == "__main__": # X, y, feature_names = imodels.get_clean_dataset("heart") X, y, feature_names = imodels.get_clean_dataset( **imodels.util.data_util.DSET_KWARGS["california_housing"] ) + # scale the data + X = StandardScaler().fit_transform(X) + y = StandardScaler().fit_transform(y.reshape(-1, 1)).squeeze() + print("shapes", X.shape, y.shape, "nunique", np.unique(y).size) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42, test_size=0.2 @@ -243,28 +314,30 @@ ) results = defaultdict(list) for m in [ - MarginalShrinkageLinearModelRegressor(**kwargs), - MarginalShrinkageLinearModelRegressor(est_marginal_name=None, **kwargs), - MarginalShrinkageLinearModelRegressor( - est_main_name=None, - **kwargs, - ), - MarginalShrinkageLinearModelRegressor( - est_marginal_name="ridge", - est_main_name="ridge", - marginal_sign_constraint=True, - **kwargs, - ), - MarginalShrinkageLinearModelRegressor( - est_marginal_name=None, est_main_name="lasso", **kwargs - ), - MarginalShrinkageLinearModelRegressor( - est_marginal_name="ridge", - est_main_name="lasso", - marginal_sign_constraint=True, - **kwargs, - ), - # RidgeCV(alphas=alphas, fit_intercept=False), + # MarginalShrinkageLinearModelRegressor(**kwargs), + # MarginalShrinkageLinearModelRegressor( + # est_marginal_name=None, **kwargs), + # MarginalShrinkageLinearModelRegressor( + # est_main_name=None, + # **kwargs, + # ), + # MarginalShrinkageLinearModelRegressor( + # est_marginal_name="ridge", + # est_main_name="ridge", + # marginal_sign_constraint=True, + # **kwargs, + # ), + # MarginalShrinkageLinearModelRegressor( + # est_marginal_name=None, est_main_name="lasso", **kwargs + # ), + # 
MarginalShrinkageLinearModelRegressor( + # est_marginal_name="ridge", + # est_main_name="lasso", + # marginal_sign_constraint=True, + # **kwargs, + # ), + MarginalLinearRegressor(alpha=1.0), + RidgeCV(alphas=alphas, fit_intercept=False), ]: results["model_name"].append(str(m)) m.fit(X_train, y_train) @@ -277,11 +350,14 @@ results["test_roc"].append( roc_auc_score(y_test, m.predict_proba(X_test)[:, 1]) ) - results["acc_train"].append(accuracy_score(y_train, m.predict(X_train))) - results["acc_test"].append(accuracy_score(y_test, m.predict(X_test))) + results["acc_train"].append( + accuracy_score(y_train, m.predict(X_train))) + results["acc_test"].append( + accuracy_score(y_test, m.predict(X_test))) else: y_pred = m.predict(X_test) - results["train_mse"].append(np.mean((y_train - m.predict(X_train)) ** 2)) + results["train_mse"].append( + np.mean((y_train - m.predict(X_train)) ** 2)) results["test_mse"].append(np.mean((y_test - y_pred) ** 2)) results["train_r2"].append(m.score(X_train, y_train)) results["test_r2"].append(m.score(X_test, y_test)) @@ -294,10 +370,10 @@ coefs.append(deepcopy(lin.coef_)) print("alpha best", lin.alpha_) - diffs = pd.DataFrame({str(i): coefs[i] for i in range(len(coefs))}) - diffs["diff 0 - 1"] = diffs["0"] - diffs["1"] - diffs["diff 1 - 2"] = diffs["1"] - diffs["2"] - print(diffs) + # diffs = pd.DataFrame({str(i): coefs[i] for i in range(len(coefs))}) + # diffs["diff 0 - 1"] = diffs["0"] - diffs["1"] + # diffs["diff 1 - 2"] = diffs["1"] - diffs["2"] + # print(diffs) # don't round strings with pd.option_context( @@ -315,6 +391,450 @@

Classes

+
+class MarginalLinearClassifier +(alpha=1.0, l1_ratio=0.5, max_iter=1000, random_state=None) +
+
+

Linear model that only fits marginal effects of each feature.

+

Arguments are passed to sklearn.linear_model.ElasticNet

+
+ +Expand source code + +
class MarginalLinearClassifier(MarginalLinearModel, ClassifierMixin):
+    ...
+
+
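As a minimal illustration of the classifier documented here, a fit-only sketch on synthetic data (the import path is an assumption; prediction behaviour follows the base-class code shown further below):

import numpy as np
from imodels.algebraic.marginal_shrinkage_linear_model import MarginalLinearClassifier  # assumed path

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = (X[:, 0] + X[:, 1] > 0).astype(int)   # binary 0/1 target

clf = MarginalLinearClassifier(alpha=0.01)
clf.fit(X, y)              # labels are encoded internally via np.unique
print(clf.classes_)        # unique class labels seen during fit
print(clf.coef_)           # one marginal coefficient per feature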

Ancestors

+
    +
  • MarginalLinearModel
  • +
  • sklearn.base.BaseEstimator
  • +
  • sklearn.utils._metadata_requests._MetadataRequester
  • +
  • sklearn.base.ClassifierMixin
  • +
+

Methods

+
+
+def set_score_request(self: MarginalLinearClassifier, *, sample_weight: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> MarginalLinearClassifier +
+
+

Request metadata passed to the score method.

+

Note that this method is only relevant if +enable_metadata_routing=True (see :func:sklearn.set_config). +Please see :ref:User Guide <metadata_routing> on how the routing +mechanism works.

+

The options for each parameter are:

+
    +
  • +

    True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.

    +
  • +
  • +

    False: metadata is not requested and the meta-estimator will not pass it to score.

    +
  • +
  • +

    None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

    +
  • +
  • +

    str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

    +
  • +
+

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the +existing request. This allows you to change the request for some +parameters and not others.

+
+

Added in version: 1.3

+
+
+

Note

+

This method is only relevant if this estimator is used as a +sub-estimator of a meta-estimator, e.g. used inside a +:class:pipeline.Pipeline. Otherwise it has no effect.

+
+

Parameters

+
+
sample_weight : str, True, False, or None, +default=sklearn.utils.metadata_routing.UNCHANGED
+
Metadata routing for sample_weight parameter in score.
+
+

Returns

+
+
self : object
+
The updated object.
+
+
+ +Expand source code + +
def func(**kw):
+    """Updates the request for provided parameters
+
+    This docstring is overwritten below.
+    See REQUESTER_DOC for expected functionality
+    """
+    if not _routing_enabled():
+        raise RuntimeError(
+            "This method is only available when metadata routing is enabled."
+            " You can enable it using"
+            " sklearn.set_config(enable_metadata_routing=True)."
+        )
+
+    if self.validate_keys and (set(kw) - set(self.keys)):
+        raise TypeError(
+            f"Unexpected args: {set(kw) - set(self.keys)}. Accepted arguments"
+            f" are: {set(self.keys)}"
+        )
+
+    requests = instance._get_metadata_request()
+    method_metadata_request = getattr(requests, self.name)
+
+    for prop, alias in kw.items():
+        if alias is not UNCHANGED:
+            method_metadata_request.add_request(param=prop, alias=alias)
+    instance._metadata_request = requests
+
+    return instance
+
+
+
+

Inherited members

+ +
+
+class MarginalLinearModel +(alpha=1.0, l1_ratio=0.5, max_iter=1000, random_state=None) +
+
+

Linear model that only fits marginal effects of each feature.

+

Arguments are passed to sklearn.linear_model.ElasticNet

+
+ +Expand source code + +
class MarginalLinearModel(BaseEstimator):
+    """Linear model that only fits marginal effects of each feature.
+    """
+
+    def __init__(self, alpha=1.0, l1_ratio=0.5, max_iter=1000, random_state=None):
+        '''Arguments are passed to sklearn.linear_model.ElasticNet
+        '''
+        self.alpha = alpha
+        self.l1_ratio = l1_ratio
+        self.max_iter = max_iter
+        self.random_state = random_state
+
+    def fit(self, X, y, sample_weight=None):
+        # checks
+        X, y = check_X_y(X, y, accept_sparse=False, multi_output=False)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
+        if isinstance(self, ClassifierMixin):
+            check_classification_targets(y)
+            self.classes_, y = np.unique(y, return_inverse=True)
+
+        # fit marginal estimator to each feature
+        coef_marginal_ = []
+        for i in range(X.shape[1]):
+            est_marginal = ElasticNet(alpha=self.alpha, l1_ratio=self.l1_ratio,
+                                      max_iter=self.max_iter, random_state=self.random_state)
+            est_marginal.fit(X[:, i].reshape(-1, 1), y,
+                             sample_weight=sample_weight)
+            coef_marginal_.append(deepcopy(est_marginal.coef_))
+        coef_marginal_ = np.vstack(coef_marginal_).squeeze()
+
+        self.coef_ = coef_marginal_ / X.shape[1]
+        self.alpha_ = self.alpha
+
+        return self
+
+    def predict_proba(self, X):
+        X = check_array(X, accept_sparse=False, dtype=None)
+        return X @ self.coef_
+
+    def predict(self, X):
+        probs = self.predict_proba(X)
+        if isinstance(self, ClassifierMixin):
+            return np.argmax(probs, axis=1)
+        else:
+            return probs
+
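The source above shows that each coefficient comes from a single-feature ElasticNet fit, averaged over the number of features. A minimal regression usage sketch, reusing the dataset and scaling from the module's __main__ block (the import path is an assumption):

import imodels
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imodels.algebraic.marginal_shrinkage_linear_model import MarginalLinearRegressor  # assumed path

X, y, feature_names = imodels.get_clean_dataset(
    **imodels.util.data_util.DSET_KWARGS["california_housing"])
X = StandardScaler().fit_transform(X)
y = StandardScaler().fit_transform(y.reshape(-1, 1)).squeeze()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

m = MarginalLinearRegressor(alpha=1.0)
m.fit(X_train, y_train)
print(m.coef_)                  # one ElasticNet coefficient per feature, divided by n_features
print(m.score(X_test, y_test))  # R^2 via RegressorMixin.score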
+

Ancestors

+
    +
  • sklearn.base.BaseEstimator
  • +
  • sklearn.utils._metadata_requests._MetadataRequester
  • +
+

Subclasses

+ +

Methods

+
+
+def fit(self, X, y, sample_weight=None) +
+
+
+
+ +Expand source code + +
def fit(self, X, y, sample_weight=None):
+    # checks
+    X, y = check_X_y(X, y, accept_sparse=False, multi_output=False)
+    sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
+    if isinstance(self, ClassifierMixin):
+        check_classification_targets(y)
+        self.classes_, y = np.unique(y, return_inverse=True)
+
+    # fit marginal estimator to each feature
+    coef_marginal_ = []
+    for i in range(X.shape[1]):
+        est_marginal = ElasticNet(alpha=self.alpha, l1_ratio=self.l1_ratio,
+                                  max_iter=self.max_iter, random_state=self.random_state)
+        est_marginal.fit(X[:, i].reshape(-1, 1), y,
+                         sample_weight=sample_weight)
+        coef_marginal_.append(deepcopy(est_marginal.coef_))
+    coef_marginal_ = np.vstack(coef_marginal_).squeeze()
+
+    self.coef_ = coef_marginal_ / X.shape[1]
+    self.alpha_ = self.alpha
+
+    return self
+
+
+
+def predict(self, X) +
+
+
+
+ +Expand source code + +
def predict(self, X):
+    probs = self.predict_proba(X)
+    if isinstance(self, ClassifierMixin):
+        return np.argmax(probs, axis=1)
+    else:
+        return probs
+
+
+
+def predict_proba(self, X) +
+
+
+
+ +Expand source code + +
def predict_proba(self, X):
+    X = check_array(X, accept_sparse=False, dtype=None)
+    return X @ self.coef_
+
+
+
+def set_fit_request(self: MarginalLinearModel, *, sample_weight: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> MarginalLinearModel +
+
+

Request metadata passed to the fit method.

+

Note that this method is only relevant if +enable_metadata_routing=True (see :func:sklearn.set_config). +Please see :ref:User Guide <metadata_routing> on how the routing +mechanism works.

+

The options for each parameter are:

+
    +
  • +

    True: metadata is requested, and passed to fit if provided. The request is ignored if metadata is not provided.

    +
  • +
  • +

    False: metadata is not requested and the meta-estimator will not pass it to fit.

    +
  • +
  • +

    None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

    +
  • +
  • +

    str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

    +
  • +
+

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the +existing request. This allows you to change the request for some +parameters and not others.

+
+

Added in version: 1.3

+
+
+

Note

+

This method is only relevant if this estimator is used as a +sub-estimator of a meta-estimator, e.g. used inside a +:class:pipeline.Pipeline. Otherwise it has no effect.

+
+

Parameters

+
+
sample_weight : str, True, False, or None, +default=sklearn.utils.metadata_routing.UNCHANGED
+
Metadata routing for sample_weight parameter in fit.
+
+

Returns

+
+
self : object
+
The updated object.
+
+
+ +Expand source code + +
def func(**kw):
+    """Updates the request for provided parameters
+
+    This docstring is overwritten below.
+    See REQUESTER_DOC for expected functionality
+    """
+    if not _routing_enabled():
+        raise RuntimeError(
+            "This method is only available when metadata routing is enabled."
+            " You can enable it using"
+            " sklearn.set_config(enable_metadata_routing=True)."
+        )
+
+    if self.validate_keys and (set(kw) - set(self.keys)):
+        raise TypeError(
+            f"Unexpected args: {set(kw) - set(self.keys)}. Accepted arguments"
+            f" are: {set(self.keys)}"
+        )
+
+    requests = instance._get_metadata_request()
+    method_metadata_request = getattr(requests, self.name)
+
+    for prop, alias in kw.items():
+        if alias is not UNCHANGED:
+            method_metadata_request.add_request(param=prop, alias=alias)
+    instance._metadata_request = requests
+
+    return instance
+
+
+
+
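The set_fit_request / set_score_request entries above come from scikit-learn's metadata-routing machinery (scikit-learn >= 1.3). A hedged sketch of the usual pattern; the request only matters when the estimator sits inside a meta-estimator such as a Pipeline or GridSearchCV:

import numpy as np
import sklearn
from imodels.algebraic.marginal_shrinkage_linear_model import MarginalLinearRegressor  # assumed path

sklearn.set_config(enable_metadata_routing=True)   # routing is off by default

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([1.0, 0.5, 0.0])
w = np.ones(len(y))

# ask wrapping meta-estimators to forward sample_weight on to fit();
# a direct fit call is unaffected by the request itself
est = MarginalLinearRegressor(alpha=0.1).set_fit_request(sample_weight=True)
est.fit(X, y, sample_weight=w)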
+
+class MarginalLinearRegressor +(alpha=1.0, l1_ratio=0.5, max_iter=1000, random_state=None) +
+
+

Linear model that only fits marginal effects of each feature.

+

Arguments are passed to sklearn.linear_model.ElasticNet

+
+ +Expand source code + +
class MarginalLinearRegressor(MarginalLinearModel, RegressorMixin):
+    ...
+
+

Ancestors

+
    +
  • MarginalLinearModel
  • +
  • sklearn.base.BaseEstimator
  • +
  • sklearn.utils._metadata_requests._MetadataRequester
  • +
  • sklearn.base.RegressorMixin
  • +
+

Methods

+
+
+def set_score_request(self: MarginalLinearRegressor, *, sample_weight: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> MarginalLinearRegressor +
+
+

Request metadata passed to the score method.

+

Note that this method is only relevant if +enable_metadata_routing=True (see :func:sklearn.set_config). +Please see :ref:User Guide <metadata_routing> on how the routing +mechanism works.

+

The options for each parameter are:

+
    +
  • +

    True: metadata is requested, and passed to score if provided. The request is ignored if metadata is not provided.

    +
  • +
  • +

    False: metadata is not requested and the meta-estimator will not pass it to score.

    +
  • +
  • +

    None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

    +
  • +
  • +

    str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

    +
  • +
+

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the +existing request. This allows you to change the request for some +parameters and not others.

+
+

Added in version: 1.3

+
+
+

Note

+

This method is only relevant if this estimator is used as a +sub-estimator of a meta-estimator, e.g. used inside a +:class:pipeline.Pipeline. Otherwise it has no effect.

+
+

Parameters

+
+
sample_weight : str, True, False, or None, +default=sklearn.utils.metadata_routing.UNCHANGED
+
Metadata routing for sample_weight parameter in score.
+
+

Returns

+
+
self : object
+
The updated object.
+
+
+ +Expand source code + +
def func(**kw):
+    """Updates the request for provided parameters
+
+    This docstring is overwritten below.
+    See REQUESTER_DOC for expected functionality
+    """
+    if not _routing_enabled():
+        raise RuntimeError(
+            "This method is only available when metadata routing is enabled."
+            " You can enable it using"
+            " sklearn.set_config(enable_metadata_routing=True)."
+        )
+
+    if self.validate_keys and (set(kw) - set(self.keys)):
+        raise TypeError(
+            f"Unexpected args: {set(kw) - set(self.keys)}. Accepted arguments"
+            f" are: {set(self.keys)}"
+        )
+
+    requests = instance._get_metadata_request()
+    method_metadata_request = getattr(requests, self.name)
+
+    for prop, alias in kw.items():
+        if alias is not UNCHANGED:
+            method_metadata_request.add_request(param=prop, alias=alias)
+    instance._metadata_request = requests
+
+    return instance
+
+
+
+

Inherited members

+ +
class MarginalShrinkageLinearModel (est_marginal_name='ridge', est_main_name='ridge', marginal_divide_by_d=True, marginal_sign_constraint=False, alphas=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0], elasticnet_ratio=0.5, random_state=None) @@ -417,7 +937,8 @@

Params

self.coef_marginal_ = self._fit_marginal(X, y, sample_weight) # fit main - self.est_main_ = self._fit_main(X, y, sample_weight, self.coef_marginal_) + self.est_main_ = self._fit_main( + X, y, sample_weight, self.coef_marginal_) return self @@ -436,7 +957,8 @@

Params

else: coef_marginal_ = [] for i in range(X.shape[1]): - est_marginal.fit(X[:, i].reshape(-1, 1), y, sample_weight=sample_weight) + est_marginal.fit(X[:, i].reshape(-1, 1), y, + sample_weight=sample_weight) coef_marginal_.append(deepcopy(est_marginal.coef_)) coef_marginal_ = np.vstack(coef_marginal_).squeeze() @@ -553,7 +1075,8 @@

Methods

self.coef_marginal_ = self._fit_marginal(X, y, sample_weight) # fit main - self.est_main_ = self._fit_main(X, y, sample_weight, self.coef_marginal_) + self.est_main_ = self._fit_main( + X, y, sample_weight, self.coef_marginal_) return self
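A short usage sketch for the shrinkage estimator documented above, mirroring one of the configurations listed in the module's (now commented-out) __main__ block; the import path is an assumption:

import imodels
from sklearn.model_selection import train_test_split
from imodels.algebraic.marginal_shrinkage_linear_model import (
    MarginalShrinkageLinearModelRegressor,  # assumed path
)

X, y, feature_names = imodels.get_clean_dataset(
    **imodels.util.data_util.DSET_KWARGS["california_housing"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

# single-feature ("marginal") ridge fits, then a lasso main model that is
# tied to the marginal fits through the sign constraint
m = MarginalShrinkageLinearModelRegressor(
    est_marginal_name="ridge",
    est_main_name="lasso",
    marginal_sign_constraint=True,
    random_state=42,
)
m.fit(X_train, y_train)
print(m.score(X_test, y_test))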
@@ -820,6 +1343,27 @@

Index 🔍

  • Classes

    • +

      MarginalLinearClassifier

      + +
    • +
    • +

      MarginalLinearModel

      + +
    • +
    • +

      MarginalLinearRegressor

      + +
    • +
    • MarginalShrinkageLinearModel

      • fit
      • diff --git a/docs/algebraic/tree_gam.html b/docs/algebraic/tree_gam.html index 8a423683..960e2a76 100644 --- a/docs/algebraic/tree_gam.html +++ b/docs/algebraic/tree_gam.html @@ -25,7 +25,7 @@ import numpy as np import pandas as pd from sklearn.base import BaseEstimator -from sklearn.linear_model import LinearRegression, RidgeCV +from sklearn.linear_model import ElasticNetCV, LinearRegression, RidgeCV from sklearn.tree import DecisionTreeRegressor from sklearn.utils.validation import check_is_fitted from sklearn.utils import check_array @@ -62,6 +62,7 @@ fit_linear_marginal=None, select_linear_marginal=False, decay_rate_towards_marginal=1.0, + fit_posthoc_tree_coefs=None, boosting_strategy="cyclic", validation_frac=0.15, random_state=None, @@ -96,6 +97,8 @@ 1 means no decay, 0 means only use marginal effects shape = (1 - decay_rate_towards_marginal) * shape + decay_rate_towards_marginal * marginal_shape The way this is implemented is by keeping track of how many times to multiply decay_rate_towards_marginal for each cyclic estimator + fit_posthoc_tree_coefs: str [None, "ridge"] + Whether to fit a linear model to the tree coefficients after fitting the cyclic boosting. boosting_strategy : str ["cyclic", "greedy"] Whether to use cyclic boosting (cycle over features) or greedy boosting (select best feature at each step) validation_frac: float @@ -113,6 +116,7 @@ self.fit_linear_marginal = fit_linear_marginal self.select_linear_marginal = select_linear_marginal self.decay_rate_towards_marginal = decay_rate_towards_marginal + self.fit_posthoc_tree_coefs = fit_posthoc_tree_coefs self.boosting_strategy = boosting_strategy self.validation_frac = validation_frac self.random_state = random_state @@ -139,6 +143,7 @@ sample_weight, test_size=self.validation_frac, random_state=self.random_state, + stratify=y if isinstance(self, ClassifierMixin) else None, ) self.estimators_marginal = [] @@ -162,6 +167,9 @@ sample_weight_val, ) + if self.fit_posthoc_tree_coefs is not None: + self._fit_posthoc_tree_coefs(X_train, y_train, sample_weight_train) + self.mse_val_ = self._calc_mse(X_val, y_val, sample_weight_val) return self @@ -186,7 +194,8 @@ ) est.fit(X_, residuals_train, sample_weight=sample_weight_train) if self.reg_param_marginal > 0: - est = imodels.HSTreeRegressor(est, reg_param=self.reg_param_marginal) + est = imodels.HSTreeRegressor( + est, reg_param=self.reg_param_marginal) self.estimators_marginal.append(est) if ( @@ -196,9 +205,11 @@ if self.fit_linear_marginal.lower() == "ridge": linear_marginal = RidgeCV(fit_intercept=False) elif self.fit_linear_marginal == "NNLS": - linear_marginal = LinearRegression(fit_intercept=False, positive=True) + linear_marginal = LinearRegression( + fit_intercept=False, positive=True) linear_marginal.fit( - np.array([est.predict(X_train) for est in self.estimators_marginal]).T, + np.array([est.predict(X_train) + for est in self.estimators_marginal]).T, residuals_train, sample_weight_train, ) @@ -235,12 +246,14 @@ ) est.fit(X_, residuals_train, sample_weight=sample_weight_train) succesfully_split_on_feature = np.all( - (est.tree_.feature[0] == feature_num) | (est.tree_.feature[0] == -2) + (est.tree_.feature[0] == feature_num) | ( + est.tree_.feature[0] == -2) ) if not succesfully_split_on_feature: continue if self.reg_param > 0: - est = imodels.HSTreeRegressor(est, reg_param=self.reg_param) + est = imodels.HSTreeRegressor( + est, reg_param=self.reg_param) self.estimators_.append(est) residuals_train_new = ( residuals_train - self.learning_rate * 
est.predict(X_train) @@ -252,20 +265,23 @@ X_train, y_train, sample_weight_train ) # don't add each estimator for greedy - boosting_round_ests.append(deepcopy(self.estimators_.pop())) + boosting_round_ests.append( + deepcopy(self.estimators_.pop())) boosting_round_mses.append(mse_train_new) if self.boosting_strategy == "greedy": best_est = boosting_round_ests[np.argmin(boosting_round_mses)] self.estimators_.append(best_est) residuals_train = ( - residuals_train - self.learning_rate * best_est.predict(X_train) + residuals_train - self.learning_rate * + best_est.predict(X_train) ) # decay marginal effects if self.decay_rate_towards_marginal < 1.0: new_decay_coefs = [self.decay_rate_towards_marginal] * ( - len(self.estimators_) - len(self.decay_coef_towards_marginal_) + len(self.estimators_) - + len(self.decay_coef_towards_marginal_) ) # print(self.decay_coef_towards_marginal_) # print('new_decay_coefs', new_decay_coefs) @@ -283,6 +299,25 @@ else: mse_val = mse_val_new + def _fit_posthoc_tree_coefs(self, X, y, sample_weight=None): + # extract predictions from each tree + X_pred_tree = np.array([est.predict(X) for est in self.estimators_]).T + print('shapes', X.shape, X_pred_tree.shape, + y.shape, len(self.estimators_)) + + coef_prior = np.ones(len(self.estimators_)) * self.learning_rate + y = y - self.bias_ - X_pred_tree @ coef_prior + + if self.fit_posthoc_tree_coefs.lower() == "ridge": + m = RidgeCV(fit_intercept=False) + elif self.fit_posthoc_tree_coefs.lower() == "nnls": + m = LinearRegression(fit_intercept=False, positive=True) + elif self.fit_posthoc_tree_coefs.lower() == "elasticnet": + m = ElasticNetCV(fit_intercept=False, positive=True) + + m.fit(X_pred_tree, y, sample_weight=sample_weight) + self.cyclic_coef_ = m.coef_ + coef_prior + def predict_proba(self, X, marginal_only=False): """ Params @@ -293,22 +328,33 @@ X = check_array(X, accept_sparse=False, dtype=None) check_is_fitted(self) probs1 = np.ones(X.shape[0]) * self.bias_ + + # marginal prediction for i, est in enumerate(self.estimators_marginal): probs1 += est.predict(X) * self.marginal_coef_[i] + + # cyclic coefs prediction if not marginal_only: + if not hasattr(self, "cyclic_coef_"): + cyclic_coef_ = np.ones( + len(self.estimators_)) * self.learning_rate + else: + cyclic_coef_ = self.cyclic_coef_ + # print('coef', cyclic_coef_) + if self.decay_rate_towards_marginal < 1.0: for i, est in enumerate(self.estimators_): if i < len(self.decay_coef_towards_marginal_): probs1 += ( - self.learning_rate + cyclic_coef_[i] * self.decay_coef_towards_marginal_[i] * est.predict(X) ) else: - probs1 += self.learning_rate * est.predict(X) + probs1 += cyclic_coef_[i] * est.predict(X) else: - for est in self.estimators_: - probs1 += self.learning_rate * est.predict(X) + for i, est in enumerate(self.estimators_): + probs1 += cyclic_coef_[i] * est.predict(X) probs1 = np.clip(probs1, a_min=0, a_max=1) return np.array([1 - probs1, probs1]).T @@ -340,32 +386,35 @@ boosting_strategy="cyclic", random_state=42, learning_rate=0.1, - max_leaf_nodes=2, - select_linear_marginal=True, - fit_linear_marginal="NNLS", - n_boosting_rounds_marginal=3, - decay_rate_towards_marginal=0, - n_boosting_rounds=10, + max_leaf_nodes=3, + # select_linear_marginal=True, + # fit_linear_marginal="NNLS", + # n_boosting_rounds_marginal=3, + # decay_rate_towards_marginal=0, + fit_posthoc_tree_coefs="elasticnet", + n_boosting_rounds=100, ) gam.fit(X, y_train) # check roc auc score y_pred = gam.predict_proba(X_test)[:, 1] - print( - "train roc:", - roc_auc_score(y_train, 
gam.predict_proba(X)[:, 1]).round(3), - ) + # print( + # "train roc:", + # roc_auc_score(y_train, gam.predict_proba(X)[:, 1]).round(3), + # ) print("test roc:", roc_auc_score(y_test, y_pred).round(3)) - print( - "accs", - accuracy_score(y_train, gam.predict(X)).round(3), - accuracy_score(y_test, gam.predict(X_test)).round(3), - "imb", - np.mean(y_train).round(3), - np.mean(y_test).round(3), - ) - - # print(gam.estimators_) + print("test acc:", accuracy_score(y_test, gam.predict(X_test)).round(3)) + print('\t(imb:', np.mean(y_test).round(3), ')') + # print( + # "accs", + # accuracy_score(y_train, gam.predict(X)).round(3), + # accuracy_score(y_test, gam.predict(X_test)).round(3), + # "imb", + # np.mean(y_train).round(3), + # np.mean(y_test).round(3), + # ) + + # # print(gam.estimators_)
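The new fit_posthoc_tree_coefs option exercised above refits one coefficient per boosted tree after cyclic boosting, starting from the uniform learning-rate prior. A standalone sketch of that idea, mirroring _fit_posthoc_tree_coefs with the "ridge" option (the function name and arguments here are illustrative):

import numpy as np
from sklearn.linear_model import RidgeCV

def refit_tree_coefs(estimators, X, y, bias, learning_rate):
    # one prediction column per boosted tree
    preds = np.array([est.predict(X) for est in estimators]).T
    # every tree starts out with the shared learning-rate coefficient
    coef_prior = np.full(len(estimators), learning_rate)
    # refit a penalized linear model on the residual the prior leaves behind
    residual = y - bias - preds @ coef_prior
    m = RidgeCV(fit_intercept=False).fit(preds, residual)
    # the refit coefficients replace the single shared learning rate
    return m.coef_ + coef_prior

The "nnls" and "elasticnet" options swap the ridge solver for a positivity-constrained LinearRegression or ElasticNetCV, as in the method's source.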
  • @@ -379,7 +428,7 @@

    Classes

    class TreeGAM -(n_boosting_rounds=100, max_leaf_nodes=3, reg_param=0.0, learning_rate: float = 0.01, n_boosting_rounds_marginal=0, max_leaf_nodes_marginal=2, reg_param_marginal=0.0, fit_linear_marginal=None, select_linear_marginal=False, decay_rate_towards_marginal=1.0, boosting_strategy='cyclic', validation_frac=0.15, random_state=None) +(n_boosting_rounds=100, max_leaf_nodes=3, reg_param=0.0, learning_rate: float = 0.01, n_boosting_rounds_marginal=0, max_leaf_nodes_marginal=2, reg_param_marginal=0.0, fit_linear_marginal=None, select_linear_marginal=False, decay_rate_towards_marginal=1.0, fit_posthoc_tree_coefs=None, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)

    Tree-based GAM classifier. @@ -415,6 +464,8 @@

    Params

    1 means no decay, 0 means only use marginal effects shape = (1 - decay_rate_towards_marginal) * shape + decay_rate_towards_marginal * marginal_shape The way this is implemented is by keeping track of how many times to multiply decay_rate_towards_marginal for each cyclic estimator +fit_posthoc_tree_coefs: str [None, "ridge"] +Whether to fit a linear model to the tree coefficients after fitting the cyclic boosting. boosting_strategy : str ["cyclic", "greedy"] Whether to use cyclic boosting (cycle over features) or greedy boosting (select best feature at each step) validation_frac: float @@ -445,6 +496,7 @@

    Params

    fit_linear_marginal=None, select_linear_marginal=False, decay_rate_towards_marginal=1.0, + fit_posthoc_tree_coefs=None, boosting_strategy="cyclic", validation_frac=0.15, random_state=None, @@ -479,6 +531,8 @@

    Params

    1 means no decay, 0 means only use marginal effects shape = (1 - decay_rate_towards_marginal) * shape + decay_rate_towards_marginal * marginal_shape The way this is implemented is by keeping track of how many times to multiply decay_rate_towards_marginal for each cyclic estimator + fit_posthoc_tree_coefs: str [None, "ridge"] + Whether to fit a linear model to the tree coefficients after fitting the cyclic boosting. boosting_strategy : str ["cyclic", "greedy"] Whether to use cyclic boosting (cycle over features) or greedy boosting (select best feature at each step) validation_frac: float @@ -496,6 +550,7 @@

    Params

    self.fit_linear_marginal = fit_linear_marginal self.select_linear_marginal = select_linear_marginal self.decay_rate_towards_marginal = decay_rate_towards_marginal + self.fit_posthoc_tree_coefs = fit_posthoc_tree_coefs self.boosting_strategy = boosting_strategy self.validation_frac = validation_frac self.random_state = random_state @@ -522,6 +577,7 @@

    Params

    sample_weight, test_size=self.validation_frac, random_state=self.random_state, + stratify=y if isinstance(self, ClassifierMixin) else None, ) self.estimators_marginal = [] @@ -545,6 +601,9 @@

    Params

    sample_weight_val, ) + if self.fit_posthoc_tree_coefs is not None: + self._fit_posthoc_tree_coefs(X_train, y_train, sample_weight_train) + self.mse_val_ = self._calc_mse(X_val, y_val, sample_weight_val) return self @@ -569,7 +628,8 @@

    Params

    ) est.fit(X_, residuals_train, sample_weight=sample_weight_train) if self.reg_param_marginal > 0: - est = imodels.HSTreeRegressor(est, reg_param=self.reg_param_marginal) + est = imodels.HSTreeRegressor( + est, reg_param=self.reg_param_marginal) self.estimators_marginal.append(est) if ( @@ -579,9 +639,11 @@

    Params

    if self.fit_linear_marginal.lower() == "ridge": linear_marginal = RidgeCV(fit_intercept=False) elif self.fit_linear_marginal == "NNLS": - linear_marginal = LinearRegression(fit_intercept=False, positive=True) + linear_marginal = LinearRegression( + fit_intercept=False, positive=True) linear_marginal.fit( - np.array([est.predict(X_train) for est in self.estimators_marginal]).T, + np.array([est.predict(X_train) + for est in self.estimators_marginal]).T, residuals_train, sample_weight_train, ) @@ -618,12 +680,14 @@

    Params

    ) est.fit(X_, residuals_train, sample_weight=sample_weight_train) succesfully_split_on_feature = np.all( - (est.tree_.feature[0] == feature_num) | (est.tree_.feature[0] == -2) + (est.tree_.feature[0] == feature_num) | ( + est.tree_.feature[0] == -2) ) if not succesfully_split_on_feature: continue if self.reg_param > 0: - est = imodels.HSTreeRegressor(est, reg_param=self.reg_param) + est = imodels.HSTreeRegressor( + est, reg_param=self.reg_param) self.estimators_.append(est) residuals_train_new = ( residuals_train - self.learning_rate * est.predict(X_train) @@ -635,20 +699,23 @@

    Params

    X_train, y_train, sample_weight_train ) # don't add each estimator for greedy - boosting_round_ests.append(deepcopy(self.estimators_.pop())) + boosting_round_ests.append( + deepcopy(self.estimators_.pop())) boosting_round_mses.append(mse_train_new) if self.boosting_strategy == "greedy": best_est = boosting_round_ests[np.argmin(boosting_round_mses)] self.estimators_.append(best_est) residuals_train = ( - residuals_train - self.learning_rate * best_est.predict(X_train) + residuals_train - self.learning_rate * + best_est.predict(X_train) ) # decay marginal effects if self.decay_rate_towards_marginal < 1.0: new_decay_coefs = [self.decay_rate_towards_marginal] * ( - len(self.estimators_) - len(self.decay_coef_towards_marginal_) + len(self.estimators_) - + len(self.decay_coef_towards_marginal_) ) # print(self.decay_coef_towards_marginal_) # print('new_decay_coefs', new_decay_coefs) @@ -666,6 +733,25 @@

    Params

    else: mse_val = mse_val_new + def _fit_posthoc_tree_coefs(self, X, y, sample_weight=None): + # extract predictions from each tree + X_pred_tree = np.array([est.predict(X) for est in self.estimators_]).T + print('shapes', X.shape, X_pred_tree.shape, + y.shape, len(self.estimators_)) + + coef_prior = np.ones(len(self.estimators_)) * self.learning_rate + y = y - self.bias_ - X_pred_tree @ coef_prior + + if self.fit_posthoc_tree_coefs.lower() == "ridge": + m = RidgeCV(fit_intercept=False) + elif self.fit_posthoc_tree_coefs.lower() == "nnls": + m = LinearRegression(fit_intercept=False, positive=True) + elif self.fit_posthoc_tree_coefs.lower() == "elasticnet": + m = ElasticNetCV(fit_intercept=False, positive=True) + + m.fit(X_pred_tree, y, sample_weight=sample_weight) + self.cyclic_coef_ = m.coef_ + coef_prior + def predict_proba(self, X, marginal_only=False): """ Params @@ -676,22 +762,33 @@

    Params

    X = check_array(X, accept_sparse=False, dtype=None) check_is_fitted(self) probs1 = np.ones(X.shape[0]) * self.bias_ + + # marginal prediction for i, est in enumerate(self.estimators_marginal): probs1 += est.predict(X) * self.marginal_coef_[i] + + # cyclic coefs prediction if not marginal_only: + if not hasattr(self, "cyclic_coef_"): + cyclic_coef_ = np.ones( + len(self.estimators_)) * self.learning_rate + else: + cyclic_coef_ = self.cyclic_coef_ + # print('coef', cyclic_coef_) + if self.decay_rate_towards_marginal < 1.0: for i, est in enumerate(self.estimators_): if i < len(self.decay_coef_towards_marginal_): probs1 += ( - self.learning_rate + cyclic_coef_[i] * self.decay_coef_towards_marginal_[i] * est.predict(X) ) else: - probs1 += self.learning_rate * est.predict(X) + probs1 += cyclic_coef_[i] * est.predict(X) else: - for est in self.estimators_: - probs1 += self.learning_rate * est.predict(X) + for i, est in enumerate(self.estimators_): + probs1 += cyclic_coef_[i] * est.predict(X) probs1 = np.clip(probs1, a_min=0, a_max=1) return np.array([1 - probs1, probs1]).T @@ -750,6 +847,7 @@

    Methods

    sample_weight, test_size=self.validation_frac, random_state=self.random_state, + stratify=y if isinstance(self, ClassifierMixin) else None, ) self.estimators_marginal = [] @@ -773,6 +871,9 @@

    Methods

    sample_weight_val, ) + if self.fit_posthoc_tree_coefs is not None: + self._fit_posthoc_tree_coefs(X_train, y_train, sample_weight_train) + self.mse_val_ = self._calc_mse(X_val, y_val, sample_weight_val) return self @@ -815,22 +916,33 @@

    Methods

    X = check_array(X, accept_sparse=False, dtype=None) check_is_fitted(self) probs1 = np.ones(X.shape[0]) * self.bias_ + + # marginal prediction for i, est in enumerate(self.estimators_marginal): probs1 += est.predict(X) * self.marginal_coef_[i] + + # cyclic coefs prediction if not marginal_only: + if not hasattr(self, "cyclic_coef_"): + cyclic_coef_ = np.ones( + len(self.estimators_)) * self.learning_rate + else: + cyclic_coef_ = self.cyclic_coef_ + # print('coef', cyclic_coef_) + if self.decay_rate_towards_marginal < 1.0: for i, est in enumerate(self.estimators_): if i < len(self.decay_coef_towards_marginal_): probs1 += ( - self.learning_rate + cyclic_coef_[i] * self.decay_coef_towards_marginal_[i] * est.predict(X) ) else: - probs1 += self.learning_rate * est.predict(X) + probs1 += cyclic_coef_[i] * est.predict(X) else: - for est in self.estimators_: - probs1 += self.learning_rate * est.predict(X) + for i, est in enumerate(self.estimators_): + probs1 += cyclic_coef_[i] * est.predict(X) probs1 = np.clip(probs1, a_min=0, a_max=1) return np.array([1 - probs1, probs1]).T @@ -1082,7 +1194,7 @@

    Returns

    class TreeGAMClassifier -(n_boosting_rounds=100, max_leaf_nodes=3, reg_param=0.0, learning_rate: float = 0.01, n_boosting_rounds_marginal=0, max_leaf_nodes_marginal=2, reg_param_marginal=0.0, fit_linear_marginal=None, select_linear_marginal=False, decay_rate_towards_marginal=1.0, boosting_strategy='cyclic', validation_frac=0.15, random_state=None) +(n_boosting_rounds=100, max_leaf_nodes=3, reg_param=0.0, learning_rate: float = 0.01, n_boosting_rounds_marginal=0, max_leaf_nodes_marginal=2, reg_param_marginal=0.0, fit_linear_marginal=None, select_linear_marginal=False, decay_rate_towards_marginal=1.0, fit_posthoc_tree_coefs=None, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)

    Tree-based GAM classifier. @@ -1118,6 +1230,8 @@

    Params

    1 means no decay, 0 means only use marginal effects shape = (1 - decay_rate_towards_marginal) * shape + decay_rate_towards_marginal * marginal_shape The way this is implemented is by keeping track of how many times to multiply decay_rate_towards_marginal for each cyclic estimator +fit_posthoc_tree_coefs: str [None, "ridge"] +Whether to fit a linear model to the tree coefficients after fitting the cyclic boosting. boosting_strategy : str ["cyclic", "greedy"] Whether to use cyclic boosting (cycle over features) or greedy boosting (select best feature at each step) validation_frac: float @@ -1236,7 +1350,7 @@

    Inherited members

    class TreeGAMRegressor -(n_boosting_rounds=100, max_leaf_nodes=3, reg_param=0.0, learning_rate: float = 0.01, n_boosting_rounds_marginal=0, max_leaf_nodes_marginal=2, reg_param_marginal=0.0, fit_linear_marginal=None, select_linear_marginal=False, decay_rate_towards_marginal=1.0, boosting_strategy='cyclic', validation_frac=0.15, random_state=None) +(n_boosting_rounds=100, max_leaf_nodes=3, reg_param=0.0, learning_rate: float = 0.01, n_boosting_rounds_marginal=0, max_leaf_nodes_marginal=2, reg_param_marginal=0.0, fit_linear_marginal=None, select_linear_marginal=False, decay_rate_towards_marginal=1.0, fit_posthoc_tree_coefs=None, boosting_strategy='cyclic', validation_frac=0.15, random_state=None)

    Tree-based GAM classifier. @@ -1272,6 +1386,8 @@

    Params

    1 means no decay, 0 means only use marginal effects shape = (1 - decay_rate_towards_marginal) * shape + decay_rate_towards_marginal * marginal_shape The way this is implemented is by keeping track of how many times to multiply decay_rate_towards_marginal for each cyclic estimator +fit_posthoc_tree_coefs: str [None, "ridge"] +Whether to fit a linear model to the tree coefficients after fitting the cyclic boosting. boosting_strategy : str ["cyclic", "greedy"] Whether to use cyclic boosting (cycle over features) or greedy boosting (select best feature at each step) validation_frac: float diff --git a/docs/index.html b/docs/index.html index bbb8576a..bd4fb775 100644 --- a/docs/index.html +++ b/docs/index.html @@ -394,7 +394,7 @@

    Support for different tasks

    AutoML model AutoInterpretableClassifier️ - +AutoInterpretableRegressor️ diff --git a/docs/util/automl.html b/docs/util/automl.html index 9ac34fff..d462d000 100644 --- a/docs/util/automl.html +++ b/docs/util/automl.html @@ -28,33 +28,78 @@ TreeGAMClassifier, FIGSClassifier, HSTreeClassifier, + RuleFitRegressor, + TreeGAMRegressor, + FIGSRegressor, + HSTreeRegressor, ) -from sklearn.tree import DecisionTreeClassifier -from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.linear_model import LogisticRegression, ElasticNet, Ridge import imodels from sklearn.model_selection import GridSearchCV, train_test_split import numpy as np from sklearn.pipeline import Pipeline +# PARAM_GRID_LINEAR = [ +# { +# "est": [] +# } +# ] -class AutoInterpretableClassifier(BaseEstimator, ClassifierMixin): + +class AutoInterpretableModel(BaseEstimator): """Automatically fit and select a classifier that is interpretable. Note that all preprocessing should be done beforehand. This is basically a wrapper around GridSearchCV, with some preselected models. """ - PARAM_GRID_DEFAULT = [ + def __init__(self, param_grid=None): + if param_grid is None: + if isinstance(self, ClassifierMixin): + self.param_grid = self.PARAM_GRID_DEFAULT_CLASSIFICATION + elif isinstance(self, RegressorMixin): + self.param_grid = self.PARAM_GRID_DEFAULT_REGRESSION + else: + self.param_grid = param_grid + + def fit(self, X, y, cv=5): + self.pipe_ = Pipeline([("est", BaseEstimator())] + ) # Placeholder Estimator + self.est_ = GridSearchCV( + self.pipe_, self.param_grid, scoring="roc_auc", cv=cv) + self.est_.fit(X, y) + return self + + def predict(self, X): + return self.est_.predict(X) + + def predict_proba(self, X): + return self.est_.predict_proba(X) + + def score(self, X, y): + return self.est_.score(X, y) + + PARAM_GRID_LINEAR_CLASSIFICATION = [ { - "est": [DecisionTreeClassifier()], - "est__max_leaf_nodes": [2, 5, 10], + "est": [ + LogisticRegression( + solver="saga", penalty="elasticnet", max_iter=100) + ], + "est__C": [0.1, 1, 10], + "est__l1_ratio": [0.5, 1], }, { "est": [ - LogisticRegression(solver="saga", penalty="elasticnet", max_iter=100) + Ridge(max_iter=100) ], - "est__C": [0.1, 1, 10], - "est__l1_ratio": [0, 0.5, 1], - # "est__penalty": ["l1", "l2", "elasticnet"], + "est__alpha": [0, 0.1, 1, 10], + }, + ] + + PARAM_GRID_DEFAULT_CLASSIFICATION = [ + { + "est": [DecisionTreeClassifier()], + "est__max_leaf_nodes": [2, 5, 10], }, { "est": [RuleFitClassifier()], @@ -73,28 +118,50 @@ "est": [FIGSClassifier()], "est__max_rules": [5, 10], }, + ] + PARAM_GRID_LINEAR_CLASSIFICATION + + PARAM_GRID_LINEAR_REGRESSION = [ + { + "est": [ + ElasticNet(max_iter=100) + ], + "est__alpha": [0.1, 1, 10], + "est__l1_ratio": [0, 0.5, 1], + }, ] - def __init__(self, param_grid=None): - if param_grid is None: - self.param_grid_ = self.PARAM_GRID_DEFAULT - else: - self.param_grid_ = param_grid + PARAM_GRID_DEFAULT_REGRESSION = [ + { + "est": [DecisionTreeRegressor()], + "est__max_leaf_nodes": [2, 5, 10], + }, + { + "est": [HSTreeRegressor()], + "est__max_leaf_nodes": [5, 10], + }, - def fit(self, X, y, cv=5): - self.pipe_ = Pipeline([("est", BaseEstimator())]) # Placeholder Estimator - self.est_ = GridSearchCV(self.pipe_, self.param_grid_, scoring="roc_auc", cv=cv) - self.est_.fit(X, y) - return self + { + "est": [RuleFitRegressor()], + "est__max_rules": [10, 100], + "est__n_estimators": [20], + }, + { + "est": [TreeGAMRegressor()], + "est__n_boosting_rounds": [10, 
100], + }, + { + "est": [FIGSRegressor()], + "est__max_rules": [5, 10], + }, + ] + PARAM_GRID_LINEAR_REGRESSION - def predict(self, X): - return self.est_.predict(X) - def predict_proba(self, X): - return self.est_.predict_proba(X) +class AutoInterpretableClassifier(AutoInterpretableModel, ClassifierMixin): + ... - def score(self, X, y): - return self.est_.score(X, y) + +class AutoInterpretableRegressor(AutoInterpretableModel, RegressorMixin): + ... if __name__ == "__main__": @@ -106,6 +173,7 @@ ) m = AutoInterpretableClassifier() + # m = AutoInterpretableRegressor() m.fit(X_train, y_train) print("best params", m.est_.best_params_) @@ -135,24 +203,90 @@

    Classes

    Expand source code -
    class AutoInterpretableClassifier(BaseEstimator, ClassifierMixin):
    +
    class AutoInterpretableClassifier(AutoInterpretableModel, ClassifierMixin):
    +    ...
    + +

    Ancestors

    +
      +
    • AutoInterpretableModel
    • +
    • sklearn.base.BaseEstimator
    • +
    • sklearn.utils._metadata_requests._MetadataRequester
    • +
    • sklearn.base.ClassifierMixin
    • +
    +

    Inherited members

    + +
    +
    +class AutoInterpretableModel +(param_grid=None) +
    +
    +

    Automatically fit and select a classifier that is interpretable. +Note that all preprocessing should be done beforehand. +This is basically a wrapper around GridSearchCV, with some preselected models.

    +
    + +Expand source code + +
    class AutoInterpretableModel(BaseEstimator):
         """Automatically fit and select a classifier that is interpretable.
         Note that all preprocessing should be done beforehand.
         This is basically a wrapper around GridSearchCV, with some preselected models.
         """
     
    -    PARAM_GRID_DEFAULT = [
    +    def __init__(self, param_grid=None):
    +        if param_grid is None:
    +            if isinstance(self, ClassifierMixin):
    +                self.param_grid = self.PARAM_GRID_DEFAULT_CLASSIFICATION
    +            elif isinstance(self, RegressorMixin):
    +                self.param_grid = self.PARAM_GRID_DEFAULT_REGRESSION
    +        else:
    +            self.param_grid = param_grid
    +
    +    def fit(self, X, y, cv=5):
    +        self.pipe_ = Pipeline([("est", BaseEstimator())]
    +                              )  # Placeholder Estimator
    +        self.est_ = GridSearchCV(
    +            self.pipe_, self.param_grid, scoring="roc_auc", cv=cv)
    +        self.est_.fit(X, y)
    +        return self
    +
    +    def predict(self, X):
    +        return self.est_.predict(X)
    +
    +    def predict_proba(self, X):
    +        return self.est_.predict_proba(X)
    +
    +    def score(self, X, y):
    +        return self.est_.score(X, y)
    +
    +    PARAM_GRID_LINEAR_CLASSIFICATION = [
             {
    -            "est": [DecisionTreeClassifier()],
    -            "est__max_leaf_nodes": [2, 5, 10],
    +            "est": [
    +                LogisticRegression(
    +                    solver="saga", penalty="elasticnet", max_iter=100)
    +            ],
    +            "est__C": [0.1, 1, 10],
    +            "est__l1_ratio": [0.5, 1],
             },
             {
                 "est": [
    -                LogisticRegression(solver="saga", penalty="elasticnet", max_iter=100)
    +                Ridge(max_iter=100)
                 ],
    -            "est__C": [0.1, 1, 10],
    -            "est__l1_ratio": [0, 0.5, 1],
    -            # "est__penalty": ["l1", "l2", "elasticnet"],
    +            "est__alpha": [0, 0.1, 1, 10],
    +        },
    +    ]
    +
    +    PARAM_GRID_DEFAULT_CLASSIFICATION = [
    +        {
    +            "est": [DecisionTreeClassifier()],
    +            "est__max_leaf_nodes": [2, 5, 10],
             },
             {
                 "est": [RuleFitClassifier()],
    @@ -171,45 +305,75 @@ 

    Classes

    "est": [FIGSClassifier()], "est__max_rules": [5, 10], }, - ] - - def __init__(self, param_grid=None): - if param_grid is None: - self.param_grid_ = self.PARAM_GRID_DEFAULT - else: - self.param_grid_ = param_grid - - def fit(self, X, y, cv=5): - self.pipe_ = Pipeline([("est", BaseEstimator())]) # Placeholder Estimator - self.est_ = GridSearchCV(self.pipe_, self.param_grid_, scoring="roc_auc", cv=cv) - self.est_.fit(X, y) - return self + ] + PARAM_GRID_LINEAR_CLASSIFICATION - def predict(self, X): - return self.est_.predict(X) + PARAM_GRID_LINEAR_REGRESSION = [ + { + "est": [ + ElasticNet(max_iter=100) + ], + "est__alpha": [0.1, 1, 10], + "est__l1_ratio": [0, 0.5, 1], + }, + ] - def predict_proba(self, X): - return self.est_.predict_proba(X) + PARAM_GRID_DEFAULT_REGRESSION = [ + { + "est": [DecisionTreeRegressor()], + "est__max_leaf_nodes": [2, 5, 10], + }, + { + "est": [HSTreeRegressor()], + "est__max_leaf_nodes": [5, 10], + }, - def score(self, X, y): - return self.est_.score(X, y)
    + { + "est": [RuleFitRegressor()], + "est__max_rules": [10, 100], + "est__n_estimators": [20], + }, + { + "est": [TreeGAMRegressor()], + "est__n_boosting_rounds": [10, 100], + }, + { + "est": [FIGSRegressor()], + "est__max_rules": [5, 10], + }, + ] + PARAM_GRID_LINEAR_REGRESSION

    Ancestors

    • sklearn.base.BaseEstimator
    • sklearn.utils._metadata_requests._MetadataRequester
    • -
    • sklearn.base.ClassifierMixin
    • +
    +

    Subclasses

    +

    Class variables

    -
    var PARAM_GRID_DEFAULT
    +
    var PARAM_GRID_DEFAULT_CLASSIFICATION
    +
    +
    +
    +
    var PARAM_GRID_DEFAULT_REGRESSION
    +
    +
    +
    +
    var PARAM_GRID_LINEAR_CLASSIFICATION
    +
    +
    +
    +
    var PARAM_GRID_LINEAR_REGRESSION

    Methods

    -
    +
    def fit(self, X, y, cv=5)
    @@ -219,13 +383,15 @@

    Methods

    Expand source code
    def fit(self, X, y, cv=5):
    -    self.pipe_ = Pipeline([("est", BaseEstimator())])  # Placeholder Estimator
    -    self.est_ = GridSearchCV(self.pipe_, self.param_grid_, scoring="roc_auc", cv=cv)
    +    self.pipe_ = Pipeline([("est", BaseEstimator())]
    +                          )  # Placeholder Estimator
    +    self.est_ = GridSearchCV(
    +        self.pipe_, self.param_grid, scoring="roc_auc", cv=cv)
         self.est_.fit(X, y)
         return self
    -
    +
    def predict(self, X)
    @@ -238,7 +404,7 @@

    Methods

    return self.est_.predict(X)
    -
    +
    def predict_proba(self, X)
    @@ -251,28 +417,11 @@

    Methods

    return self.est_.predict_proba(X)
    -
    +
    def score(self, X, y)
    -

    Return the mean accuracy on the given test data and labels.

    -

    In multi-label classification, this is the subset accuracy -which is a harsh metric since you require for each sample that -each label set be correctly predicted.

    -

    Parameters

    -
    -
    X : array-like of shape (n_samples, n_features)
    -
    Test samples.
    -
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
    -
    True labels for X.
    -
    sample_weight : array-like of shape (n_samples,), default=None
    -
    Sample weights.
    -
    -

    Returns

    -
    -
    score : float
    -
    Mean accuracy of self.predict(X) w.r.t. y.
    -
    +
    Expand source code @@ -281,8 +430,8 @@

    Returns

    return self.est_.score(X, y)
    -
    -def set_fit_request(self: AutoInterpretableClassifier, *, cv: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> AutoInterpretableClassifier +
    +def set_fit_request(self: AutoInterpretableModel, *, cv: Union[bool, ForwardRef(None), str] = '$UNCHANGED$') ‑> AutoInterpretableModel

    Request metadata passed to the fit method.

    @@ -364,6 +513,37 @@

    Returns

    +
    +class AutoInterpretableRegressor +(param_grid=None) +
    +
    +

    Automatically fit and select a classifier that is interpretable. +Note that all preprocessing should be done beforehand. +This is basically a wrapper around GridSearchCV, with some preselected models.

    +
    + +Expand source code + +
    class AutoInterpretableRegressor(AutoInterpretableModel, RegressorMixin):
    +    ...
    +
    +

    Ancestors

    +
      +
    • AutoInterpretableModel
    • +
    • sklearn.base.BaseEstimator
    • +
    • sklearn.utils._metadata_requests._MetadataRequester
    • +
    • sklearn.base.RegressorMixin
    • +
    +

    Inherited members

    + +
    @@ -382,15 +562,24 @@

    Index 🔍

    diff --git a/imodels/util/automl.py b/imodels/util/automl.py index c790df7c..dcba66ad 100644 --- a/imodels/util/automl.py +++ b/imodels/util/automl.py @@ -5,34 +5,78 @@ TreeGAMClassifier, FIGSClassifier, HSTreeClassifier, + RuleFitRegressor, + TreeGAMRegressor, + FIGSRegressor, + HSTreeRegressor, ) -from sklearn.tree import DecisionTreeClassifier -from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.linear_model import LogisticRegression, ElasticNet, Ridge import imodels from sklearn.model_selection import GridSearchCV, train_test_split import numpy as np from sklearn.pipeline import Pipeline +# PARAM_GRID_LINEAR = [ +# { +# "est": [] +# } +# ] -class AutoInterpretableClassifier(BaseEstimator, ClassifierMixin): + +class AutoInterpretableModel(BaseEstimator): """Automatically fit and select a classifier that is interpretable. Note that all preprocessing should be done beforehand. This is basically a wrapper around GridSearchCV, with some preselected models. """ - PARAM_GRID_DEFAULT = [ - { - "est": [DecisionTreeClassifier()], - "est__max_leaf_nodes": [2, 5, 10], - }, + def __init__(self, param_grid=None): + if param_grid is None: + if isinstance(self, ClassifierMixin): + self.param_grid = self.PARAM_GRID_DEFAULT_CLASSIFICATION + elif isinstance(self, RegressorMixin): + self.param_grid = self.PARAM_GRID_DEFAULT_REGRESSION + else: + self.param_grid = param_grid + + def fit(self, X, y, cv=5): + self.pipe_ = Pipeline([("est", BaseEstimator())] + ) # Placeholder Estimator + self.est_ = GridSearchCV( + self.pipe_, self.param_grid, scoring="roc_auc", cv=cv) + self.est_.fit(X, y) + return self + + def predict(self, X): + return self.est_.predict(X) + + def predict_proba(self, X): + return self.est_.predict_proba(X) + + def score(self, X, y): + return self.est_.score(X, y) + + PARAM_GRID_LINEAR_CLASSIFICATION = [ { "est": [ LogisticRegression( solver="saga", penalty="elasticnet", max_iter=100) ], "est__C": [0.1, 1, 10], - "est__l1_ratio": [0, 0.5, 1], - # "est__penalty": ["l1", "l2", "elasticnet"], + "est__l1_ratio": [0.5, 1], + }, + { + "est": [ + Ridge(max_iter=100) + ], + "est__alpha": [0, 0.1, 1, 10], + }, + ] + + PARAM_GRID_DEFAULT_CLASSIFICATION = [ + { + "est": [DecisionTreeClassifier()], + "est__max_leaf_nodes": [2, 5, 10], }, { "est": [RuleFitClassifier()], @@ -51,30 +95,50 @@ class AutoInterpretableClassifier(BaseEstimator, ClassifierMixin): "est": [FIGSClassifier()], "est__max_rules": [5, 10], }, + ] + PARAM_GRID_LINEAR_CLASSIFICATION + + PARAM_GRID_LINEAR_REGRESSION = [ + { + "est": [ + ElasticNet(max_iter=100) + ], + "est__alpha": [0.1, 1, 10], + "est__l1_ratio": [0, 0.5, 1], + }, ] - def __init__(self, param_grid=None): - if param_grid is None: - self.param_grid = self.PARAM_GRID_DEFAULT - else: - self.param_grid = param_grid + PARAM_GRID_DEFAULT_REGRESSION = [ + { + "est": [DecisionTreeRegressor()], + "est__max_leaf_nodes": [2, 5, 10], + }, + { + "est": [HSTreeRegressor()], + "est__max_leaf_nodes": [5, 10], + }, - def fit(self, X, y, cv=5): - self.pipe_ = Pipeline([("est", BaseEstimator())] - ) # Placeholder Estimator - self.est_ = GridSearchCV( - self.pipe_, self.param_grid, scoring="roc_auc", cv=cv) - self.est_.fit(X, y) - return self + { + "est": [RuleFitRegressor()], + "est__max_rules": [10, 100], + "est__n_estimators": [20], + }, + { + "est": [TreeGAMRegressor()], + "est__n_boosting_rounds": [10, 100], + }, + { + "est": [FIGSRegressor()], + "est__max_rules": [5, 10], + }, + ] + 
PARAM_GRID_LINEAR_REGRESSION - def predict(self, X): - return self.est_.predict(X) - def predict_proba(self, X): - return self.est_.predict_proba(X) +class AutoInterpretableClassifier(AutoInterpretableModel, ClassifierMixin): + ... - def score(self, X, y): - return self.est_.score(X, y) + +class AutoInterpretableRegressor(AutoInterpretableModel, RegressorMixin): + ... if __name__ == "__main__": @@ -86,6 +150,7 @@ def score(self, X, y): ) m = AutoInterpretableClassifier() + # m = AutoInterpretableRegressor() m.fit(X_train, y_train) print("best params", m.est_.best_params_) diff --git a/readme.md b/readme.md index 4551a95c..864bb247 100644 --- a/readme.md +++ b/readme.md @@ -180,7 +180,7 @@ Different models support different machine-learning tasks. Current support for d | Greedy tree sums (FIGS) | [FIGSClassifier](https://csinva.io/imodels/tree/figs.html#imodels.tree.figs.FIGSClassifier) | [FIGSRegressor](https://csinva.io/imodels/tree/figs.html#imodels.tree.figs.FIGSRegressor) | | | Hierarchical shrinkage | [HSTreeClassifierCV](https://csinva.io/imodels/tree/hierarchical_shrinkage.html#imodels.tree.hierarchical_shrinkage.HSTreeClassifierCV) | [HSTreeRegressorCV](https://csinva.io/imodels/tree/hierarchical_shrinkage.html#imodels.tree.hierarchical_shrinkage.HSTreeRegressorCV) | Wraps any sklearn tree-based model | | Distillation | | [DistilledRegressor](https://csinva.io/imodels/util/distillation.html#imodels.util.distillation.DistilledRegressor) | Wraps any sklearn-compatible models | -| AutoML model | [AutoInterpretableClassifier️](https://csinva.io/imodels/util/automl.html) | | | +| AutoML model | [AutoInterpretableClassifier️](https://csinva.io/imodels/util/automl.html) | [AutoInterpretableRegressor️](https://csinva.io/imodels/util/automl.html) | | ### Extras diff --git a/setup.py b/setup.py index b2f13255..7c3f11f9 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setuptools.setup( name="imodels", - version="1.4.0", + version="1.4.1", author="Chandan Singh, Keyan Nasseri, Matthew Epland, Yan Shuo Tan, Omer Ronen, Tiffany Tang, Abhineet Agarwal, Theo Saarinen, Bin Yu, and others", author_email="chandan_singh@berkeley.edu", description="Implementations of various interpretable models",