@@ -678,7 +804,7 @@ Methods
Expand source code
def predict_proba(self, X, *args, **kwargs):
- if hasattr(self.estimator_, 'predict_proba'):
+ if hasattr(self.estimator_, "predict_proba"):
return self.estimator_.predict_proba(X, *args, **kwargs)
else:
return NotImplemented
@@ -694,7 +820,7 @@ Methods
Expand source code
def score(self, X, y, *args, **kwargs):
- if hasattr(self.estimator_, 'score'):
+ if hasattr(self.estimator_, "score"):
return self.estimator_.score(X, y, *args, **kwargs)
else:
return NotImplemented
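Both hunks above are quote-style reformats only; the behavior is unchanged: delegate to the wrapped estimator when it exposes the method, otherwise return NotImplemented. A minimal standalone sketch of that delegation pattern (the Wrapped class is hypothetical, not part of imodels):

class Wrapped:
    def __init__(self, estimator_):
        self.estimator_ = estimator_

    def score(self, X, y, *args, **kwargs):
        # Forward only if the wrapped estimator actually implements score;
        # returning NotImplemented signals the capability is absent.
        if hasattr(self.estimator_, "score"):
            return self.estimator_.score(X, y, *args, **kwargs)
        return NotImplemented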
@@ -729,12 +855,17 @@ Params
Expand source code
class HSTreeClassifier(HSTree, ClassifierMixin):
- def __init__(self, estimator_: BaseEstimator = DecisionTreeClassifier(max_leaf_nodes=20),
- reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
- super().__init__(estimator_=estimator_,
- reg_param=reg_param,
- shrinkage_scheme_=shrinkage_scheme_,
- )
+ def __init__(
+ self,
+ estimator_: BaseEstimator = DecisionTreeClassifier(max_leaf_nodes=20),
+ reg_param: float = 1,
+ shrinkage_scheme_: str = "node_based",
+ ):
+ super().__init__(
+ estimator_=estimator_,
+ reg_param=reg_param,
+ shrinkage_scheme_=shrinkage_scheme_,
+ )
Ancestors
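The reformatted constructor keeps the same defaults, so usage is unchanged; a minimal sketch grounded in the signature above (X_train and y_train are assumed to exist):

from sklearn.tree import DecisionTreeClassifier
from imodels import HSTreeClassifier

# Wrap a depth-limited CART tree; reg_param sets the shrinkage strength.
m = HSTreeClassifier(
    estimator_=DecisionTreeClassifier(max_leaf_nodes=20),
    reg_param=1,
    shrinkage_scheme_="node_based",
)
m.fit(X_train, y_train)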
@@ -767,11 +898,17 @@ Params
Expand source code
class HSTreeClassifierCV(HSTreeClassifier):
- def __init__(self, estimator_: BaseEstimator = None,
- reg_param_list: List[float] = [0, 0.1, 1, 10, 50, 100, 500],
- shrinkage_scheme_: str = 'node_based',
- max_leaf_nodes: int = 20,
- cv: int = 3, scoring=None, *args, **kwargs):
+ def __init__(
+ self,
+ estimator_: BaseEstimator = None,
+ reg_param_list: List[float] = [0, 0.1, 1, 10, 50, 100, 500],
+ shrinkage_scheme_: str = "node_based",
+ max_leaf_nodes: int = 20,
+ cv: int = 3,
+ scoring=None,
+ *args,
+ **kwargs
+ ):
"""Cross-validation is used to select the best regularization parameter for hierarchical shrinkage.
Params
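A quick usage sketch of the CV variant, following the signature above (the name of the attribute holding the selected parameter is an assumption, not shown in this hunk):

from imodels import HSTreeClassifierCV

# Cross-validate over candidate shrinkage strengths, then refit on all data.
m = HSTreeClassifierCV(reg_param_list=[0.1, 1, 10, 100], cv=3)
m.fit(X_train, y_train)
print(m.reg_param)  # assumed attribute for the selected regularization value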
@@ -801,7 +938,7 @@ Params
def fit(self, X, y, *args, **kwargs):
self.scores_ = [[] for _ in self.reg_param_list]
- scorer = kwargs.get('scoring', log_loss)
+ scorer = kwargs.get("scoring", log_loss)
kf = KFold(n_splits=self.cv)
for train_index, test_index in kf.split(X):
X_out, y_out = X[test_index, :], y[test_index]
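Note that the scorer is read from the fit kwargs, with log_loss as the fallback, rather than from the constructor's scoring argument. If the kwarg is honored end-to-end (an assumption; only the lookup is visible in this hunk), an explicit override looks like:

from sklearn.metrics import log_loss
from imodels import HSTreeClassifierCV

m = HSTreeClassifierCV(cv=5)
# Any replacement metric must match log_loss's (y_true, y_pred) call
# convention and its lower-is-better selection rule.
m.fit(X_train, y_train, scoring=log_loss)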
@@ -818,8 +955,13 @@ Params
super().fit(X=X, y=y, *args, **kwargs)
def __repr__(self):
- attr_list = ["estimator_", "reg_param_list", "shrinkage_scheme_",
- "cv", "scoring"]
+ attr_list = [
+ "estimator_",
+ "reg_param_list",
+ "shrinkage_scheme_",
+ "cv",
+ "scoring",
+ ]
s = self.__class__.__name__
s += "("
for attr in attr_list:
@@ -846,7 +988,7 @@ Methods
def fit(self, X, y, *args, **kwargs):
self.scores_ = [[] for _ in self.reg_param_list]
- scorer = kwargs.get('scoring', log_loss)
+ scorer = kwargs.get("scoring", log_loss)
kf = KFold(n_splits=self.cv)
for train_index, test_index in kf.split(X):
X_out, y_out = X[test_index, :], y[test_index]
@@ -892,12 +1034,17 @@ Params
Expand source code
class HSTreeRegressor(HSTree, RegressorMixin):
- def __init__(self, estimator_: BaseEstimator = DecisionTreeRegressor(max_leaf_nodes=20),
- reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
- super().__init__(estimator_=estimator_,
- reg_param=reg_param,
- shrinkage_scheme_=shrinkage_scheme_,
- )
+ def __init__(
+ self,
+ estimator_: BaseEstimator = DecisionTreeRegressor(max_leaf_nodes=20),
+ reg_param: float = 1,
+ shrinkage_scheme_: str = "node_based",
+ ):
+ super().__init__(
+ estimator_=estimator_,
+ reg_param=reg_param,
+ shrinkage_scheme_=shrinkage_scheme_,
+ )
Ancestors
@@ -930,11 +1077,17 @@ Params
Expand source code
class HSTreeRegressorCV(HSTreeRegressor):
- def __init__(self, estimator_: BaseEstimator = None,
- reg_param_list: List[float] = [0, 0.1, 1, 10, 50, 100, 500],
- shrinkage_scheme_: str = 'node_based',
- max_leaf_nodes: int = 20,
- cv: int = 3, scoring=None, *args, **kwargs):
+ def __init__(
+ self,
+ estimator_: BaseEstimator = None,
+ reg_param_list: List[float] = [0, 0.1, 1, 10, 50, 100, 500],
+ shrinkage_scheme_: str = "node_based",
+ max_leaf_nodes: int = 20,
+ cv: int = 3,
+ scoring=None,
+ *args,
+ **kwargs
+ ):
"""Cross-validation is used to select the best regularization parameter for hierarchical shrinkage.
Params
@@ -965,7 +1118,7 @@ Params
def fit(self, X, y, *args, **kwargs):
self.scores_ = [[] for _ in self.reg_param_list]
kf = KFold(n_splits=self.cv)
- scorer = kwargs.get('scoring', mean_squared_error)
+ scorer = kwargs.get("scoring", mean_squared_error)
for train_index, test_index in kf.split(X):
X_out, y_out = X[test_index, :], y[test_index]
X_in, y_in = X[train_index, :], y[train_index]
@@ -981,8 +1134,13 @@ Params
super().fit(X=X, y=y, *args, **kwargs)
def __repr__(self):
- attr_list = ["estimator_", "reg_param_list", "shrinkage_scheme_",
- "cv", "scoring"]
+ attr_list = [
+ "estimator_",
+ "reg_param_list",
+ "shrinkage_scheme_",
+ "cv",
+ "scoring",
+ ]
s = self.__class__.__name__
s += "("
for attr in attr_list:
@@ -1010,7 +1168,7 @@ Methods
def fit(self, X, y, *args, **kwargs):
self.scores_ = [[] for _ in self.reg_param_list]
kf = KFold(n_splits=self.cv)
- scorer = kwargs.get('scoring', mean_squared_error)
+ scorer = kwargs.get("scoring", mean_squared_error)
for train_index, test_index in kf.split(X):
X_out, y_out = X[test_index, :], y[test_index]
X_in, y_in = X[train_index, :], y[train_index]
diff --git a/docs/util/data_util.html b/docs/util/data_util.html
index 032d277e..68e2853b 100644
--- a/docs/util/data_util.html
+++ b/docs/util/data_util.html
@@ -147,10 +147,12 @@
elif dataset_name == 'california_housing':
data = sklearn.datasets.fetch_california_housing(
data_home=oj(data_path, 'sklearn_data'))
+ elif dataset_name == 'breast_cancer':
+ data = sklearn.datasets.load_breast_cancer()
return data['data'], data['target'], _clean_feat_names(data['feature_names'])
elif data_source == 'openml':  # note: this API may change in newer sklearn - pass the dataset id, not its name
data = sklearn.datasets.fetch_openml(
- data_id=dataset_name, data_home=oj(data_path, 'openml_data'))
+ data_id=dataset_name, data_home=oj(data_path, 'openml_data'), parser='auto')
X, y, feature_names = data['data'], data['target'], _clean_feat_names(
data['feature_names'])
if isinstance(X, pd.DataFrame):
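The parser='auto' addition tracks sklearn >= 1.2, where fetch_openml emits a FutureWarning when no parser is specified; a standalone sketch of the call (data_id=31 is an arbitrary illustrative OpenML id):

import sklearn.datasets

# parser="auto" avoids the warning about the default parser changing.
data = sklearn.datasets.fetch_openml(data_id=31, parser="auto")
X, y = data["data"], data["target"]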
@@ -372,10 +374,12 @@ Example
elif dataset_name == 'california_housing':
data = sklearn.datasets.fetch_california_housing(
data_home=oj(data_path, 'sklearn_data'))
+ elif dataset_name == 'breast_cancer':
+ data = sklearn.datasets.load_breast_cancer()
return data['data'], data['target'], _clean_feat_names(data['feature_names'])
elif data_source == 'openml':  # note: this API may change in newer sklearn - pass the dataset id, not its name
data = sklearn.datasets.fetch_openml(
- data_id=dataset_name, data_home=oj(data_path, 'openml_data'))
+ data_id=dataset_name, data_home=oj(data_path, 'openml_data'), parser='auto')
X, y, feature_names = data['data'], data['target'], _clean_feat_names(
data['feature_names'])
if isinstance(X, pd.DataFrame):
diff --git a/imodels/__init__.py b/imodels/__init__.py
index 68ec5e30..c1306908 100644
--- a/imodels/__init__.py
+++ b/imodels/__init__.py
@@ -5,7 +5,7 @@
# Github repo available [here](https://github.com/csinva/imodels)
from .algebraic.slim import SLIMRegressor, SLIMClassifier
-from .algebraic.gam import TreeGAMClassifier
+from .algebraic.tree_gam import TreeGAMClassifier, TreeGAMRegressor
from .discretization.discretizer import RFDiscretizer, BasicDiscretizer
from .discretization.mdlp import MDLPDiscretizer, BRLDiscretizer
from .experimental.bartpy import BART
@@ -23,26 +23,62 @@
from .rule_set.skope_rules import SkopeRulesClassifier
from .rule_set.slipper import SlipperClassifier
from .tree.c45_tree.c45_tree import C45TreeClassifier
-from .tree.cart_ccp import DecisionTreeCCPClassifier, DecisionTreeCCPRegressor, HSDecisionTreeCCPClassifierCV, \
- HSDecisionTreeCCPRegressorCV
+from .tree.cart_ccp import (
+ DecisionTreeCCPClassifier,
+ DecisionTreeCCPRegressor,
+ HSDecisionTreeCCPClassifierCV,
+ HSDecisionTreeCCPRegressorCV,
+)
+
# from .tree.iterative_random_forest.iterative_random_forest import IRFClassifier
# from .tree.optimal_classification_tree import OptimalTreeModel
from .tree.cart_wrapper import GreedyTreeClassifier, GreedyTreeRegressor
from .tree.figs import FIGSRegressor, FIGSClassifier, FIGSRegressorCV, FIGSClassifierCV
from .tree.gosdt.pygosdt import OptimalTreeClassifier
-from .tree.gosdt.pygosdt_shrinkage import HSOptimalTreeClassifier, HSOptimalTreeClassifierCV
-from .tree.hierarchical_shrinkage import HSTreeRegressor, HSTreeClassifier, HSTreeRegressorCV, HSTreeClassifierCV
+from .tree.gosdt.pygosdt_shrinkage import (
+ HSOptimalTreeClassifier,
+ HSOptimalTreeClassifierCV,
+)
+from .tree.hierarchical_shrinkage import (
+ HSTreeRegressor,
+ HSTreeClassifier,
+ HSTreeRegressorCV,
+ HSTreeClassifierCV,
+)
from .tree.tao import TaoTreeClassifier, TaoTreeRegressor
from .util.data_util import get_clean_dataset
from .util.distillation import DistilledRegressor
from .util.explain_errors import explain_classification_errors
-CLASSIFIERS = [BayesianRuleListClassifier, GreedyRuleListClassifier, SkopeRulesClassifier,
- BoostedRulesClassifier, SLIMClassifier, SlipperClassifier, BayesianRuleSetClassifier,
- C45TreeClassifier, OptimalTreeClassifier, OptimalRuleListClassifier, OneRClassifier,
- SlipperClassifier, RuleFitClassifier, TaoTreeClassifier,
- FIGSClassifier, HSTreeClassifier, HSTreeClassifierCV] # , IRFClassifier
-REGRESSORS = [RuleFitRegressor, SLIMRegressor, GreedyTreeClassifier, FIGSRegressor,
- TaoTreeRegressor, HSTreeRegressor, HSTreeRegressorCV, BART]
+CLASSIFIERS = [
+ BayesianRuleListClassifier,
+ GreedyRuleListClassifier,
+ SkopeRulesClassifier,
+ BoostedRulesClassifier,
+ SLIMClassifier,
+ SlipperClassifier,
+ BayesianRuleSetClassifier,
+ C45TreeClassifier,
+ OptimalTreeClassifier,
+ OptimalRuleListClassifier,
+ OneRClassifier,
+ RuleFitClassifier,
+ TaoTreeClassifier,
+ TreeGAMClassifier,
+ FIGSClassifier,
+ HSTreeClassifier,
+ HSTreeClassifierCV,
+] # , IRFClassifier
+REGRESSORS = [
+ RuleFitRegressor,
+ SLIMRegressor,
+    GreedyTreeRegressor,
+ FIGSRegressor,
+ TaoTreeRegressor,
+ TreeGAMRegressor,
+ HSTreeRegressor,
+ HSTreeRegressorCV,
+ BART,
+]
ESTIMATORS = CLASSIFIERS + REGRESSORS
DISCRETIZERS = [RFDiscretizer, BasicDiscretizer, MDLPDiscretizer, BRLDiscretizer]
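Since CLASSIFIERS and REGRESSORS are plain lists of classes, registry-style iteration works directly; a minimal sketch (instantiation is left out, since some entries require optional dependencies):

import imodels

for cls in imodels.CLASSIFIERS:
    print(cls.__name__)  # TreeGAMClassifier now appears in this registry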
diff --git a/imodels/algebraic/gam.py b/imodels/algebraic/tree_gam.py
similarity index 93%
rename from imodels/algebraic/gam.py
rename to imodels/algebraic/tree_gam.py
index 474918f2..3b3acc80 100644
--- a/imodels/algebraic/gam.py
+++ b/imodels/algebraic/tree_gam.py
@@ -16,8 +16,10 @@
import imodels
+from sklearn.base import RegressorMixin, ClassifierMixin
-class TreeGAMClassifier(BaseEstimator):
+
+class TreeGAM(BaseEstimator):
"""Tree-based GAM classifier.
Uses cyclical boosting to fit a GAM with small trees.
Simplified version of the explainable boosting machine described in https://github.com/interpretml/interpret
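For intuition about the cyclical boosting the docstring mentions: each round fits one tiny tree per feature on the current residual, restricted to that single feature, so every tree is an additive shape-function increment. A toy sketch of the idea, not the library's implementation (parameter names mirror the class):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def cyclic_boost(X, y, n_boosting_rounds=100, max_leaf_nodes=3, learning_rate=0.01):
    pred = np.full(len(y), np.mean(y))  # start from the bias term
    trees = []
    for _ in range(n_boosting_rounds):
        for j in range(X.shape[1]):  # cycle over features
            t = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes)
            t.fit(X[:, [j]], y - pred)  # one-feature tree on the residual
            pred = pred + learning_rate * t.predict(X[:, [j]])
            trees.append((j, t))
    return trees, pred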
@@ -30,6 +32,7 @@ def __init__(
n_boosting_rounds=100,
max_leaf_nodes=3,
reg_param=0.0,
+ learning_rate: float = 0.01,
n_boosting_rounds_marginal=0,
max_leaf_nodes_marginal=2,
reg_param_marginal=0.0,
@@ -45,6 +48,8 @@ def __init__(
Maximum number of leaf nodes for the trees in the cyclic boosting.
reg_param : float
Regularization parameter for the cyclic boosting.
+ learning_rate: float
+ Learning rate for the cyclic boosting.
n_boosting_rounds_marginal : int
Number of boosting rounds for the marginal boosting.
max_leaf_nodes_marginal : int
@@ -56,21 +61,24 @@ def __init__(
NNLS for non-negative least squares
ridge for ridge regression
None for no linear model
+
random_state : int
Random seed.
"""
self.n_boosting_rounds = n_boosting_rounds
self.max_leaf_nodes = max_leaf_nodes
self.reg_param = reg_param
+ self.learning_rate = learning_rate
self.max_leaf_nodes_marginal = max_leaf_nodes_marginal
self.reg_param_marginal = reg_param_marginal
self.n_boosting_rounds_marginal = n_boosting_rounds_marginal
self.fit_linear_marginal = fit_linear_marginal
self.random_state = random_state
- def fit(self, X, y, sample_weight=None, learning_rate=0.01, validation_frac=0.15):
+ def fit(self, X, y, sample_weight=None, validation_frac=0.15):
X, y = check_X_y(X, y, accept_sparse=False, multi_output=False)
- check_classification_targets(y)
+ if isinstance(self, ClassifierMixin):
+ check_classification_targets(y)
sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
# split into train and validation for early stopping
@@ -91,7 +99,6 @@ def fit(self, X, y, sample_weight=None, learning_rate=0.01, validation_frac=0.15
self.estimators_marginal = []
self.estimators_ = []
- self.learning_rate = learning_rate
self.bias_ = np.mean(y)
if self.n_boosting_rounds_marginal > 0:
@@ -208,7 +215,10 @@ def predict_proba(self, X):
return np.array([1 - probs1, probs1]).T
def predict(self, X):
- return np.argmax(self.predict_proba(X), axis=1)
+ if isinstance(self, RegressorMixin):
+ return self.predict_proba(X)[:, 1]
+ elif isinstance(self, ClassifierMixin):
+ return np.argmax(self.predict_proba(X), axis=1)
def get_shape_function_vals(self, X, max_evals=100):
"""Uses predict_proba to compute shape_function
@@ -236,6 +246,14 @@ def get_shape_function_vals(self, X, max_evals=100):
return feature_vals_list, shape_function_vals_list
+class TreeGAMRegressor(TreeGAM, RegressorMixin):
+ ...
+
+
+class TreeGAMClassifier(TreeGAM, ClassifierMixin):
+ ...
+
+
if __name__ == "__main__":
X, y, feature_names = imodels.get_clean_dataset("heart")
X, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
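With the marker subclasses in place, both variants share TreeGAM's machinery and differ only in the mixin; a usage sketch mirroring the __main__ block above:

from sklearn.model_selection import train_test_split
import imodels
from imodels import TreeGAMClassifier

X, y, feature_names = imodels.get_clean_dataset("heart")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

m = TreeGAMClassifier(n_boosting_rounds=10, learning_rate=0.01)
m.fit(X_train, y_train)
print(m.predict(X_test)[:5])  # class labels; TreeGAMRegressor returns scores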
diff --git a/readme.md b/readme.md
index 15cf95d4..06aa8806 100644
--- a/readme.md
+++ b/readme.md
@@ -83,7 +83,7 @@ Install with `pip install imodels` (see [here](https://github.com/csinva/imodels
| TAO rule tree | [🗂️](https://csinva.io/imodels/tree/tao.html), , ㅤㅤ[📄](https://proceedings.neurips.cc/paper/2018/hash/185c29dc24325934ee377cfda20e414c-Abstract.html) | Fits tree using alternating optimization |
| Iterative random forest | [🗂️](https://csinva.io/imodels/tree/iterative_random_forest/iterative_random_forest.html), [🔗](https://github.com/Yu-Group/iterative-Random-Forest), [📄](https://www.pnas.org/content/115/8/1943) | Repeatedly fit random forest, giving features with high importance a higher chance of being selected |
| Sparse integer linear model | [🗂️](https://csinva.io/imodels/algebraic/slim.html), , ㅤㅤ[📄](https://link.springer.com/article/10.1007/s10994-015-5528-6) | Sparse linear model with integer coefficients |
-| Tree GAM | [🗂️](https://csinva.io/imodels/algebraic/gam.html), [🔗](https://github.com/interpretml/interpret), [📄](https://dl.acm.org/doi/abs/10.1145/2339530.2339556) | Generalized additive model fit with short boosted trees |
+| Tree GAM | [🗂️](https://csinva.io/imodels/algebraic/tree_gam.html), [🔗](https://github.com/interpretml/interpret), [📄](https://dl.acm.org/doi/abs/10.1145/2339530.2339556) | Generalized additive model fit with short boosted trees |
| Greedy tree sums | [🗂️](https://csinva.io/imodels/tree/figs.html#imodels.tree.figs), , ㅤㅤ[📄](https://arxiv.org/abs/2201.11931) | Sum of small trees with very few total rules (FIGS) |
| Hierarchical shrinkage wrapper | [🗂️](https://csinva.io/imodels/tree/hierarchical_shrinkage.html), , ㅤㅤ[📄](https://arxiv.org/abs/2202.00858) | Improve a decision tree, random forest, or gradient-boosting ensemble with ultra-fast, post-hoc regularization |
| Distillation wrapper | [🗂️](https://csinva.io/imodels/util/distillation.html) | Train a black-box model, then distill it into an interpretable model |
@@ -175,6 +175,7 @@ Different models support different machine-learning tasks. Current support for d
| TAO rule tree | [TaoTreeClassifier](https://csinva.io/imodels/tree/tao.html#imodels.tree.tao.TaoTreeClassifier) | [TaoTreeRegressor](https://csinva.io/imodels/tree/tao.html#imodels.tree.tao.TaoTreeRegressor) | |
| Iterative random forest | [IRFClassifier](https://csinva.io/imodels/tree/iterative_random_forest/iterative_random_forest.html#imodels.tree.iterative_random_forest.iterative_random_forest.IRFClassifier) | | Requires [irf](https://pypi.org/project/irf/) |
| Sparse integer linear model | [SLIMClassifier](https://csinva.io/imodels/algebraic/slim.html#imodels.algebraic.slim.SLIMClassifier) | [SLIMRegressor](https://csinva.io/imodels/algebraic/slim.html#imodels.algebraic.slim.SLIMRegressor) | Requires extra dependencies for speed |
+| Tree GAM | [TreeGAMClassifier](https://csinva.io/imodels/algebraic/tree_gam.html) | [TreeGAMRegressor](https://csinva.io/imodels/algebraic/tree_gam.html) | |
| Greedy tree sums (FIGS) | [FIGSClassifier](https://csinva.io/imodels/tree/figs.html#imodels.tree.figs.FIGSClassifier) | [FIGSRegressor](https://csinva.io/imodels/tree/figs.html#imodels.tree.figs.FIGSRegressor) | |
| Hierarchical shrinkage | [HSTreeClassifierCV](https://csinva.io/imodels/tree/hierarchical_shrinkage.html#imodels.tree.hierarchical_shrinkage.HSTreeClassifierCV) | [HSTreeRegressorCV](https://csinva.io/imodels/tree/hierarchical_shrinkage.html#imodels.tree.hierarchical_shrinkage.HSTreeRegressorCV) | Wraps any sklearn tree-based model |
| Distillation | | [DistilledRegressor](https://csinva.io/imodels/util/distillation.html#imodels.util.distillation.DistilledRegressor) | Wraps any sklearn-compatible models |
diff --git a/setup.py b/setup.py
index 6c450a26..b2f13255 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@
setuptools.setup(
name="imodels",
- version="1.3.18",
+ version="1.4.0",
author="Chandan Singh, Keyan Nasseri, Matthew Epland, Yan Shuo Tan, Omer Ronen, Tiffany Tang, Abhineet Agarwal, Theo Saarinen, Bin Yu, and others",
author_email="chandan_singh@berkeley.edu",
description="Implementations of various interpretable models",
diff --git a/tests/regression_test.py b/tests/regression_test.py
index 4d1210fb..908015bc 100644
--- a/tests/regression_test.py
+++ b/tests/regression_test.py
@@ -6,7 +6,7 @@
from sklearn.tree import DecisionTreeRegressor
from imodels import RuleFitRegressor, SLIMRegressor, GreedyTreeRegressor, HSTreeRegressor, HSTreeRegressorCV, \
- FIGSRegressor, DistilledRegressor, TaoTreeRegressor, BoostedRulesRegressor
+ FIGSRegressor, DistilledRegressor, TaoTreeRegressor, BoostedRulesRegressor, TreeGAMRegressor
class TestClassRegression:
@@ -26,6 +26,7 @@ def test_regression(self):
BoostedRulesRegressor,
partial(DistilledRegressor, teacher=RandomForestRegressor(n_estimators=3),
student=DecisionTreeRegressor()),
+ TreeGAMRegressor,
]:
if model_type == RuleFitRegressor:
m = model_type(include_linear=False, max_rules=3)