From 1d718ffbde5e7e3a2dd17c3298454100d59d516f Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 16 Jan 2021 15:58:53 -0600 Subject: [PATCH 01/29] ENH: add dependancy injection point to transform X & y together --- scikeras/wrappers.py | 35 ++++++++++++++++++++++++++++++++--- tests/test_api.py | 37 +++++++++++++++++++++++++++++++++++-- tests/test_param_routing.py | 2 ++ 3 files changed, 69 insertions(+), 5 deletions(-) diff --git a/scikeras/wrappers.py b/scikeras/wrappers.py index 2d0780ff3..f772ff6fd 100644 --- a/scikeras/wrappers.py +++ b/scikeras/wrappers.py @@ -696,6 +696,29 @@ def feature_encoder(self): """ return FunctionTransformer() + @property + def dataset_transformer(self): + """Retrieve a transformer to be applied jointly to X & y. + + It MUST accept a 2 element tuple as it's single input argument + to `fit`, `transform` and `inverse_transform` and the + latter two MUST also output a two element tuple. + The second element, corresponding to `y`, can be None + if the first element (corresponding to `X`) is a Dataset. + + Metadata will be collected from ``get_metadata`` if + the transformer implements that method. + Override this method to implement a custom data transformer + for entire dataset. + + Returns + ------- + dataset_transformer + Transformer implementing the sklearn transformer + interface. + """ + return FunctionTransformer() + def fit(self, X, y, sample_weight=None, **kwargs) -> "BaseWrapper": """Constructs a new model with ``model`` & fit the model to ``(X, y)``. @@ -774,8 +797,11 @@ def _initialize( target_metadata = getattr(self.target_encoder_, "get_metadata", dict)() vars(self).update(**target_metadata) self.feature_encoder_ = self.feature_encoder.fit(X) - feature_meta = getattr(self.feature_encoder, "get_metadata", dict)() + feature_meta = getattr(self.feature_encoder_, "get_metadata", dict)() vars(self).update(**feature_meta) + self.dataset_transformer_ = self.dataset_transformer.fit((X, y)) + dataset_meta = getattr(self.dataset_transformer_, "get_metadata", dict)() + vars(self).update(**dataset_meta) self.model_ = self._build_keras_model() @@ -846,10 +872,13 @@ def _fit( if sample_weight is not None: X, y, sample_weight = self._validate_sample_weight(X, y, sample_weight) - y = self.target_encoder_.transform(y) X = self.feature_encoder_.transform(X) - self._check_model_compatibility(y) + if y is not None: + y = self.target_encoder_.transform(y) + self._check_model_compatibility(y) + + X, y = self.dataset_transformer_.transform((X, y)) self._fit_keras_model( X, diff --git a/tests/test_api.py b/tests/test_api.py index 95510d65f..b47e6b382 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -2,6 +2,7 @@ import pickle from typing import Any, Dict +from unittest.mock import patch import numpy as np import pytest @@ -17,7 +18,8 @@ from sklearn.exceptions import NotFittedError from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import FunctionTransformer, StandardScaler +from tensorflow.data import Dataset from tensorflow.keras import losses as losses_module from tensorflow.keras import metrics as metrics_module from tensorflow.keras.layers import Conv2D, Dense, Flatten, Input @@ -27,7 +29,7 @@ from tensorflow.python.keras.utils.generic_utils import register_keras_serializable from tensorflow.python.keras.utils.np_utils import to_categorical -from scikeras.wrappers import 
BaseWrapper, KerasClassifier, KerasRegressor +from scikeras.wrappers import KerasClassifier, KerasRegressor from .mlp_models import dynamic_classifier, dynamic_regressor from .testing_utils import basic_checks @@ -812,3 +814,34 @@ def test_prebuilt_model(self, wrapper): np.testing.assert_allclose(y_pred_keras, y_pred_scikeras) # Check that we are still using the same model object assert est.model_ is m2 + + +class TestDatasetTransformer: + def test_conversion_to_dataset(self): + inp = Input((1,)) + out = Dense(1, activation="sigmoid")(inp) + m = Model(inp, out) + m.compile(loss="bce") + + class MyWrapper(KerasClassifier): + @property + def dataset_transformer(self): + f = lambda x_y: (Dataset.from_tensor_slices(x_y), None) + return FunctionTransformer(f) + + est = MyWrapper(m) + X = np.random.random((100, 1)) + y = np.array(["a", "b"] * 50, dtype=str) + fit_orig = m.fit + + def check_fit(**kwargs): + assert isinstance(kwargs["x"], Dataset) + assert kwargs["y"] is None + return fit_orig(**kwargs) + + with patch.object(m, "fit", new=check_fit): + est.fit(X, y) + y_pred = est.predict(X) + assert y_pred.dtype == y.dtype + assert y_pred.shape == y.shape + assert set(y_pred).issubset(set(y)) diff --git a/tests/test_param_routing.py b/tests/test_param_routing.py index 026e75d50..8d2d6149f 100644 --- a/tests/test_param_routing.py +++ b/tests/test_param_routing.py @@ -23,6 +23,7 @@ "n_outputs_", "feature_encoder_", "target_encoder_", + "dataset_transformer_", } keras_regressor_base_meta_set = { @@ -36,6 +37,7 @@ "target_type_", "feature_encoder_", "target_encoder_", + "dataset_transformer_", } From c170f4b9a565531ef72c79f97ff11b14d051c118 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 21 Jan 2021 15:19:53 -0600 Subject: [PATCH 02/29] Extend data transformer notebook with examples of data_transformer usage --- notebooks/DataTransformers.ipynb | 851 ++++++++++++++++++++++++++----- scikeras/utils/transformers.py | 23 +- scikeras/wrappers.py | 133 +++-- tests/test_api.py | 8 +- tests/test_param_routing.py | 2 - 5 files changed, 845 insertions(+), 172 deletions(-) diff --git a/notebooks/DataTransformers.ipynb b/notebooks/DataTransformers.ipynb index 8221ee37a..a12418abe 100644 --- a/notebooks/DataTransformers.ipynb +++ b/notebooks/DataTransformers.ipynb @@ -3,9 +3,13 @@ "nbformat_minor": 0, "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.8.6 64-bit ('.venv': venv)", + "metadata": { + "interpreter": { + "hash": "aa27e2362274a734444ef07021bde9bc2912ecaf24c8326dfe4db5717933d8db" + } + } }, "language_info": { "codemirror_mode": { @@ -17,7 +21,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5-final" + "version": "3.8.6-final" }, "colab": { "name": "DataTransformers.ipynb", @@ -46,6 +50,8 @@ "* Multiple inputs\n", "* Multiple outputs\n", "* Higher-dimensional tensors\n", + "* Ragged datasets (variable datapoints per observation)\n", + "* `tf.data.Dataset`\n", "\n", "In this notebook, we explore how to reconcile this functionality with the sklearn ecosystem via SciKeras Data Transformer interface.\n", "\n", @@ -86,7 +92,9 @@ "* [3. 
Multidimensional inputs with MNIST dataset](#3-multidimensional-inputs-with-MNIST-dataset)\n", " * [3.1 Define Keras Model](#3-1-define-keras-model)\n", " * [3.2 Define transformer](#3-2-define-transformer)\n", - " * [3.3 Test classifier](#3-3-test-classifier)" + " * [3.3 Test classifier](#3-3-test-classifier)\n", + "* [4. Ragged datasets with tf.data.Dataset](#4-ragged-datasets)\n", + "* [5. Multi-output class_weight](#4-multi-output-class-weight)" ] }, { @@ -106,8 +114,56 @@ "source": [ "!python -m pip install scikeras" ], - "execution_count": null, - "outputs": [] + "execution_count": 53, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: scikeras in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (0.2.1)\n", + "Requirement already satisfied: tensorflow>=2.4.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikeras) (2.4.0)\n", + "Requirement already satisfied: scikit-learn>=0.22.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikeras) (0.23.2)\n", + "Requirement already satisfied: scipy>=0.19.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikit-learn>=0.22.0->scikeras) (1.5.4)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikit-learn>=0.22.0->scikeras) (2.1.0)\n", + "Requirement already satisfied: joblib>=0.11 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikit-learn>=0.22.0->scikeras) (1.0.0)\n", + "Requirement already satisfied: numpy>=1.13.3 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikit-learn>=0.22.0->scikeras) (1.19.4)\n", + "Requirement already satisfied: h5py~=2.10.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (2.10.0)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (3.14.0)\n", + "Requirement already satisfied: six~=1.15.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.15.0)\n", + "Requirement already satisfied: keras-preprocessing~=1.1.2 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.1.2)\n", + "Requirement already satisfied: google-pasta~=0.2 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (0.2.0)\n", + "Requirement already satisfied: opt-einsum~=3.3.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (3.3.0)\n", + "Requirement already satisfied: absl-py~=0.10 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (0.11.0)\n", + "Requirement already satisfied: wrapt~=1.12.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.12.1)\n", + "Requirement already satisfied: tensorboard~=2.4 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from 
tensorflow>=2.4.0->scikeras) (2.4.0)\n", + "Requirement already satisfied: flatbuffers~=1.12.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.12)\n", + "Requirement already satisfied: termcolor~=1.1.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.1.0)\n", + "Requirement already satisfied: astunparse~=1.6.3 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.6.3)\n", + "Requirement already satisfied: wheel~=0.35 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (0.36.2)\n", + "Requirement already satisfied: gast==0.3.3 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (0.3.3)\n", + "Requirement already satisfied: grpcio~=1.32.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.32.0)\n", + "Requirement already satisfied: typing-extensions~=3.7.4 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (3.7.4.3)\n", + "Requirement already satisfied: tensorflow-estimator<2.5.0,>=2.4.0rc0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (2.4.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.0.1)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (2.25.1)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.7.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (3.3.3)\n", + "Requirement already satisfied: google-auth<2,>=1.6.3 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.24.0)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (51.0.0)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (0.4.2)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (4.2.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in 
/Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (4.6)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.3.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (0.4.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (2020.12.5)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.26.2)\n", + "Requirement already satisfied: chardet<5,>=3.0.2 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (4.0.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (3.1.0)\n" + ] + } + ] }, { "cell_type": "markdown", @@ -129,7 +185,7 @@ "get_logger().setLevel('ERROR')\n", "warnings.filterwarnings(\"ignore\", message=\"Setting the random state for TF\")" ], - "execution_count": 3, + "execution_count": 54, "outputs": [] }, { @@ -142,7 +198,7 @@ "from scikeras.wrappers import KerasClassifier, KerasRegressor\n", "from tensorflow import keras" ], - "execution_count": 4, + "execution_count": 55, "outputs": [] }, { @@ -214,7 +270,7 @@ " def target_encoder(self):\n", " return RegressorTargetEncoder()" ], - "execution_count": 39, + "execution_count": 56, "outputs": [] }, { @@ -234,7 +290,7 @@ "source": [ "from sklearn.base import BaseEstimator, TransformerMixin" ], - "execution_count": 40, + "execution_count": 57, "outputs": [] }, { @@ -255,7 +311,7 @@ " def target_encoder(self):\n", " return MultiOutputTransformer(...)" ], - "execution_count": 41, + "execution_count": 58, "outputs": [] }, { @@ -304,7 +360,7 @@ " clf.fit(X, y) # Got: foobarbaz\n", " print(clf.my_param_) # foobarbaz" ], - "execution_count": 42, + "execution_count": 59, "outputs": [] }, { @@ -365,7 +421,7 @@ " )\n", " return model" ], - "execution_count": 43, + "execution_count": 60, "outputs": [] }, { @@ -401,7 +457,7 @@ "model.fit(X, y, verbose=0)\n", "y_pred = model.predict(X)" ], - "execution_count": 44, + "execution_count": 61, "outputs": [] }, { @@ -417,15 +473,14 @@ "source": [ "print(y_pred[0][:2, :])" ], - "execution_count": 45, + "execution_count": 62, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ - "[[0.4910586 ]\n", - " [0.47602195]]\n" - ], - "name": "stdout" + "[[0.396703 ]\n [0.36225754]]\n" + ] } ] }, @@ -442,15 +497,14 @@ "source": [ "print(y_pred[1][:2, :])" ], - 
"execution_count": 46, + "execution_count": 63, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ - "[[0.16950993 0.13774167 0.23052557 0.22972858 0.2324943 ]\n", - " [0.1702391 0.13294716 0.20055309 0.25442293 0.24183771]]\n" - ], - "name": "stdout" + "[[0.23275968 0.180671 0.30023158 0.16657698 0.11976068]\n [0.20901404 0.19675821 0.29663667 0.16592933 0.13166177]]\n" + ] } ] }, @@ -544,7 +598,7 @@ " \"n_outputs_expected_\": self.n_outputs_expected_,\n", " }" ], - "execution_count": 219, + "execution_count": 64, "outputs": [] }, { @@ -577,15 +631,14 @@ "print(\"`y`, as will be passed to Keras:\")\n", "print([y_keras[0][:4], y_keras[1][:4]])" ], - "execution_count": 220, + "execution_count": 65, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ - "`y`, as will be passed to Keras:\n", - "[array([0, 1, 1, 1]), array([2, 4, 2, 0])]\n" - ], - "name": "stdout" + "`y`, as will be passed to Keras:\n[array([1, 1, 1, 0]), array([3, 2, 2, 3])]\n" + ] } ] }, @@ -604,30 +657,28 @@ "print(\"`y_pred`, as will be returned to sklearn:\")\n", "y_pred_sklearn[:5]" ], - "execution_count": 221, + "execution_count": 66, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "`y_pred`, as will be returned to sklearn:\n" - ], - "name": "stdout" + ] }, { "output_type": "execute_result", "data": { "text/plain": [ - "array([[0, 3],\n", - " [1, 3],\n", - " [0, 3],\n", - " [0, 3],\n", - " [0, 3]])" + "array([[0, 2],\n", + " [0, 2],\n", + " [0, 2],\n", + " [0, 2],\n", + " [0, 2]])" ] }, - "metadata": { - "tags": [] - }, - "execution_count": 221 + "metadata": {}, + "execution_count": 66 } ] }, @@ -644,14 +695,14 @@ "source": [ "print(f\"metadata = {tf.get_metadata()}\")" ], - "execution_count": 222, + "execution_count": 67, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "metadata = {'n_classes_': [2, 5], 'n_outputs_expected_': 2}\n" - ], - "name": "stdout" + ] } ] }, @@ -676,7 +727,7 @@ " def target_encoder(self):\n", " return MultiOutputTransformer()" ], - "execution_count": 223, + "execution_count": 68, "outputs": [] }, { @@ -700,7 +751,7 @@ "X = y_sklearn\n", "X = StandardScaler().fit_transform(X)" ], - "execution_count": 224, + "execution_count": 69, "outputs": [] }, { @@ -720,7 +771,7 @@ " y_pred_bin, y_pred_cat = y_pred[:, 0], y_pred[:, 1]\n", " return np.mean([accuracy_score(y_bin, y_pred_bin), accuracy_score(y_cat, y_pred_cat)])" ], - "execution_count": 225, + "execution_count": 70, "outputs": [] }, { @@ -738,19 +789,17 @@ "\n", "np.mean(cross_val_score(clf, X, y_sklearn, scoring=scorer))" ], - "execution_count": 226, + "execution_count": 71, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "0.9800000000000001" + "0.985" ] }, - "metadata": { - "tags": [] - }, - "execution_count": 226 + "metadata": {}, + "execution_count": 71 } ] }, @@ -778,20 +827,21 @@ "id": "NPOO7tO6-1TV" }, "source": [ - "from sklearn.base import BaseEstimator, TransformerMixin\n", + "if False: # avoid running pseudocode\n", + " from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", "\n", - "class MultiOutputTransformer(BaseEstimator, TransformerMixin):\n", - " ...\n", + " class MultiInptuTransformer(BaseEstimator, TransformerMixin):\n", + " ...\n", "\n", "\n", - "class MultiOutputClassifier(KerasClassifier):\n", + " class MultiInputClassifier(KerasClassifier):\n", "\n", - " @property\n", - " def feature_encoder(self):\n", - " return MultiInputTransformer(...)" + " @property\n", + " def feature_encoder(self):\n", + " return 
MultiInputTransformer(...)" ], - "execution_count": 227, + "execution_count": 72, "outputs": [] }, { @@ -835,7 +885,7 @@ "\n", " return model" ], - "execution_count": 228, + "execution_count": 73, "outputs": [] }, { @@ -865,7 +915,7 @@ "model.fit(X, y, verbose=0, epochs=100)\n", "y_pred = model.predict(X).squeeze()" ], - "execution_count": 230, + "execution_count": 74, "outputs": [] }, { @@ -883,19 +933,17 @@ "\n", "r2_score(y, y_pred)" ], - "execution_count": 231, + "execution_count": 75, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "0.815902187716304" + "0.9495515519877006" ] }, - "metadata": { - "tags": [] - }, - "execution_count": 231 + "metadata": {}, + "execution_count": 75 } ] }, @@ -944,7 +992,7 @@ " func=lambda X: [X[:, 0], X[:, 1]],\n", " )" ], - "execution_count": 13, + "execution_count": 76, "outputs": [] }, { @@ -982,19 +1030,17 @@ "\n", "np.mean(cross_val_score(reg, X_sklearn, y))" ], - "execution_count": 233, + "execution_count": 77, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "0.9994605932537043" + "0.9997162264854872" ] }, - "metadata": { - "tags": [] - }, - "execution_count": 233 + "metadata": {}, + "execution_count": 77 } ] }, @@ -1032,7 +1078,7 @@ "(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n", "x_train.shape" ], - "execution_count": 22, + "execution_count": 78, "outputs": [ { "output_type": "execute_result", @@ -1041,10 +1087,8 @@ "(60000, 28, 28)" ] }, - "metadata": { - "tags": [] - }, - "execution_count": 22 + "metadata": {}, + "execution_count": 78 } ] }, @@ -1071,15 +1115,14 @@ "print(y_train.shape)\n", "print(np.unique(y_train))" ], - "execution_count": 23, + "execution_count": 79, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ - "(60000,)\n", - "[0 1 2 3 4 5 6 7 8 9]\n" - ], - "name": "stdout" + "(60000,)\n[0 1 2 3 4 5 6 7 8 9]\n" + ] } ] }, @@ -1108,7 +1151,7 @@ "x_train = MinMaxScaler().fit_transform(x_train)\n", "x_test = MinMaxScaler().fit_transform(x_test)" ], - "execution_count": 24, + "execution_count": 80, "outputs": [] }, { @@ -1124,14 +1167,14 @@ "source": [ "print(x_train.shape[1:]) # 784 = 28*28" ], - "execution_count": 25, + "execution_count": 81, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "(784,)\n" - ], - "name": "stdout" + ] } ] }, @@ -1148,14 +1191,14 @@ "source": [ "print(np.min(x_train), np.max(x_train)) # scaled 0-1" ], - "execution_count": 26, + "execution_count": 82, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "0.0 1.0\n" - ], - "name": "stdout" + ] } ] }, @@ -1214,7 +1257,7 @@ " )\n", " return model" ], - "execution_count": 31, + "execution_count": 83, "outputs": [] }, { @@ -1240,7 +1283,7 @@ " func=lambda X: X.reshape(X.shape[0], *input_shape),\n", " )" ], - "execution_count": 32, + "execution_count": 84, "outputs": [] }, { @@ -1257,7 +1300,7 @@ " random_state=0,\n", ")" ], - "execution_count": 36, + "execution_count": 85, "outputs": [] }, { @@ -1291,53 +1334,53 @@ "source": [ "clf.fit(x_train, y_train)" ], - "execution_count": 37, + "execution_count": 86, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "Epoch 1/15\n", - "422/422 [==============================] - 39s 94ms/step - loss: 0.3532 - val_loss: 0.0825\n", + "422/422 [==============================] - 14s 33ms/step - loss: 0.7640 - val_loss: 0.0871\n", "Epoch 2/15\n", - "422/422 [==============================] - 39s 93ms/step - loss: 0.1124 - val_loss: 0.0590\n", + "422/422 
[==============================] - 15s 35ms/step - loss: 0.1219 - val_loss: 0.0600\n", "Epoch 3/15\n", - "422/422 [==============================] - 39s 93ms/step - loss: 0.0870 - val_loss: 0.0483\n", + "422/422 [==============================] - 17s 40ms/step - loss: 0.0863 - val_loss: 0.0454\n", "Epoch 4/15\n", - "422/422 [==============================] - 39s 93ms/step - loss: 0.0722 - val_loss: 0.0419\n", + "422/422 [==============================] - 18s 42ms/step - loss: 0.0755 - val_loss: 0.0435\n", "Epoch 5/15\n", - "422/422 [==============================] - 38s 91ms/step - loss: 0.0647 - val_loss: 0.0419\n", + "422/422 [==============================] - 17s 39ms/step - loss: 0.0645 - val_loss: 0.0438\n", "Epoch 6/15\n", - "422/422 [==============================] - 39s 93ms/step - loss: 0.0583 - val_loss: 0.0361\n", + "422/422 [==============================] - 18s 42ms/step - loss: 0.0553 - val_loss: 0.0361\n", "Epoch 7/15\n", - "422/422 [==============================] - 39s 93ms/step - loss: 0.0537 - val_loss: 0.0360\n", + "422/422 [==============================] - 17s 41ms/step - loss: 0.0508 - val_loss: 0.0372\n", "Epoch 8/15\n", - "422/422 [==============================] - 40s 94ms/step - loss: 0.0509 - val_loss: 0.0335\n", + "422/422 [==============================] - 18s 44ms/step - loss: 0.0512 - val_loss: 0.0342\n", "Epoch 9/15\n", - "422/422 [==============================] - 39s 94ms/step - loss: 0.0470 - val_loss: 0.0321\n", + "422/422 [==============================] - 18s 41ms/step - loss: 0.0450 - val_loss: 0.0307\n", "Epoch 10/15\n", - "422/422 [==============================] - 39s 93ms/step - loss: 0.0433 - val_loss: 0.0315\n", + "422/422 [==============================] - 18s 43ms/step - loss: 0.0422 - val_loss: 0.0308\n", "Epoch 11/15\n", - "422/422 [==============================] - 39s 93ms/step - loss: 0.0431 - val_loss: 0.0315\n", + "422/422 [==============================] - 17s 41ms/step - loss: 0.0378 - val_loss: 0.0321\n", "Epoch 12/15\n", - "422/422 [==============================] - 39s 93ms/step - loss: 0.0410 - val_loss: 0.0297\n", + "422/422 [==============================] - 17s 41ms/step - loss: 0.0376 - val_loss: 0.0323\n", "Epoch 13/15\n", - "422/422 [==============================] - 40s 95ms/step - loss: 0.0395 - val_loss: 0.0293\n", + "422/422 [==============================] - 17s 39ms/step - loss: 0.0343 - val_loss: 0.0299\n", "Epoch 14/15\n", - "422/422 [==============================] - 38s 89ms/step - loss: 0.0386 - val_loss: 0.0296\n", + "422/422 [==============================] - 18s 42ms/step - loss: 0.0358 - val_loss: 0.0304\n", "Epoch 15/15\n", - "422/422 [==============================] - 38s 90ms/step - loss: 0.0362 - val_loss: 0.0284\n" - ], - "name": "stdout" + "422/422 [==============================] - 17s 41ms/step - loss: 0.0322 - val_loss: 0.0294\n" + ] }, { "output_type": "execute_result", "data": { "text/plain": [ "MultiDimensionalClassifier(\n", - "\tmodel=\n", + "\tmodel=\n", "\tbuild_fn=None\n", "\twarm_start=False\n", - "\trandom_state=None\n", + "\trandom_state=0\n", "\toptimizer=rmsprop\n", "\tloss=None\n", "\tmetrics=None\n", @@ -1348,13 +1391,12 @@ "\tshuffle=True\n", "\trun_eagerly=False\n", "\tepochs=15\n", + "\tclass_weight=None\n", ")" ] }, - "metadata": { - "tags": [] - }, - "execution_count": 37 + "metadata": {}, + "execution_count": 86 } ] }, @@ -1372,16 +1414,573 @@ "score = clf.score(x_test, y_test)\n", "print(f\"Test score (accuracy): {score:.2f}\")" ], - "execution_count": 38, + "execution_count": 
87, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ - "79/79 [==============================] - 2s 26ms/step\n", + "79/79 [==============================] - 1s 12ms/step\n", "Test score (accuracy): 0.99\n" - ], - "name": "stdout" + ] + } + ] + }, + { + "source": [ + "## 4. Ragged datasets with tf.data.Dataset" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "SciKeras provides a third dependancy injection point that operats on the entire dataset: X, y & sample_weight. This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. One use case for this dependancy injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. We can use this to create a `tf.data.Dataset` of ragged tensors." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "Note that `dataset_transformer` should accept a single **3 element tuple** as its argument and return value:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Help on property:\n\n Retrieve a transformer to be applied jointly to the entire\n dataset (X, y & sample_weights).\n \n By default, KerasClassifier implements ClassWeightDataTransformer,\n which embeds class_weight into sample_weight.\n \n You can override this method to provide custom transformations.\n To keep the default class_weight behavior, you can chain your\n transfromer and ClassWeightDataTransformer using a Pipeline.\n \n It MUST accept a 3 element tuple as it's single input argument\n to `fit` and `transform`. `transform` must also output\n a 3 element tuple in the same format.\n The first element corresponds to X, or as an output from the\n transformer, to a `tf.data.Dataset` instance containing\n X, y and optionally sample_weights.\n The second element corresponds to `y`, and may be None\n on the output side.\n The third element is `sample_weights` which may be None\n on the input and output sides.\n \n Note that `inverse_transform` is never used\n and is not required to be implemented.\n \n Returns\n -------\n dataset_transformer\n Transformer implementing the sklearn transformer\n interface.\n\n" + ] + } + ], + "source": [ + "help(KerasClassifier.dataset_transformer)" + ] + }, + { + "source": [ + "When you return a tuple like `(tf.data.Dataset(...), None, None)`, SciKeras will pass the data untouched to `Model.fit` like `Model.fit(x=tf.data.Dataset(...), y=None, sample_weight=None)`.\n", + "\n", + "Let's start by defining our data. We'll have an extra \"feature\" that marks the observation index, but we'll remove it when we deconstruct our data in the transformer." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "feature_1 = np.random.uniform(size=(10, ))\n", + "feature_2 = np.random.uniform(size=(10, ))\n", + "obs = [0] * 3 + [1] * 2 + [2] * 1 + [3] * 2 + [4] * 2\n", + "\n", + "X = np.column_stack([feature_1, feature_2, obs]).astype(\"float32\")\n", + "\n", + "y = np.array([\"class1\"] * 5 + [\"class2\"] * 5, dtype=str)" + ] + }, + { + "source": [ + "Next, we define our `dataset_transformer`. We will do this by defining a custom forward transformation outside of the Keras model. 
Note that we do not define an inverse transformation since that is never used.\n", + "Also note that `dataset_transformer` will _always_ be called with `X` (i.e. the first element of the tuple will always be populated), but will be called with `y=None` when used for `predict`. Thus,\n", + "you should check if `y` and `sample_weigh` are None before doing any operations on them." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple, Optional\n", + "\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from tensorflow import RaggedTensor\n", + "from tensorflow.data import Dataset\n", + "\n", + "\n", + "def dataset_transformer(data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[Dataset, None, None]:\n", + " X, y, sample_weights = data\n", + " if y is not None:\n", + " y = y.reshape(-1, 1 if len(y.shape) == 1 else y.shape[1])\n", + " y = y[RaggedTensor.from_value_rowids(y, X[:, -1]).row_starts().numpy()]\n", + " if sample_weights is not None:\n", + " sample_weights = sample_weights.reshape(-1, 1 if len(sample_weights.shape) == 1 else sample_weights.shape[1])\n", + " sample_weights = sample_weights[RaggedTensor.from_value_rowids(sample_weights, X[:, -1]).row_starts().numpy()]\n", + " X = RaggedTensor.from_value_rowids(X[:, :-1], X[:, -1])\n", + " return (Dataset.from_tensor_slices((X, y, sample_weights)), None, None)\n" + ] + }, + { + "source": [ + "Lets quickly test our transformer:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(,\n", + " None,\n", + " None)" + ] + }, + "metadata": {}, + "execution_count": 91 } + ], + "source": [ + "data = dataset_transformer((X, y, None))\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(,\n", + " None,\n", + " None)" + ] + }, + "metadata": {}, + "execution_count": 92 + } + ], + "source": [ + "data = dataset_transformer((X, None, None))\n", + "data" + ] + }, + { + "source": [ + "Our shapes look good, and we can handle the `y=None` case.\n", + "Next, we can add our transormer to our model." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import FunctionTransformer\n", + "\n", + "\n", + "class RaggedClassifier(KerasClassifier):\n", + "\n", + " @property\n", + " def dataset_transformer(self):\n", + " return FunctionTransformer(dataset_transformer)" + ] + }, + { + "source": [ + "Now we can define a Model. We need some way to handle/flatten our ragged arrays within our model. For this example, we use a custom mean layer, but you could use an Embedding layer, LSTM, etc." 
+ ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow import reduce_mean, reshape\n", + "from tensorflow.keras import Sequential, layers\n", + "\n", + "\n", + "class CustomMean(layers.Layer):\n", + "\n", + " def __init__(self, axis=None):\n", + " super(CustomMean, self).__init__()\n", + " self._supports_ragged_inputs = True\n", + " self.axis = axis\n", + "\n", + " def call(self, inputs, **kwargs):\n", + " input_shape = inputs.get_shape()\n", + " return reshape(reduce_mean(inputs, axis=self.axis), (1, *input_shape[1:]))\n", + "\n", + "\n", + "def get_model(meta):\n", + " inp_shape = meta[\"X_shape_\"][1]-1\n", + " model = Sequential([ \n", + " layers.Input(shape=(inp_shape,), ragged=True),\n", + " CustomMean(axis=0),\n", + " layers.Dense(1, activation='sigmoid')\n", + " ])\n", + " return model" + ] + }, + { + "source": [ + "And attatch our model to our classifier wrapper:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "clf = RaggedClassifier(get_model, loss=\"bce\")" + ] + }, + { + "source": [ + "Finally, let's train and predict:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "5/5 [==============================] - 0s 1ms/step - loss: 0.6143\n", + "5/5 [==============================] - 0s 1ms/step\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['class1', 'class1', 'class1', 'class1', 'class1'], dtype=' \"DatasetTransformer\":\n", + " return self\n", + "\n", + " def transform(self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[Dataset, None, None]:\n", + " if self.class_weight is None:\n", + " return data\n", + " class_weight = self.class_weight\n", + " if isinstance(class_weight, str): # handle \"balanced\"\n", + " class_weight_ = class_weight\n", + " class_weight = defaultdict(lambda: class_weight_)\n", + " X, y, sample_weights = data\n", + " assert sample_weights is None, \"Cannot use class_weight & sample_weights together\"\n", + " if y is not None:\n", + " # y should be a list of arrays, as split up by MultiOutputTransformer\n", + " sample_weights = dict()\n", + " for output_num, (output_name, output_data) in enumerate(zip(self.output_names, y)):\n", + " # class_weight is expected to be indexable by output_number\n", + " # see https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_sample_weight.html\n", + " # Note that it is trivial to change the expected format to match Keras' ({output_name: weights, ...})\n", + " # see https://github.com/keras-team/keras/issues/4735#issuecomment-267473722\n", + " cls_wt_out = class_weight[output_num]\n", + " sample_weights[output_name] = compute_sample_weight(cls_wt_out, output_data)\n", + " return X, y, sample_weights\n" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "def get_model(meta, compile_kwargs):\n", + " inp = keras.layers.Input(shape=(meta[\"n_features_in_\"]))\n", + " x1 = keras.layers.Dense(100, activation=\"relu\")(inp)\n", + " out_bin = keras.layers.Dense(1, activation=\"sigmoid\")(x1)\n", + " out_cat = keras.layers.Dense(meta[\"n_classes_\"][1], activation=\"softmax\")(x1)\n", + " model 
= keras.Model(inputs=inp, outputs=[out_bin, out_cat])\n", + " model.compile(\n", + " loss=[\"binary_crossentropy\", \"sparse_categorical_crossentropy\"],\n", + " optimizer=compile_kwargs[\"optimizer\"]\n", + " )\n", + " return model\n", + "\n", + "\n", + "class CustomClassifier(KerasClassifier):\n", + "\n", + " @property\n", + " def target_encoder(self):\n", + " return MultiOutputTransformer()\n", + " \n", + " @property\n", + " def dataset_transformer(self):\n", + " return DatasetTransformer(\n", + " output_names=self.model_.output_names,\n", + " class_weight=self.class_weight\n", + " )" + ] + }, + { + "source": [ + "Next, we define the data. We'll use `sklearn.datasets.make_blobs` to generate a relatively noisy dataset:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_blobs\n", + "\n", + "\n", + "X, y = make_blobs(centers=3, random_state=0, cluster_std=20)\n", + "# make a binary target for \"is the value of the first class?\"\n", + "y_bin = y == y[0]\n", + "y = np.column_stack([y_bin, y])" + ] + }, + { + "source": [ + "Test the model without specifying class weighting:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[91 9]\n[28 30 42]\n" + ] + } + ], + "source": [ + "clf = CustomClassifier(get_model, epochs=100, verbose=0, random_state=0)\n", + "clf.fit(X, y)\n", + "y_pred = clf.predict(X)\n", + "(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True)\n", + "print(counts_bin)\n", + "(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True)\n", + "print(counts_cat)" + ] + }, + { + "source": [ + "As you can see, without `class_weight=\"balanced\"`, our classifier only predicts mainly a single class for the first output. Now with `class_weight=\"balanced\"`:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[57 43]\n[27 27 46]\n" + ] + } + ], + "source": [ + "clf = CustomClassifier(get_model, class_weight=\"balanced\", epochs=100, verbose=0, random_state=0)\n", + "clf.fit(X, y)\n", + "y_pred = clf.predict(X)\n", + "(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True)\n", + "print(counts_bin)\n", + "(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True)\n", + "print(counts_cat)" + ] + }, + { + "source": [ + "Now, we get (mostly) balanced classes. But what if we want to specify our classes manually? You will notice that in when we defined `DatasetTransformer`, we gave it the ability to handle\n", + "a list of class weights. 
For demonstration purposes, we will highly bias towards the second class in each output:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "clf = CustomClassifier(get_model, class_weight=[{0: 0.1, 1: 1}, {0: 0.1, 1: 1, 2: 0.1}], epochs=100, verbose=0, random_state=0)\n", + "clf.fit(X, y)\n", + "y_pred = clf.predict(X)\n", + "(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True)\n", + "print(counts_bin)\n", + "(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True)\n", + "print(counts_cat)" + ], + "cell_type": "code", + "metadata": {}, + "execution_count": 136, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[ 7 93]\n[ 2 98]\n" + ] + } + ] + }, + { + "source": [ + "Or mixing the two methods, because our first output is unbalanced but our second is (presumably) balanced:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[57 43]\n[30 25 45]\n" + ] + } + ], + "source": [ + "clf = CustomClassifier(get_model, class_weight=[\"balanced\", None], epochs=100, verbose=0, random_state=0)\n", + "clf.fit(X, y)\n", + "y_pred = clf.predict(X)\n", + "(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True)\n", + "print(counts_bin)\n", + "(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True)\n", + "print(counts_cat)" ] } ] diff --git a/scikeras/utils/transformers.py b/scikeras/utils/transformers.py index 703cdbf71..6a1c951bf 100644 --- a/scikeras/utils/transformers.py +++ b/scikeras/utils/transformers.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import tensorflow as tf @@ -7,6 +7,7 @@ from sklearn.exceptions import NotFittedError from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder +from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.multiclass import type_of_target from tensorflow.keras.losses import Loss from tensorflow.python.keras.losses import is_categorical_crossentropy @@ -390,3 +391,23 @@ def get_metadata(self): "n_outputs_": self.n_outputs_, "n_outputs_expected_": self.n_outputs_expected_, } + + +class ClassWeightDataTransformer(BaseEstimator, TransformerMixin): + def __init__(self, class_weight: Optional[Union[str, Dict[int, float]]] = None): + self.class_weight = class_weight + + def fit( + self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]] + ) -> "ClassWeightDataTransformer": + return self + + def transform( + self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]] + ) -> Tuple[np.ndarray, Union[np.ndarray, None], Union[np.ndarray, None]]: + X, y, sample_weight = data + if self.class_weight is None or y is None: + return data + sample_weight = 1 if sample_weight is None else sample_weight + sample_weight *= compute_sample_weight(class_weight=self.class_weight, y=y) + return X, y, sample_weight diff --git a/scikeras/wrappers.py b/scikeras/wrappers.py index f772ff6fd..9ffccdff5 100644 --- a/scikeras/wrappers.py +++ b/scikeras/wrappers.py @@ -5,6 +5,7 @@ import warnings from collections import defaultdict +from random import sample from typing import Any, Callable, Dict, Iterable, List, Tuple, Type, Union import numpy as np @@ -15,7 +16,6 @@ from sklearn.metrics import accuracy_score as sklearn_accuracy_score from 
sklearn.metrics import r2_score as sklearn_r2_score from sklearn.preprocessing import FunctionTransformer -from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _check_sample_weight, check_array, check_X_y from tensorflow.keras import losses as losses_module @@ -33,7 +33,11 @@ unflatten_params, ) from scikeras.utils import loss_name, metric_name -from scikeras.utils.transformers import ClassifierLabelEncoder, RegressorTargetEncoder +from scikeras.utils.transformers import ( + ClassifierLabelEncoder, + ClassWeightDataTransformer, + RegressorTargetEncoder, +) class BaseWrapper(BaseEstimator): @@ -135,7 +139,6 @@ class BaseWrapper(BaseEstimator): "callbacks", "validation_split", "shuffle", - "class_weight", "sample_weight", "initial_epoch", "validation_steps", @@ -550,7 +553,12 @@ def _check_model_compatibility(self, y: np.ndarray) -> None: ) def _validate_data( - self, X=None, y=None, reset: bool = False, y_numeric: bool = False + self, + X=None, + y=None, + sample_weight=None, + reset: bool = False, + y_numeric: bool = False, ) -> Tuple[np.ndarray, Union[np.ndarray, None]]: """Validate input arrays and set or check their meta-parameters. @@ -657,7 +665,9 @@ def _check_array_dtype(arr, force_numeric): n_features_in_, self.__class__.__name__, self.n_features_in_ ) ) - return X, y + if sample_weight is not None: + X, y, sample_weight = self._validate_sample_weight(X, y, sample_weight) + return X, y, sample_weight def _type_of_target(self, y: np.ndarray) -> str: return type_of_target(y) @@ -698,18 +708,24 @@ def feature_encoder(self): @property def dataset_transformer(self): - """Retrieve a transformer to be applied jointly to X & y. + """Retrieve a transformer to be applied jointly to the entire + dataset (X, y & sample_weights). - It MUST accept a 2 element tuple as it's single input argument - to `fit`, `transform` and `inverse_transform` and the - latter two MUST also output a two element tuple. - The second element, corresponding to `y`, can be None - if the first element (corresponding to `X`) is a Dataset. + You can override this method to provide custom transformations. - Metadata will be collected from ``get_metadata`` if - the transformer implements that method. - Override this method to implement a custom data transformer - for entire dataset. + It MUST accept a 3 element tuple as it's single input argument + to `fit` and `transform`. `transform` must also output + a 3 element tuple in the same format. + The first element corresponds to X, or as an output from the + transformer, to a `tf.data.Dataset` instance containing + X, y and optionally sample_weights. + The second element corresponds to `y`, and may be None + on the output side. + The third element is `sample_weights` which may be None + on the input and output sides. + + Note that `inverse_transform` is never used + and is not required to be implemented. 
Returns ------- @@ -775,7 +791,10 @@ def initialized_(self) -> bool: return hasattr(self, "model_") def _initialize( - self, X: np.ndarray, y: Union[np.ndarray, None] = None + self, + X: np.ndarray, + y: Union[np.ndarray, None] = None, + sample_weight: Union[np.ndarray, None] = None, ) -> Tuple[np.ndarray, np.ndarray]: # Handle random state @@ -791,7 +810,7 @@ def _initialize( # int or None self._random_state = self.random_state - X, y = self._validate_data(X, y, reset=True) + X, y, sample_weight = self._validate_data(X, y, sample_weight, reset=True) self.target_encoder_ = self.target_encoder.fit(y) target_metadata = getattr(self.target_encoder_, "get_metadata", dict)() @@ -799,15 +818,16 @@ def _initialize( self.feature_encoder_ = self.feature_encoder.fit(X) feature_meta = getattr(self.feature_encoder_, "get_metadata", dict)() vars(self).update(**feature_meta) - self.dataset_transformer_ = self.dataset_transformer.fit((X, y)) - dataset_meta = getattr(self.dataset_transformer_, "get_metadata", dict)() - vars(self).update(**dataset_meta) self.model_ = self._build_keras_model() - return X, y + self.dataset_transformer_ = self.dataset_transformer.fit((X, y, sample_weight)) + dataset_meta = getattr(self.dataset_transformer_, "get_metadata", dict)() + vars(self).update(**dataset_meta) + + return X, y, sample_weight - def initialize(self, X, y=None) -> "BaseWrapper": + def initialize(self, X, y=None, sample_weight=None) -> "BaseWrapper": """Initialize the model without any fitting. You only need to call this model if you explicitly do not want to do any fitting @@ -828,7 +848,7 @@ def initialize(self, X, y=None) -> "BaseWrapper": BaseWrapper A reference to the BaseWrapper instance for chained calling. """ - self._initialize(X, y) + self._initialize(X, y, sample_weight) return self # to allow chained calls like initialize(...).predict(...) 
def _fit( @@ -865,12 +885,9 @@ def _fit( """ # Data checks if not ((self.warm_start or warm_start) and self.initialized_): - X, y = self._initialize(X, y) + X, y, sample_weight = self._initialize(X, y, sample_weight) else: - X, y = self._validate_data(X, y) - - if sample_weight is not None: - X, y, sample_weight = self._validate_sample_weight(X, y, sample_weight) + X, y, sample_weight = self._validate_data(X, y, sample_weight) X = self.feature_encoder_.transform(X) @@ -878,7 +895,7 @@ def _fit( y = self.target_encoder_.transform(y) self._check_model_compatibility(y) - X, y = self.dataset_transformer_.transform((X, y)) + X, y, sample_weight = self.dataset_transformer_.transform((X, y, sample_weight)) self._fit_keras_model( X, @@ -953,10 +970,11 @@ def predict(self, X, **kwargs): "Estimator needs to be fit before `predict` " "can be called" ) # basic input checks - X, _ = self._validate_data(X=X, y=None) + X, _, _ = self._validate_data(X=X) # pre process X X = self.feature_encoder_.transform(X) + X, _, _ = self.dataset_transformer_.transform((X, None, None)) # filter kwargs and get attributes for predict params = self.get_params() @@ -1026,7 +1044,7 @@ def score(self, X, y, sample_weight=None) -> float: ) # validate y - _, y = self._validate_data(X=None, y=y) + _, y, _ = self._validate_data(X=None, y=y) # compute Keras model score y_pred = self.predict(X) @@ -1343,6 +1361,40 @@ def target_encoder(self): categories = "auto" if self.classes_ is None else [self.classes_] return ClassifierLabelEncoder(loss=self.loss, categories=categories) + @property + def dataset_transformer(self): + """Retrieve a transformer to be applied jointly to the entire + dataset (X, y & sample_weights). + + By default, KerasClassifier implements ClassWeightDataTransformer, + which embeds class_weight into sample_weight. + + You can override this method to provide custom transformations. + To keep the default class_weight behavior, you can chain your + transfromer and ClassWeightDataTransformer using a Pipeline. + + It MUST accept a 3 element tuple as it's single input argument + to `fit` and `transform`. `transform` must also output + a 3 element tuple in the same format. + The first element corresponds to X, or as an output from the + transformer, to a `tf.data.Dataset` instance containing + X, y and optionally sample_weights. + The second element corresponds to `y`, and may be None + on the output side. + The third element is `sample_weights` which may be None + on the input and output sides. + + Note that `inverse_transform` is never used + and is not required to be implemented. + + Returns + ------- + dataset_transformer + Transformer implementing the sklearn transformer + interface. + """ + return ClassWeightDataTransformer(class_weight=self.class_weight) + def initialize(self, X, y) -> "KerasClassifier": """Initialize the model without any fitting. 
You only need to call this model if you explicitly do not want to do any fitting @@ -1393,9 +1445,6 @@ def fit(self, X, y, sample_weight=None, **kwargs) -> "KerasClassifier": (ex: instance.fit(X,y).transform(X) ) """ self.classes_ = None - if self.class_weight is not None: - sample_weight = 1 if sample_weight is None else sample_weight - sample_weight *= compute_sample_weight(class_weight=self.class_weight, y=y) super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs) return self @@ -1430,9 +1479,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None) -> "KerasClassifie self.classes_ = ( classes if classes is not None else getattr(self, "classes_", None) ) - if self.class_weight is not None: - sample_weight = 1 if sample_weight is None else sample_weight - sample_weight *= compute_sample_weight(class_weight=self.class_weight, y=y) super().partial_fit(X, y, sample_weight=sample_weight) return self @@ -1472,7 +1518,7 @@ def predict_proba(self, X, **kwargs): ) # basic input checks - X, _ = self._validate_data(X=X, y=None) + X, _, _ = self._validate_data(X=X) # pre process X X = self.feature_encoder_.transform(X) @@ -1635,13 +1681,20 @@ def scorer(y_true, y_pred, **kwargs) -> float: return sklearn_r2_score(y_true, y_pred, **kwargs) def _validate_data( - self, X=None, y=None, reset: bool = False, y_numeric: bool = False + self, + X=None, + y=None, + sample_weight=None, + reset: bool = False, + y_numeric: bool = False, ) -> Tuple[np.ndarray, Union[np.ndarray, None]]: # For regressors, y should ALWAYS be numeric # To enforce this without additional dtype checks, we set `y_numeric=True` # when calling `_validate_data` which will force casting to numeric for # non-numeric data. - return super()._validate_data(X=X, y=y, reset=reset, y_numeric=True) + return super()._validate_data( + X=X, y=y, sample_weight=sample_weight, reset=reset, y_numeric=True + ) @property def target_encoder(self): diff --git a/tests/test_api.py b/tests/test_api.py index b47e6b382..e0cc80c00 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,7 +1,7 @@ """Tests for Scikit-learn API wrapper.""" import pickle -from typing import Any, Dict +from typing import Any, Dict, Tuple from unittest.mock import patch import numpy as np @@ -823,11 +823,13 @@ def test_conversion_to_dataset(self): m = Model(inp, out) m.compile(loss="bce") + def tf(X_y_s: Tuple[np.ndarray, np.ndarray, np.ndarray]): + return Dataset.from_tensor_slices(X_y_s), None, None + class MyWrapper(KerasClassifier): @property def dataset_transformer(self): - f = lambda x_y: (Dataset.from_tensor_slices(x_y), None) - return FunctionTransformer(f) + return FunctionTransformer(tf) est = MyWrapper(m) X = np.random.random((100, 1)) diff --git a/tests/test_param_routing.py b/tests/test_param_routing.py index 8d2d6149f..026e75d50 100644 --- a/tests/test_param_routing.py +++ b/tests/test_param_routing.py @@ -23,7 +23,6 @@ "n_outputs_", "feature_encoder_", "target_encoder_", - "dataset_transformer_", } keras_regressor_base_meta_set = { @@ -37,7 +36,6 @@ "target_type_", "feature_encoder_", "target_encoder_", - "dataset_transformer_", } From b7fb34cefa5104ecf24d0f77cbe5a0e1d74ab33d Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 21 Jan 2021 15:42:05 -0600 Subject: [PATCH 03/29] run entire notebook --- notebooks/DataTransformers.ipynb | 238 ++++++++++++------------------- 1 file changed, 95 insertions(+), 143 deletions(-) diff --git a/notebooks/DataTransformers.ipynb 
b/notebooks/DataTransformers.ipynb index a12418abe..1d042696f 100644 --- a/notebooks/DataTransformers.ipynb +++ b/notebooks/DataTransformers.ipynb @@ -114,56 +114,8 @@ "source": [ "!python -m pip install scikeras" ], - "execution_count": 53, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: scikeras in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (0.2.1)\n", - "Requirement already satisfied: tensorflow>=2.4.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikeras) (2.4.0)\n", - "Requirement already satisfied: scikit-learn>=0.22.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikeras) (0.23.2)\n", - "Requirement already satisfied: scipy>=0.19.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikit-learn>=0.22.0->scikeras) (1.5.4)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikit-learn>=0.22.0->scikeras) (2.1.0)\n", - "Requirement already satisfied: joblib>=0.11 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikit-learn>=0.22.0->scikeras) (1.0.0)\n", - "Requirement already satisfied: numpy>=1.13.3 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from scikit-learn>=0.22.0->scikeras) (1.19.4)\n", - "Requirement already satisfied: h5py~=2.10.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (2.10.0)\n", - "Requirement already satisfied: protobuf>=3.9.2 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (3.14.0)\n", - "Requirement already satisfied: six~=1.15.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.15.0)\n", - "Requirement already satisfied: keras-preprocessing~=1.1.2 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.1.2)\n", - "Requirement already satisfied: google-pasta~=0.2 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (0.2.0)\n", - "Requirement already satisfied: opt-einsum~=3.3.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (3.3.0)\n", - "Requirement already satisfied: absl-py~=0.10 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (0.11.0)\n", - "Requirement already satisfied: wrapt~=1.12.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.12.1)\n", - "Requirement already satisfied: tensorboard~=2.4 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (2.4.0)\n", - "Requirement already satisfied: flatbuffers~=1.12.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.12)\n", - "Requirement already satisfied: termcolor~=1.1.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from 
tensorflow>=2.4.0->scikeras) (1.1.0)\n", - "Requirement already satisfied: astunparse~=1.6.3 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.6.3)\n", - "Requirement already satisfied: wheel~=0.35 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (0.36.2)\n", - "Requirement already satisfied: gast==0.3.3 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (0.3.3)\n", - "Requirement already satisfied: grpcio~=1.32.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (1.32.0)\n", - "Requirement already satisfied: typing-extensions~=3.7.4 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (3.7.4.3)\n", - "Requirement already satisfied: tensorflow-estimator<2.5.0,>=2.4.0rc0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorflow>=2.4.0->scikeras) (2.4.0)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.0.1)\n", - "Requirement already satisfied: requests<3,>=2.21.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (2.25.1)\n", - "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.7.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (3.3.3)\n", - "Requirement already satisfied: google-auth<2,>=1.6.3 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.24.0)\n", - "Requirement already satisfied: setuptools>=41.0.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (51.0.0)\n", - "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (0.4.2)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (4.2.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (0.2.8)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (4.6)\n", - "Requirement already satisfied: requests-oauthlib>=0.7.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) 
(1.3.0)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (0.4.8)\n", - "Requirement already satisfied: idna<3,>=2.5 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (2020.12.5)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (1.26.2)\n", - "Requirement already satisfied: chardet<5,>=3.0.2 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (4.0.0)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /Users/adrian.badaracco/Documents/GitHub/scikeras/.venv/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow>=2.4.0->scikeras) (3.1.0)\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -185,7 +137,7 @@ "get_logger().setLevel('ERROR')\n", "warnings.filterwarnings(\"ignore\", message=\"Setting the random state for TF\")" ], - "execution_count": 54, + "execution_count": 138, "outputs": [] }, { @@ -198,7 +150,7 @@ "from scikeras.wrappers import KerasClassifier, KerasRegressor\n", "from tensorflow import keras" ], - "execution_count": 55, + "execution_count": 139, "outputs": [] }, { @@ -270,7 +222,7 @@ " def target_encoder(self):\n", " return RegressorTargetEncoder()" ], - "execution_count": 56, + "execution_count": 140, "outputs": [] }, { @@ -290,7 +242,7 @@ "source": [ "from sklearn.base import BaseEstimator, TransformerMixin" ], - "execution_count": 57, + "execution_count": 141, "outputs": [] }, { @@ -311,7 +263,7 @@ " def target_encoder(self):\n", " return MultiOutputTransformer(...)" ], - "execution_count": 58, + "execution_count": 142, "outputs": [] }, { @@ -360,7 +312,7 @@ " clf.fit(X, y) # Got: foobarbaz\n", " print(clf.my_param_) # foobarbaz" ], - "execution_count": 59, + "execution_count": 143, "outputs": [] }, { @@ -421,7 +373,7 @@ " )\n", " return model" ], - "execution_count": 60, + "execution_count": 144, "outputs": [] }, { @@ -457,7 +409,7 @@ "model.fit(X, y, verbose=0)\n", "y_pred = model.predict(X)" ], - "execution_count": 61, + "execution_count": 145, "outputs": [] }, { @@ -473,13 +425,13 @@ "source": [ "print(y_pred[0][:2, :])" ], - "execution_count": 62, + "execution_count": 146, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "[[0.396703 ]\n [0.36225754]]\n" + "[[0.4953647]\n [0.5277547]]\n" ] } ] @@ -497,13 +449,13 @@ "source": [ "print(y_pred[1][:2, :])" ], - "execution_count": 63, + "execution_count": 147, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "[[0.23275968 0.180671 0.30023158 0.16657698 0.11976068]\n [0.20901404 0.19675821 0.29663667 0.16592933 0.13166177]]\n" + "[[0.14030135 0.24718134 0.21445115 0.18598711 0.21207905]\n [0.15261228 0.24328376 0.22087498 0.17868358 0.20454536]]\n" ] } ] @@ -598,7 +550,7 @@ " 
\"n_outputs_expected_\": self.n_outputs_expected_,\n", " }" ], - "execution_count": 64, + "execution_count": 148, "outputs": [] }, { @@ -631,13 +583,13 @@ "print(\"`y`, as will be passed to Keras:\")\n", "print([y_keras[0][:4], y_keras[1][:4]])" ], - "execution_count": 65, + "execution_count": 149, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "`y`, as will be passed to Keras:\n[array([1, 1, 1, 0]), array([3, 2, 2, 3])]\n" + "`y`, as will be passed to Keras:\n[array([1, 0, 1, 0]), array([4, 4, 3, 2])]\n" ] } ] @@ -657,7 +609,7 @@ "print(\"`y_pred`, as will be returned to sklearn:\")\n", "y_pred_sklearn[:5]" ], - "execution_count": 66, + "execution_count": 150, "outputs": [ { "output_type": "stream", @@ -670,15 +622,15 @@ "output_type": "execute_result", "data": { "text/plain": [ - "array([[0, 2],\n", - " [0, 2],\n", - " [0, 2],\n", - " [0, 2],\n", - " [0, 2]])" + "array([[0, 1],\n", + " [1, 1],\n", + " [0, 4],\n", + " [0, 4],\n", + " [0, 4]])" ] }, "metadata": {}, - "execution_count": 66 + "execution_count": 150 } ] }, @@ -695,7 +647,7 @@ "source": [ "print(f\"metadata = {tf.get_metadata()}\")" ], - "execution_count": 67, + "execution_count": 151, "outputs": [ { "output_type": "stream", @@ -727,7 +679,7 @@ " def target_encoder(self):\n", " return MultiOutputTransformer()" ], - "execution_count": 68, + "execution_count": 152, "outputs": [] }, { @@ -751,7 +703,7 @@ "X = y_sklearn\n", "X = StandardScaler().fit_transform(X)" ], - "execution_count": 69, + "execution_count": 153, "outputs": [] }, { @@ -771,7 +723,7 @@ " y_pred_bin, y_pred_cat = y_pred[:, 0], y_pred[:, 1]\n", " return np.mean([accuracy_score(y_bin, y_pred_bin), accuracy_score(y_cat, y_pred_cat)])" ], - "execution_count": 70, + "execution_count": 154, "outputs": [] }, { @@ -789,7 +741,7 @@ "\n", "np.mean(cross_val_score(clf, X, y_sklearn, scoring=scorer))" ], - "execution_count": 71, + "execution_count": 155, "outputs": [ { "output_type": "execute_result", @@ -799,7 +751,7 @@ ] }, "metadata": {}, - "execution_count": 71 + "execution_count": 155 } ] }, @@ -841,7 +793,7 @@ " def feature_encoder(self):\n", " return MultiInputTransformer(...)" ], - "execution_count": 72, + "execution_count": 156, "outputs": [] }, { @@ -885,7 +837,7 @@ "\n", " return model" ], - "execution_count": 73, + "execution_count": 157, "outputs": [] }, { @@ -915,7 +867,7 @@ "model.fit(X, y, verbose=0, epochs=100)\n", "y_pred = model.predict(X).squeeze()" ], - "execution_count": 74, + "execution_count": 158, "outputs": [] }, { @@ -933,17 +885,17 @@ "\n", "r2_score(y, y_pred)" ], - "execution_count": 75, + "execution_count": 159, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "0.9495515519877006" + "0.8902906544625109" ] }, "metadata": {}, - "execution_count": 75 + "execution_count": 159 } ] }, @@ -992,7 +944,7 @@ " func=lambda X: [X[:, 0], X[:, 1]],\n", " )" ], - "execution_count": 76, + "execution_count": 160, "outputs": [] }, { @@ -1030,17 +982,17 @@ "\n", "np.mean(cross_val_score(reg, X_sklearn, y))" ], - "execution_count": 77, + "execution_count": 161, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "0.9997162264854872" + "0.9995720499457423" ] }, "metadata": {}, - "execution_count": 77 + "execution_count": 161 } ] }, @@ -1078,7 +1030,7 @@ "(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n", "x_train.shape" ], - "execution_count": 78, + "execution_count": 162, "outputs": [ { "output_type": "execute_result", @@ -1088,7 +1040,7 @@ ] }, "metadata": {}, - 
"execution_count": 78 + "execution_count": 162 } ] }, @@ -1115,7 +1067,7 @@ "print(y_train.shape)\n", "print(np.unique(y_train))" ], - "execution_count": 79, + "execution_count": 163, "outputs": [ { "output_type": "stream", @@ -1151,7 +1103,7 @@ "x_train = MinMaxScaler().fit_transform(x_train)\n", "x_test = MinMaxScaler().fit_transform(x_test)" ], - "execution_count": 80, + "execution_count": 164, "outputs": [] }, { @@ -1167,7 +1119,7 @@ "source": [ "print(x_train.shape[1:]) # 784 = 28*28" ], - "execution_count": 81, + "execution_count": 165, "outputs": [ { "output_type": "stream", @@ -1191,7 +1143,7 @@ "source": [ "print(np.min(x_train), np.max(x_train)) # scaled 0-1" ], - "execution_count": 82, + "execution_count": 166, "outputs": [ { "output_type": "stream", @@ -1257,7 +1209,7 @@ " )\n", " return model" ], - "execution_count": 83, + "execution_count": 167, "outputs": [] }, { @@ -1283,7 +1235,7 @@ " func=lambda X: X.reshape(X.shape[0], *input_shape),\n", " )" ], - "execution_count": 84, + "execution_count": 168, "outputs": [] }, { @@ -1300,7 +1252,7 @@ " random_state=0,\n", ")" ], - "execution_count": 85, + "execution_count": 169, "outputs": [] }, { @@ -1334,42 +1286,42 @@ "source": [ "clf.fit(x_train, y_train)" ], - "execution_count": 86, + "execution_count": 170, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/15\n", - "422/422 [==============================] - 14s 33ms/step - loss: 0.7640 - val_loss: 0.0871\n", + "422/422 [==============================] - 17s 40ms/step - loss: 0.7640 - val_loss: 0.0871\n", "Epoch 2/15\n", - "422/422 [==============================] - 15s 35ms/step - loss: 0.1219 - val_loss: 0.0600\n", + "422/422 [==============================] - 18s 42ms/step - loss: 0.1219 - val_loss: 0.0600\n", "Epoch 3/15\n", - "422/422 [==============================] - 17s 40ms/step - loss: 0.0863 - val_loss: 0.0454\n", + "422/422 [==============================] - 17s 41ms/step - loss: 0.0863 - val_loss: 0.0454\n", "Epoch 4/15\n", - "422/422 [==============================] - 18s 42ms/step - loss: 0.0755 - val_loss: 0.0435\n", + "422/422 [==============================] - 20s 47ms/step - loss: 0.0755 - val_loss: 0.0435\n", "Epoch 5/15\n", - "422/422 [==============================] - 17s 39ms/step - loss: 0.0645 - val_loss: 0.0438\n", + "422/422 [==============================] - 19s 46ms/step - loss: 0.0645 - val_loss: 0.0438\n", "Epoch 6/15\n", "422/422 [==============================] - 18s 42ms/step - loss: 0.0553 - val_loss: 0.0361\n", "Epoch 7/15\n", - "422/422 [==============================] - 17s 41ms/step - loss: 0.0508 - val_loss: 0.0372\n", + "422/422 [==============================] - 18s 42ms/step - loss: 0.0508 - val_loss: 0.0372\n", "Epoch 8/15\n", - "422/422 [==============================] - 18s 44ms/step - loss: 0.0512 - val_loss: 0.0342\n", + "422/422 [==============================] - 18s 42ms/step - loss: 0.0512 - val_loss: 0.0342\n", "Epoch 9/15\n", - "422/422 [==============================] - 18s 41ms/step - loss: 0.0450 - val_loss: 0.0307\n", + "422/422 [==============================] - 18s 42ms/step - loss: 0.0450 - val_loss: 0.0307\n", "Epoch 10/15\n", - "422/422 [==============================] - 18s 43ms/step - loss: 0.0422 - val_loss: 0.0308\n", + "422/422 [==============================] - 17s 41ms/step - loss: 0.0422 - val_loss: 0.0308\n", "Epoch 11/15\n", - "422/422 [==============================] - 17s 41ms/step - loss: 0.0378 - val_loss: 0.0321\n", + "422/422 [==============================] - 17s 40ms/step - 
loss: 0.0378 - val_loss: 0.0321\n", "Epoch 12/15\n", - "422/422 [==============================] - 17s 41ms/step - loss: 0.0376 - val_loss: 0.0323\n", + "422/422 [==============================] - 17s 40ms/step - loss: 0.0376 - val_loss: 0.0323\n", "Epoch 13/15\n", - "422/422 [==============================] - 17s 39ms/step - loss: 0.0343 - val_loss: 0.0299\n", + "422/422 [==============================] - 18s 43ms/step - loss: 0.0343 - val_loss: 0.0299\n", "Epoch 14/15\n", - "422/422 [==============================] - 18s 42ms/step - loss: 0.0358 - val_loss: 0.0304\n", + "422/422 [==============================] - 19s 44ms/step - loss: 0.0358 - val_loss: 0.0304\n", "Epoch 15/15\n", - "422/422 [==============================] - 17s 41ms/step - loss: 0.0322 - val_loss: 0.0294\n" + "422/422 [==============================] - 18s 43ms/step - loss: 0.0322 - val_loss: 0.0294\n" ] }, { @@ -1377,7 +1329,7 @@ "data": { "text/plain": [ "MultiDimensionalClassifier(\n", - "\tmodel=\n", + "\tmodel=\n", "\tbuild_fn=None\n", "\twarm_start=False\n", "\trandom_state=0\n", @@ -1396,7 +1348,7 @@ ] }, "metadata": {}, - "execution_count": 86 + "execution_count": 170 } ] }, @@ -1414,13 +1366,13 @@ "score = clf.score(x_test, y_test)\n", "print(f\"Test score (accuracy): {score:.2f}\")" ], - "execution_count": 87, + "execution_count": 171, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "79/79 [==============================] - 1s 12ms/step\n", + "79/79 [==============================] - 1s 14ms/step\n", "Test score (accuracy): 0.99\n" ] } @@ -1449,7 +1401,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 172, "metadata": {}, "outputs": [ { @@ -1475,7 +1427,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 173, "metadata": {}, "outputs": [], "source": [ @@ -1499,7 +1451,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 174, "metadata": {}, "outputs": [], "source": [ @@ -1531,7 +1483,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 175, "metadata": {}, "outputs": [ { @@ -1544,7 +1496,7 @@ ] }, "metadata": {}, - "execution_count": 91 + "execution_count": 175 } ], "source": [ @@ -1554,7 +1506,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 176, "metadata": {}, "outputs": [ { @@ -1567,7 +1519,7 @@ ] }, "metadata": {}, - "execution_count": 92 + "execution_count": 176 } ], "source": [ @@ -1585,7 +1537,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 177, "metadata": {}, "outputs": [], "source": [ @@ -1608,7 +1560,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 178, "metadata": {}, "outputs": [], "source": [ @@ -1647,7 +1599,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 179, "metadata": {}, "outputs": [], "source": [ @@ -1663,15 +1615,15 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 180, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "5/5 [==============================] - 0s 1ms/step - loss: 0.6143\n", - "5/5 [==============================] - 0s 1ms/step\n" + "5/5 [==============================] - 0s 980us/step - loss: 0.6251\n", + "5/5 [==============================] - 0s 968us/step\n" ] }, { @@ -1682,7 +1634,7 @@ ] }, "metadata": {}, - "execution_count": 96 + "execution_count": 180 } ], "source": [ @@ -1700,7 +1652,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 181, 
"metadata": {}, "outputs": [], "source": [ @@ -1722,26 +1674,26 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 182, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "5/5 [==============================] - 0s 2ms/step - loss: 0.7775\n", - "5/5 [==============================] - 0s 1ms/step\n" + "5/5 [==============================] - 0s 1ms/step - loss: 0.6066\n", + "5/5 [==============================] - 0s 939us/step\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ - "array(['class2', 'class2', 'class2', 'class2', 'class1'], dtype=' Date: Thu, 21 Jan 2021 18:18:56 -0600 Subject: [PATCH 04/29] Update docstring --- scikeras/wrappers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scikeras/wrappers.py b/scikeras/wrappers.py index 33b461db7..90d389606 100644 --- a/scikeras/wrappers.py +++ b/scikeras/wrappers.py @@ -720,7 +720,8 @@ def dataset_transformer(self): transformer, to a `tf.data.Dataset` instance containing X, y and optionally sample_weights. The second element corresponds to `y`, and may be None - on the output side. + on the output side always and on the input side when + called from `predict`. The third element is `sample_weights` which may be None on the input and output sides. @@ -1381,7 +1382,8 @@ def dataset_transformer(self): transformer, to a `tf.data.Dataset` instance containing X, y and optionally sample_weights. The second element corresponds to `y`, and may be None - on the output side. + on the output side always and on the input side when + called from `predict`. The third element is `sample_weights` which may be None on the input and output sides. From 5b8e1330798d9d110e1c792ea59f1bf9f158fc79 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 22 Jan 2021 11:15:18 -0600 Subject: [PATCH 05/29] typo --- notebooks/DataTransformers.ipynb | 48 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/notebooks/DataTransformers.ipynb b/notebooks/DataTransformers.ipynb index 1d042696f..cc43811fa 100644 --- a/notebooks/DataTransformers.ipynb +++ b/notebooks/DataTransformers.ipynb @@ -79,22 +79,22 @@ "id": "ekJWKPFMvoxR" }, "source": [ - "* [Data transformer interface](#Data-transformer-interface)\n", - " * [get_metadata method](#get_metadata-method)\n", - "* [1. Multiple outputs](#1-1-multiple-outputs)\n", - " * [1.1 Define Keras Model](#1-1-define-keras-model)\n", - " * [1.2 Define output transformer](1-2-define-output-transformer)\n", - " * [1.3 Test classifier](#1-3-test-classifier)\n", - "* [2. Multiple inputs](#2-multiple-inputs)\n", - " * [2.1 Define Keras Model](#2-1-define-keras-model)\n", - " * [2.2 Define input transformer](#2-2-define-input-transformer)\n", - " * [2.3 Test regressor](#2-3-test-regressor)\n", - "* [3. Multidimensional inputs with MNIST dataset](#3-multidimensional-inputs-with-MNIST-dataset)\n", - " * [3.1 Define Keras Model](#3-1-define-keras-model)\n", - " * [3.2 Define transformer](#3-2-define-transformer)\n", - " * [3.3 Test classifier](#3-3-test-classifier)\n", - "* [4. Ragged datasets with tf.data.Dataset](#4-ragged-datasets)\n", - "* [5. Multi-output class_weight](#4-multi-output-class-weight)" + "* [Data transformer interface](#0)\n", + " * [get_metadata method](#0-1)\n", + "* [1. Multiple outputs](#1)\n", + " * [1.1 Define Keras Model](#1-1)\n", + " * [1.2 Define output transformer](1-2)\n", + " * [1.3 Test classifier](#1-3)\n", + "* [2. 
Multiple inputs](#2)\n", + " * [2.1 Define Keras Model](#2-1)\n", + " * [2.2 Define input transformer](#2-2)\n", + " * [2.3 Test regressor](#2-3)\n", + "* [3. Multidimensional inputs with MNIST dataset](#3)\n", + " * [3.1 Define Keras Model](#3-1)\n", + " * [3.2 Define transformer](#3-2)\n", + " * [3.3 Test classifier](#3-3-)\n", + "* [4. Ragged datasets with tf.data.Dataset](#4)\n", + "* [5. Multi-output class_weight](#5)" ] }, { @@ -159,7 +159,7 @@ "id": "hCuOBH8AvoxX" }, "source": [ - "## Data transformer interface" + "## Data transformer interface" ] }, { @@ -272,7 +272,7 @@ "id": "8pwBaT2Qi1U2" }, "source": [ - "### get_metadata method" + "### get_metadata method" ] }, { @@ -321,7 +321,7 @@ "id": "lNz5uY-v-1TQ" }, "source": [ - "## 1. Multiple Outputs" + "## 1. Multiple Outputs" ] }, { @@ -343,7 +343,7 @@ "id": "QTwqF_0UL9qA" }, "source": [ - "### 1.1 Define Keras Model" + "### 1.1 Define Keras Model" ] }, { @@ -480,7 +480,7 @@ "id": "2hN9nZiqMNJ9" }, "source": [ - "### 1.2 Define output Data Transformer" + "### 1.2 Define output Data Transformer" ] }, { @@ -688,7 +688,7 @@ "id": "1heA-eeTMp3t" }, "source": [ - "### 1.3 Test classifier" + "### 1.3 Test classifier" ] }, { @@ -761,7 +761,7 @@ "id": "Pznw-f0v-1TU" }, "source": [ - "## 2. Multiple inputs" + "## 2. Multiple inputs" ] }, { @@ -802,7 +802,7 @@ "id": "sp251zciLXAY" }, "source": [ - "### 2.1 Define Keras Model" + "### 2.1 Define Keras Model" ] }, { From 6ee6425d4266b91e9d8d4bf1e2a58fa6520609d7 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 11:46:04 -0600 Subject: [PATCH 06/29] Test pipeline, move notebook to markdown --- docs/source/notebooks/DataTransformers.md | 307 ++++ notebooks/DataTransformers.ipynb | 1939 --------------------- scikeras/utils/transformers.py | 54 +- scikeras/wrappers.py | 3 +- tests/test_api.py | 40 +- 5 files changed, 391 insertions(+), 1952 deletions(-) delete mode 100644 notebooks/DataTransformers.ipynb diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index a8b026f62..2ece0fc20 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -43,6 +43,8 @@ In this notebook, we explore how to reconcile this functionality with the sklear * [5. Multidimensional inputs with MNIST dataset](#5.-Multidimensional-inputs-with-MNIST-dataset) * [5.1 Define Keras Model](#5.1-Define-Keras-Model) * [5.2 Test](#5.2-Test) +* [6. Ragged datasets with tf.data.Dataset](#6.-Ragged-datasets-with-tf.data.Dataset) +* [7. Multi-output class_weight](#7.-Multi-output-class_weight) ## 1. Setup @@ -469,6 +471,10 @@ x_train = x_train.reshape((n_samples_train, -1)) x_test = x_test.reshape((n_samples_test, -1)) x_train = MinMaxScaler().fit_transform(x_train) x_test = MinMaxScaler().fit_transform(x_test) + +# reduce dataset size for faster training +n_samples = 1000 +x_train, y_train, x_test, y_test = x_train[:n_samples], y_train[:n_samples], x_test[:n_samples], y_test[:n_samples] ``` ```python @@ -543,3 +549,304 @@ clf.fit(x_train, y_train) score = clf.score(x_test, y_test) print(f"Test score (accuracy): {score:.2f}") ``` + +## 6. Ragged datasets with tf.data.Dataset + +SciKeras provides a third dependency injection point that operats on the entire dataset: X, y & sample_weight. This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. 
One use case for this dependency injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. We can use this to create a `tf.data.Dataset` of ragged tensors. + +Note that `dataset_transformer` should accept a single **3-element tuple** as its argument and return value: + +```python +help(KerasClassifier.dataset_transformer) +``` + +The use of a 3-element tuple allows you to chain transformers with this same interface using a Scikit-Learn Pipeline, as you will see below. + + +When you return a tuple like `(tf.data.Dataset(...), None, None)`, SciKeras will pass the data untouched to `Model.fit` like `Model.fit(x=tf.data.Dataset(...), y=None, sample_weight=None)`. You can process these arguments in any way you like; as long as Keras accepts them, SciKeras will not complain. + + +Let's start by defining our data. We'll have an extra "feature" that marks the observation index, but we'll remove it when we deconstruct our data in the transformer. + +```python +feature_1 = np.random.uniform(size=(10, )) +feature_2 = np.random.uniform(size=(10, )) +obs = [0] * 3 + [1] * 2 + [2] * 1 + [3] * 2 + [4] * 2 + +X = np.column_stack([feature_1, feature_2, obs]).astype("float32") + +y = np.array(["class1"] * 5 + ["class2"] * 5, dtype=str) +``` + +Next, we define our `dataset_transformer`. We will do this by defining a custom forward transformation outside of the Keras model. Note that we do not define an inverse transformation since that is never used. +Also note that `dataset_transformer` will _always_ be called with `X` (i.e. the first element of the tuple will always be populated), but will be called with `y=None` when used for `predict`. Thus, +you should check if `y` and `sample_weights` are None before doing any operations on them. + +```python +from typing import Tuple, Optional + +from sklearn.base import BaseEstimator, TransformerMixin +import tensorflow as tf + + +def ragged_transformer(data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[tf.RaggedTensor, None, None]: +    X, y, sample_weights = data +    if y is not None: +        y = y.reshape(-1, 1 if len(y.shape) == 1 else y.shape[1]) +        y = y[tf.RaggedTensor.from_value_rowids(y, X[:, -1]).row_starts().numpy()] +    if sample_weights is not None: +        sample_weights = sample_weights.reshape(-1, 1 if len(sample_weights.shape) == 1 else sample_weights.shape[1]) +        sample_weights = sample_weights[tf.RaggedTensor.from_value_rowids(sample_weights, X[:, -1]).row_starts().numpy()] +    X = tf.RaggedTensor.from_value_rowids(X[:, :-1], X[:, -1]) +    return (X, y, sample_weights) +``` + +In this case, we chose to keep `y` and `sample_weights` as numpy arrays, which will allow us to re-use the default `ClassWeightDataTransformer` later in our pipeline. + +Let's quickly test our transformer: + +```python +data = ragged_transformer((X, y, None)) +data[0] +``` + +```python +data = ragged_transformer((X, None, None)) +data[0] +``` + +Our shapes look good, and we can handle the `y=None` case. + + +Because Keras will not accept a RaggedTensor directly, we will need to wrap our entire dataset into a tensorflow `Dataset`. We can do this by adding one more transformation step: + + +Next, we can add our transformers to our model. We use an sklearn `Pipeline` (generated via `make_pipeline`) to keep ClassWeightDataTransformer operational while implementing our custom transformation.
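(For context: `ClassWeightDataTransformer` is the `dataset_transformer` that `KerasClassifier` ships with by default, and its job is to turn the wrapper's `class_weight` parameter into per-sample weights. The sketch below is only an illustration of that idea, not SciKeras' actual implementation, which lives in `scikeras.utils.transformers`; the `ToyClassWeightTransformer` name and its exact behavior are assumptions made for demonstration purposes.)

```python
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.class_weight import compute_sample_weight


class ToyClassWeightTransformer(BaseEstimator, TransformerMixin):
    """Illustrative only: derive sample_weights from class_weight."""

    def __init__(self, class_weight=None):
        self.class_weight = class_weight

    def fit(self, data):
        # Nothing to learn from the (X, y, sample_weights) tuple.
        return self

    def transform(self, data):
        X, y, sample_weights = data
        if self.class_weight is None or y is None:
            # predict() calls transform with y=None, so pass the data through untouched.
            return (X, y, sample_weights)
        # "balanced" or a {class: weight} mapping both work with compute_sample_weight.
        return (X, y, compute_sample_weight(self.class_weight, y))
```

Keeping this step in the pipeline, rather than dropping it, is what lets `class_weight` keep working even though we customized the rest of the dataset handling.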
+ +```python +def dataset_transformer(data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[tf.data.Dataset, None, None]: + return (tf.data.Dataset.from_tensor_slices(data), None, None) +``` + +```python +from sklearn.preprocessing import FunctionTransformer +from sklearn.pipeline import make_pipeline + + +class RaggedClassifier(KerasClassifier): + + @property + def dataset_transformer(self): + t1 = FunctionTransformer(ragged_transformer) + t2 = super().dataset_transformer # ClassWeightDataTransformer + t3 = FunctionTransformer(dataset_transformer) + return make_pipeline(t1, t2, t3) +``` + +Now we can define a Model. We need some way to handle/flatten our ragged arrays within our model. For this example, we use a custom mean layer, but you could use an Embedding layer, LSTM, etc. + +```python +from tensorflow import reduce_mean, reshape +from tensorflow.keras import Sequential, layers + + +class CustomMean(layers.Layer): + + def __init__(self, axis=None): + super(CustomMean, self).__init__() + self._supports_ragged_inputs = True + self.axis = axis + + def call(self, inputs, **kwargs): + input_shape = inputs.get_shape() + return reshape(reduce_mean(inputs, axis=self.axis), (1, *input_shape[1:])) + + +def get_model(meta): + inp_shape = meta["X_shape_"][1]-1 + model = Sequential([ + layers.Input(shape=(inp_shape,), ragged=True), + CustomMean(axis=0), + layers.Dense(1, activation='sigmoid') + ]) + return model +``` + +And attach our model to our classifier wrapper: + +```python +clf = RaggedClassifier(get_model, loss="bce") +``` + +Finally, let's train and predict: + +```python +clf.fit(X, y) +y_pred = clf.predict(X) +y_pred +``` + +If we define our custom layers, transformers and wrappers in their own module, we can easily create a self-contained classifier that is able to handle ragged datasets and has a clean Scikit-Learn compatible API: + +```python +class RaggedClassifier(KerasClassifier): + + @property + def dataset_transformer(self): + t1 = FunctionTransformer(ragged_transformer) + t2 = ClassWeightDataTransformer(self.class_weight) + t3 = FunctionTransformer(dataset_transformer) + return make_pipeline(t1, t2, t3) + + def _keras_build_fn(self): + inp_shape = self.X_shape_[1] - 1 + model = Sequential([ + layers.Input(shape=(inp_shape,), ragged=True), + CustomMean(axis=0), + layers.Dense(1, activation='sigmoid') + ]) + return model +``` + +```python +clf = RaggedClassifier(loss="bce") +clf.fit(X, y) +y_pred = clf.predict(X) +y_pred +``` + +## 7. Multi-output class_weight + +In this example, we will use `dataset_transformer` to support multi-output class weights. 
We will re-use our `MultiOutputTransformer` from our previous example to split the output, then we will create `sample_weights` from `class_weight` + +```python +from collections import defaultdict +from typing import Union + +from sklearn.utils.class_weight import compute_sample_weight + + +class DatasetTransformer(BaseEstimator, TransformerMixin): + + def __init__(self, output_names, class_weight=None): + self.class_weight = class_weight + self.output_names = output_names + + def fit(self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> "DatasetTransformer": + return self + + def transform(self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[np.ndarray, Union[np.ndarray, None], Union[np.ndarray, None]]: + if self.class_weight is None: + return data + class_weight = self.class_weight + if isinstance(class_weight, str): # handle "balanced" + class_weight_ = class_weight + class_weight = defaultdict(lambda: class_weight_) + X, y, sample_weights = data + assert sample_weights is None, "Cannot use class_weight & sample_weights together" + if y is not None: + # y should be a list of arrays, as split up by MultiOutputTransformer + sample_weights = dict() + for output_num, (output_name, output_data) in enumerate(zip(self.output_names, y)): + # class_weight is expected to be indexable by output_number + # see https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_sample_weight.html + # Note that it is trivial to change the expected format to match Keras' ({output_name: weights, ...}) + # see https://github.com/keras-team/keras/issues/4735#issuecomment-267473722 + cls_wt_out = class_weight[output_num] + sample_weights[output_name] = compute_sample_weight(cls_wt_out, output_data) + return X, y, sample_weights + +``` + +```python +def get_model(meta, compile_kwargs): + inp = keras.layers.Input(shape=(meta["n_features_in_"])) + x1 = keras.layers.Dense(100, activation="relu")(inp) + out_bin = keras.layers.Dense(1, activation="sigmoid")(x1) + out_cat = keras.layers.Dense(meta["n_classes_"][1], activation="softmax")(x1) + model = keras.Model(inputs=inp, outputs=[out_bin, out_cat]) + model.compile( + loss=["binary_crossentropy", "sparse_categorical_crossentropy"], + optimizer=compile_kwargs["optimizer"] + ) + return model + + +class CustomClassifier(KerasClassifier): + + @property + def target_encoder(self): + return MultiOutputTransformer() + + @property + def dataset_transformer(self): + return DatasetTransformer( + output_names=self.model_.output_names, + class_weight=self.class_weight + ) +``` + +Next, we define the data. We'll use `sklearn.datasets.make_blobs` to generate a relatively noisy dataset: + +```python +from sklearn.datasets import make_blobs + + +X, y = make_blobs(centers=3, random_state=0, cluster_std=20) +# make a binary target for "is the value of the first class?" +y_bin = y == y[0] +y = np.column_stack([y_bin, y]) +``` + +Test the model without specifying class weighting: + +```python +clf = CustomClassifier(get_model, epochs=100, verbose=0, random_state=0) +clf.fit(X, y) +y_pred = clf.predict(X) +(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True) +print(counts_bin) +(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True) +print(counts_cat) +``` + +As you can see, without `class_weight="balanced"`, our classifier only predicts mainly a single class for the first output. 
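(A quick aside on what `"balanced"` means here: scikit-learn weights each class by `n_samples / (n_classes * n_samples_in_class)`, so rarer classes receive proportionally larger sample weights. The snippet below is just a toy check of that formula; `toy_y` is made-up data and is not part of the example above.)

```python
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

# 8 samples of class 0 and 2 samples of class 1:
# weight(class 0) = 10 / (2 * 8) = 0.625, weight(class 1) = 10 / (2 * 2) = 2.5
toy_y = np.array([0] * 8 + [1] * 2)
print(compute_sample_weight("balanced", toy_y))
```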
Now with `class_weight="balanced"`: + +```python +clf = CustomClassifier(get_model, class_weight="balanced", epochs=100, verbose=0, random_state=0) +clf.fit(X, y) +y_pred = clf.predict(X) +(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True) +print(counts_bin) +(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True) +print(counts_cat) +``` + +Now, we get (mostly) balanced classes. But what if we want to specify our classes manually? You will notice that in when we defined `DatasetTransformer`, we gave it the ability to handle +a list of class weights. For demonstration purposes, we will highly bias towards the second class in each output: + +```python +clf = CustomClassifier(get_model, class_weight=[{0: 0.1, 1: 1}, {0: 0.1, 1: 1, 2: 0.1}], epochs=100, verbose=0, random_state=0) +clf.fit(X, y) +y_pred = clf.predict(X) +(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True) +print(counts_bin) +(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True) +print(counts_cat) +``` + +Or mixing the two methods, because our first output is unbalanced but our second is (presumably) balanced: + +```python +clf = CustomClassifier(get_model, class_weight=["balanced", None], epochs=100, verbose=0, random_state=0) +clf.fit(X, y) +y_pred = clf.predict(X) +(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True) +print(counts_bin) +(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True) +print(counts_cat) +``` + +```python + +``` diff --git a/notebooks/DataTransformers.ipynb b/notebooks/DataTransformers.ipynb deleted file mode 100644 index cc43811fa..000000000 --- a/notebooks/DataTransformers.ipynb +++ /dev/null @@ -1,1939 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.6 64-bit ('.venv': venv)", - "metadata": { - "interpreter": { - "hash": "aa27e2362274a734444ef07021bde9bc2912ecaf24c8326dfe4db5717933d8db" - } - } - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.6-final" - }, - "colab": { - "name": "DataTransformers.ipynb", - "provenance": [], - "collapsed_sections": [] - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "z22BE9uhvoxO" - }, - "source": [ - "# Data Transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hapoJed-voxP" - }, - "source": [ - "Keras support many types of input and output data formats, including:\n", - "\n", - "* Multiple inputs\n", - "* Multiple outputs\n", - "* Higher-dimensional tensors\n", - "* Ragged datasets (variable datapoints per observation)\n", - "* `tf.data.Dataset`\n", - "\n", - "In this notebook, we explore how to reconcile this functionality with the sklearn ecosystem via SciKeras Data Transformer interface.\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - " Run in Google Colab \n", - "\n", - "View source on GitHub
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iT-ibpi7voxQ" - }, - "source": [ - "### Table of contents" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ekJWKPFMvoxR" - }, - "source": [ - "* [Data transformer interface](#0)\n", - " * [get_metadata method](#0-1)\n", - "* [1. Multiple outputs](#1)\n", - " * [1.1 Define Keras Model](#1-1)\n", - " * [1.2 Define output transformer](1-2)\n", - " * [1.3 Test classifier](#1-3)\n", - "* [2. Multiple inputs](#2)\n", - " * [2.1 Define Keras Model](#2-1)\n", - " * [2.2 Define input transformer](#2-2)\n", - " * [2.3 Test regressor](#2-3)\n", - "* [3. Multidimensional inputs with MNIST dataset](#3)\n", - " * [3.1 Define Keras Model](#3-1)\n", - " * [3.2 Define transformer](#3-2)\n", - " * [3.3 Test classifier](#3-3-)\n", - "* [4. Ragged datasets with tf.data.Dataset](#4)\n", - "* [5. Multi-output class_weight](#5)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6avb3GBQDQyG" - }, - "source": [ - "Install SciKeras" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "qCcyTjVkvoxR" - }, - "source": [ - "!python -m pip install scikeras" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EZveNcetDQyL" - }, - "source": [ - "Silence TensorFlow warnings to keep output succint." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ekNmO_GPDQyL" - }, - "source": [ - "import warnings\n", - "from tensorflow import get_logger\n", - "get_logger().setLevel('ERROR')\n", - "warnings.filterwarnings(\"ignore\", message=\"Setting the random state for TF\")" - ], - "execution_count": 138, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Sf4j-x4DvoxV" - }, - "source": [ - "import numpy as np\n", - "from scikeras.wrappers import KerasClassifier, KerasRegressor\n", - "from tensorflow import keras" - ], - "execution_count": 139, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hCuOBH8AvoxX" - }, - "source": [ - "## Data transformer interface" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i3fAUKBUvoxY" - }, - "source": [ - "SciKeras enables advanced Keras use cases by providing an interface to convert sklearn compliant data to whatever format your Keras model requires within SciKeras, right before passing said data to the Keras model.\n", - "\n", - "This interface is implemented in the form of two sklearn transformers, one for the features (`X`) and one for the target (`y`). SciKeras loads these transformers via the `target_encoder` and `feature_encoder` methods.\n", - "\n", - "By default, SciKeras implements `target_encoder` for both KerasClassifier and KerasRegressor to facilitate common types of tasks in sklearn. The default implementations are `scikeras.utils.transformers.ClassifierLabelEncoder` and `scikeras.utils.transformers.RegressorTargetEncoder` for KerasClassifier and KerasRegressor respectively. 
Information on the types of tasks that these default transformers are able to perform can be found in the [SciKeras docs](https://scikeras.readthedocs.io/en/latest/advanced.html#data-transformers).\n", - "\n", - "Below is an outline of the inner workings of the data transfomer interfaces to help understand when they are called:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QM74xeoe-1S-" - }, - "source": [ - "if False: # avoid executing pseudocode\n", - " from scikeras.utils.transformers import (\n", - " ClassifierLabelEncoder,\n", - " RegressorTargetEncoder,\n", - " )\n", - "\n", - "\n", - " class BaseWrapper:\n", - " def fit(self, X, y):\n", - " self.target_encoder_ = self.target_encoder\n", - " self.feature_encoder_ = self.feature_encoder\n", - " y = self.target_encoder_.fit_transform(y)\n", - " X = self.feature_encoder_.fit_transform(X)\n", - " self.model_.fit(X, y)\n", - " return self\n", - " \n", - " def predict(self, X):\n", - " X = self.feature_encoder_.transform(X)\n", - " y_pred = self.model_.predict(X)\n", - " return self.target_encoder_.inverse_transform(y_pred)\n", - "\n", - " class KerasClassifier(BaseWrapper):\n", - "\n", - " @property\n", - " def target_encoder(self):\n", - " return ClassifierLabelEncoder(loss=self.loss)\n", - " \n", - " def predict_proba(self, X):\n", - " X = self.feature_encoder_.transform(X)\n", - " y_pred = self.model_.predict(X)\n", - " return self.target_encoder_.inverse_transform(y_pred, return_proba=True)\n", - "\n", - "\n", - " class KerasRegressor(BaseWrapper):\n", - "\n", - " @property\n", - " def target_encoder(self):\n", - " return RegressorTargetEncoder()" - ], - "execution_count": 140, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Yg_0PtqhwNSo" - }, - "source": [ - "To substitute your own data transformation routine, you must subclass the wrappers and override one of the encoder defining functions. You will have access to all attributes of the wrappers, and you can pass these to your transformer, like we do above with `loss`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Lb8uZq_dIRUE" - }, - "source": [ - "from sklearn.base import BaseEstimator, TransformerMixin" - ], - "execution_count": 141, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "1DeX_yH_wTIX" - }, - "source": [ - "if False: # avoid executing pseudocode\n", - "\n", - " class MultiOutputTransformer(BaseEstimator, TransformerMixin):\n", - " ...\n", - "\n", - "\n", - " class MultiOutputClassifier(KerasClassifier):\n", - "\n", - " @property\n", - " def target_encoder(self):\n", - " return MultiOutputTransformer(...)" - ], - "execution_count": 142, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8pwBaT2Qi1U2" - }, - "source": [ - "### get_metadata method" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N3FYPOwGi7t8" - }, - "source": [ - "SciKeras recognized an optional `get_metadata` on the transformers. `get_metadata` is expected to return a dicionary of with key strings and arbitrary values. 
SciKeras will set add these items to the wrappers namespace and make them available to your model building function via the `meta` keyword argument:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Nx2KNaRTi5aY" - }, - "source": [ - "if False: # avoid executing pseudocode\n", - "\n", - " class MultiOutputTransformer(BaseEstimator, TransformerMixin):\n", - " def get_metadata(self):\n", - " return {\"my_param_\": \"foobarbaz\"}\n", - "\n", - "\n", - " class MultiOutputClassifier(KerasClassifier):\n", - "\n", - " @property\n", - " def target_encoder(self):\n", - " return MultiOutputTransformer(...)\n", - "\n", - "\n", - " def get_model(meta):\n", - " print(f\"Got: {meta['my_param_']}\")\n", - "\n", - "\n", - " clf = MultiOutputClassifier(model=get_model)\n", - " clf.fit(X, y) # Got: foobarbaz\n", - " print(clf.my_param_) # foobarbaz" - ], - "execution_count": 143, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lNz5uY-v-1TQ" - }, - "source": [ - "## 1. Multiple Outputs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W5rrSfES-1TQ" - }, - "source": [ - "Keras makes it striaghtforward to define models with multiple outputs, that is a Model with multiple sets of fully-connected heads at the end of the network. This functionality is only available in the Functional Model and subclassed Model definition modes, and is not available when using Sequential.\n", - "\n", - "In practice, the main thing about Keras models with multiple outputs that you need to know as a SciKeras user is that Keras expects `X` or `y` to be a list of arrays/tensors, with one array/tensor for each input/output.\n", - "\n", - "Note that \"multiple outputs\" in Keras has a slightly different meaning than \"multiple outputs\" in sklearn. Many tasks that would be considered \"multiple output\" tasks in sklearn can be mapped to a single \"output\" in Keras with multiple units. This notebook specifically focuses on the cases that require multiple distinct Keras outputs." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QTwqF_0UL9qA" - }, - "source": [ - "### 1.1 Define Keras Model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4gfMyrIZLjJB" - }, - "source": [ - "Here we define a simple perceptron that has two outputs, corresponding to one binary classification taks and one multiclass classification task. For example, one output might be \"image has car\" (binary) and the other might be \"color of car in image\" (multiclass)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "_JgVNml3yEup" - }, - "source": [ - "def get_clf_model(meta, compile_kwargs):\n", - " inp = keras.layers.Input(shape=(meta[\"n_features_in_\"]))\n", - " x1 = keras.layers.Dense(100, activation=\"relu\")(inp)\n", - " out_bin = keras.layers.Dense(1, activation=\"sigmoid\")(x1)\n", - " out_cat = keras.layers.Dense(meta[\"n_classes_\"][1], activation=\"softmax\")(x1)\n", - " model = keras.Model(inputs=inp, outputs=[out_bin, out_cat])\n", - " model.compile(\n", - " loss=[\"binary_crossentropy\", \"sparse_categorical_crossentropy\"],\n", - " optimizer=compile_kwargs[\"optimizer\"]\n", - " )\n", - " return model" - ], - "execution_count": 144, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vWGp30MRk_PN" - }, - "source": [ - "Let's test that this model works with the kind of inputs and outputs we expect." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "azbKtjtd-1TR" - }, - "source": [ - "X = np.random.random(size=(100, 10))\n", - "y_bin = np.random.randint(0, 2, size=(100,))\n", - "y_cat = np.random.randint(0, 5, size=(100, ))\n", - "y = [y_bin, y_cat]\n", - "\n", - "# build mock meta\n", - "meta = {\n", - " \"n_features_in_\": 10,\n", - " \"n_classes_\": [2, 5] # note that we made this a list, one for each output\n", - "}\n", - "# build mock compile_kwargs\n", - "compile_kwargs = {\"optimizer\": \"sgd\"}\n", - "\n", - "model = get_clf_model(meta=meta, compile_kwargs=compile_kwargs)\n", - "\n", - "model.fit(X, y, verbose=0)\n", - "y_pred = model.predict(X)" - ], - "execution_count": 145, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "C4xiVhKjqxzI", - "outputId": "8176d691-bfc7-40dc-bc02-9f92693d69ee", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - } - }, - "source": [ - "print(y_pred[0][:2, :])" - ], - "execution_count": 146, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[[0.4953647]\n [0.5277547]]\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "GyIq1xXEqyMY", - "outputId": "0cda9bba-4361-46ce-8b00-9669ab5b072e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - } - }, - "source": [ - "print(y_pred[1][:2, :])" - ], - "execution_count": 147, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[[0.14030135 0.24718134 0.21445115 0.18598711 0.21207905]\n [0.15261228 0.24328376 0.22087498 0.17868358 0.20454536]]\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mic4ByPGBwgA" - }, - "source": [ - "As you can see, our `predict` output is also a list of arrays, except it contains probabilities instead of the class predictions.\n", - "\n", - "Our data transormer's job will be to convert from a single numpy array (which is what the sklearn ecosystem works with) to the list of arrays and then back. Additionally, for classifiers, we will want to be able to convert probabilities to class predictions.\n", - "\n", - "We will structure our data on the sklearn side by column-stacking our list\n", - "of arrays. This works well in this case since we have the same number of datapoints in each array." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2hN9nZiqMNJ9" - }, - "source": [ - "### 1.2 Define output Data Transformer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QNBuyxhcMPg8" - }, - "source": [ - "Let's go ahead and protoype this data transformer:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "y3E81vxrDGhM" - }, - "source": [ - "from typing import List\n", - "\n", - "from sklearn.base import BaseEstimator, TransformerMixin\n", - "from sklearn.preprocessing import LabelEncoder\n", - "\n", - "\n", - "class MultiOutputTransformer(BaseEstimator, TransformerMixin):\n", - "\n", - " def fit(self, y):\n", - " y_bin, y_cat = y[:, 0], y[:, 1]\n", - " # Create internal encoders to ensure labels are 0, 1, 2...\n", - " self.bin_encoder_ = LabelEncoder()\n", - " self.cat_encoder_ = LabelEncoder()\n", - " # Fit them to the input data\n", - " self.bin_encoder_.fit(y_bin)\n", - " self.cat_encoder_.fit(y_cat)\n", - " # Save the number of classes\n", - " self.n_classes_ = [\n", - " self.bin_encoder_.classes_.size,\n", - " self.cat_encoder_.classes_.size,\n", - " ]\n", - " # Save number of expected outputs in the Keras model\n", - " # SciKeras will automatically use this to do error-checking\n", - " self.n_outputs_expected_ = 2\n", - " return self\n", - "\n", - " def transform(self, y: np.ndarray) -> List[np.ndarray]:\n", - " y_bin, y_cat = y[:, 0], y[:, 1]\n", - " # Apply transformers to input array\n", - " y_bin = self.bin_encoder_.transform(y_bin)\n", - " y_cat = self.cat_encoder_.transform(y_cat)\n", - " # Split the data into a list\n", - " return [y_bin, y_cat]\n", - "\n", - " def inverse_transform(self, y: List[np.ndarray], return_proba: bool = False) -> np.ndarray:\n", - " y_pred_proba = y # rename for clarity, what Keras gives us are probs\n", - " if return_proba:\n", - " return np.column_stack(y_pred_proba, axis=1)\n", - " # Get class predictions from probabilities\n", - " y_pred_bin = (y_pred_proba[0] > 0.5).astype(int).reshape(-1, )\n", - " y_pred_cat = np.argmax(y_pred_proba[1], axis=1)\n", - " # Pass back through LabelEncoder\n", - " y_pred_bin = self.bin_encoder_.inverse_transform(y_pred_bin)\n", - " y_pred_cat = self.cat_encoder_.inverse_transform(y_pred_cat)\n", - " return np.column_stack([y_pred_bin, y_pred_cat])\n", - " \n", - " def get_metadata(self):\n", - " return {\n", - " \"n_classes_\": self.n_classes_,\n", - " \"n_outputs_expected_\": self.n_outputs_expected_,\n", - " }" - ], - "execution_count": 148, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eKM37jy-HAX9" - }, - "source": [ - "Note that in addition to the usual `transform` and `inverse_transform` methods, we implement the `get_metadata` method to return the `n_classes_` attribute.\n", - "\n", - "Lets test our transformer with the same dataset we previoulsy used to test our model:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "yPqhenjpG_qy", - "outputId": "8bbbe68b-7380-431d-bd63-5d23ad7f8b64", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - } - }, - "source": [ - "tf = MultiOutputTransformer()\n", - "\n", - "y_sklearn = np.column_stack(y)\n", - "\n", - "y_keras = tf.fit_transform(y_sklearn)\n", - "print(\"`y`, as will be passed to Keras:\")\n", - "print([y_keras[0][:4], y_keras[1][:4]])" - ], - "execution_count": 149, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "`y`, as will be passed to Keras:\n[array([1, 0, 1, 0]), array([4, 4, 3, 2])]\n" - ] - } - ] - }, - { - 
"cell_type": "code", - "metadata": { - "id": "d_a6JqcDKkTg", - "outputId": "a9d4f3e8-3459-46dc-a8fe-f8ff138389cf", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 119 - } - }, - "source": [ - "y_pred_sklearn = tf.inverse_transform(y_pred)\n", - "print(\"`y_pred`, as will be returned to sklearn:\")\n", - "y_pred_sklearn[:5]" - ], - "execution_count": 150, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "`y_pred`, as will be returned to sklearn:\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([[0, 1],\n", - " [1, 1],\n", - " [0, 4],\n", - " [0, 4],\n", - " [0, 4]])" - ] - }, - "metadata": {}, - "execution_count": 150 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9v0RG5ZjPgcy", - "outputId": "7f9a05c9-303b-446e-bca9-42c4f6391e06", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(f\"metadata = {tf.get_metadata()}\")" - ], - "execution_count": 151, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "metadata = {'n_classes_': [2, 5], 'n_outputs_expected_': 2}\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ScbMdm-9LSx1" - }, - "source": [ - "Since this looks good, we move on to integrating our transformer into our classifier." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "RKA2U8ANMhn9" - }, - "source": [ - "class MultiOutputClassifier(KerasClassifier):\n", - "\n", - " @property\n", - " def target_encoder(self):\n", - " return MultiOutputTransformer()" - ], - "execution_count": 152, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1heA-eeTMp3t" - }, - "source": [ - "### 1.3 Test classifier" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jXbxL2mnNKkb" - }, - "source": [ - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "# First we build an artifical dataset where the features are highly correlated with the labels\n", - "X = y_sklearn\n", - "X = StandardScaler().fit_transform(X)" - ], - "execution_count": 153, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "MU14rp50Qs-C" - }, - "source": [ - "from sklearn.model_selection import cross_val_score\n", - "from sklearn.metrics import accuracy_score\n", - "\n", - "# We need a custom scorer for this dataset\n", - "# See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html\n", - "def scorer(estimator, X, y):\n", - " y_pred = estimator.predict(X)\n", - " y_bin, y_cat = y[:, 0], y[:, 1]\n", - " y_pred_bin, y_pred_cat = y_pred[:, 0], y_pred[:, 1]\n", - " return np.mean([accuracy_score(y_bin, y_pred_bin), accuracy_score(y_cat, y_pred_cat)])" - ], - "execution_count": 154, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "G7THvXqaMrSw", - "outputId": "d6ecbd6a-c34e-4296-e3e9-fd8869dc5e30", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "clf = MultiOutputClassifier(model=get_clf_model, verbose=0, random_state=0, epochs=100,)\n", - "\n", - "np.mean(cross_val_score(clf, X, y_sklearn, scoring=scorer))" - ], - "execution_count": 155, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0.985" - ] - }, - "metadata": {}, - "execution_count": 155 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Pznw-f0v-1TU" - }, - "source": [ - "## 2. 
Multiple inputs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p0uSTuc-voxu" - }, - "source": [ - "The process for multiple inputs is similar, but instead of overriding the transformer in `target_encoder` we override `feature_encoder`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NPOO7tO6-1TV" - }, - "source": [ - "if False: # avoid running pseudocode\n", - " from sklearn.base import BaseEstimator, TransformerMixin\n", - "\n", - "\n", - " class MultiInptuTransformer(BaseEstimator, TransformerMixin):\n", - " ...\n", - "\n", - "\n", - " class MultiInputClassifier(KerasClassifier):\n", - "\n", - " @property\n", - " def feature_encoder(self):\n", - " return MultiInputTransformer(...)" - ], - "execution_count": 156, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sp251zciLXAY" - }, - "source": [ - "### 2.1 Define Keras Model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1RHjV2Bg902U" - }, - "source": [ - "Let's define a Keras **regression** Model with 2 inputs:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "q84Po5aj928v" - }, - "source": [ - "def get_reg_model(compile_kwargs):\n", - "\n", - " inp1 = keras.layers.Input(shape=(1, ))\n", - " inp2 = keras.layers.Input(shape=(1, ))\n", - "\n", - " x1 = keras.layers.Dense(100, activation=\"relu\")(inp1)\n", - " x2 = keras.layers.Dense(50, activation=\"relu\")(inp2)\n", - "\n", - " concat = keras.layers.Concatenate(axis=-1)([x1, x2])\n", - "\n", - " out = keras.layers.Dense(1)(concat)\n", - "\n", - " model = keras.Model(inputs=[inp1, inp2], outputs=out)\n", - " model.compile(loss=\"mse\", optimizer=compile_kwargs[\"optimizer\"])\n", - "\n", - " return model" - ], - "execution_count": 157, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SXsenn70Alvr" - }, - "source": [ - "And test it with a small mock dataset:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "YBAzKT9bAn91" - }, - "source": [ - "X = np.random.random(size=(100, 2))\n", - "y = np.sum(X, axis=1)\n", - "X = np.split(X, 2, axis=1)\n", - "\n", - "# build mock compile_kwargs\n", - "compile_kwargs = {\"optimizer\": \"sgd\"}\n", - "\n", - "model = get_reg_model(compile_kwargs=compile_kwargs)\n", - "\n", - "model.fit(X, y, verbose=0, epochs=100)\n", - "y_pred = model.predict(X).squeeze()" - ], - "execution_count": 158, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "EIKruR9UBzu8", - "outputId": "8c345c10-3a98-4576-b0fc-38c957c1d4ce", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "from sklearn.metrics import r2_score\n", - "\n", - "r2_score(y, y_pred)" - ], - "execution_count": 159, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0.8902906544625109" - ] - }, - "metadata": {}, - "execution_count": 159 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nLiQPj62CIIO" - }, - "source": [ - "Having verified that our model builds without errors and accepts the inputs types we expect, we move onto integrating a transformer into our SciKeras model." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6lRbJEbVLqDw" - }, - "source": [ - "### 2.2 Define Data Transformer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zPAOsyJ59Ngj" - }, - "source": [ - "Just like for overriding `target_encoder`, we just need to define a sklearn transformer and drop it into our SciKeras wrapper. 
Since we hardcoded the input\n", - "shapes into our model and do not rely on any transformer-generated metadata, we can simply use `sklearn.preprocessing.FunctionTransformer`:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "zbKiSm_B9VKG" - }, - "source": [ - "from sklearn.preprocessing import FunctionTransformer\n", - "\n", - "\n", - "class MultiInputRegressor(KerasRegressor):\n", - "\n", - " @property\n", - " def feature_encoder(self):\n", - " return FunctionTransformer(\n", - " func=lambda X: [X[:, 0], X[:, 1]],\n", - " )" - ], - "execution_count": 160, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nLvhTvaTCwIZ" - }, - "source": [ - "Note that we did **not** implement `inverse_transform` (that is, we did not pass an `inverse_func` argument to `FunctionTransformer`) because features are never converted back to their original form." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ngMljBaIL_-3" - }, - "source": [ - "### 2.3 Test regressor" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IRyRFjLTEqzE", - "outputId": "7e350740-941a-4571-9c8f-9a5364724b21", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "reg = MultiInputRegressor(model=get_reg_model, epochs=100, verbose=0, random_state=0)\n", - "\n", - "X_sklearn = np.column_stack(X)\n", - "\n", - "np.mean(cross_val_score(reg, X_sklearn, y))" - ], - "execution_count": 161, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0.9995720499457423" - ] - }, - "metadata": {}, - "execution_count": 161 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4T753k83IhmW" - }, - "source": [ - "## 3. Multidimensional inputs with MNIST dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lKNj5QO76UxW" - }, - "source": [ - "In this example, we look at how we can use SciKeras to process the MNIST dataset. The dataset is composed of 60,000 images of digits, each of which is a 2D 28x28 image.\n", - "\n", - "The dataset and Keras Model architecture used come from a [Keras example](https://keras.io/examples/vision/mnist_convnet/). It may be beneficial to understand the Keras model by reviewing that example first." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NdDB8dWx8Akc", - "outputId": "c2253bc4-09d4-4c69-80da-a3c10e853782", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n", - "x_train.shape" - ], - "execution_count": 162, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(60000, 28, 28)" - ] - }, - "metadata": {}, - "execution_count": 162 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aUcI-gys8EvE" - }, - "source": [ - "The outputs (labels) are numbers 0-9:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "gqdzgCv18SDw", - "outputId": "19d6a7de-9fea-4345-d332-f25ebe8a212f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - } - }, - "source": [ - "print(y_train.shape)\n", - "print(np.unique(y_train))" - ], - "execution_count": 163, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "(60000,)\n[0 1 2 3 4 5 6 7 8 9]\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DYiQTQH29iMB" - }, - "source": [ - "First, we will \"flatten\" the data into an array of shape `(n_samples, 28*28)` (i.e. 
a 2D array). This will allow us to use sklearn ecosystem utilities, for example, `sklearn.preprocessing.MinMaxScaler`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3TCV8Uem90rt" - }, - "source": [ - "from sklearn.preprocessing import MinMaxScaler\n", - "\n", - "n_samples_train = x_train.shape[0]\n", - "n_samples_test = x_test.shape[0]\n", - "\n", - "x_train = x_train.reshape((n_samples_train, -1))\n", - "x_test = x_test.reshape((n_samples_test, -1))\n", - "x_train = MinMaxScaler().fit_transform(x_train)\n", - "x_test = MinMaxScaler().fit_transform(x_test)" - ], - "execution_count": 164, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "bvEra5TKA2yi", - "outputId": "32238303-9ca3-406e-95ca-2c59ac8dfc82", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(x_train.shape[1:]) # 784 = 28*28" - ], - "execution_count": 165, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "(784,)\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "f5a02q6tBB3R", - "outputId": "810df38e-a0e6-45bd-ed42-0d7af3a3cbf2", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(np.min(x_train), np.max(x_train)) # scaled 0-1" - ], - "execution_count": 166, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0.0 1.0\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aCoUz1qy-rh9" - }, - "source": [ - "Of course, in this case, we could have just as easily used numpy functions to scale our data, but we use `MinMaxScaler` to demonstrate use of the sklearn ecosystem." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yuR10hymK0dh" - }, - "source": [ - "### 3.1 Define Keras Model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rE15zkS4_hGU" - }, - "source": [ - "Next we will define our Keras model (adapted from [keras.io](https://keras.io/examples/vision/mnist_convnet/)):" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "dBFFXT-__7KU" - }, - "source": [ - "num_classes = 10\n", - "input_shape = (28, 28, 1)\n", - "\n", - "\n", - "def get_model(meta):\n", - " model = keras.Sequential(\n", - " [\n", - " keras.Input(input_shape),\n", - " keras.layers.Conv2D(32, kernel_size=(3, 3), activation=\"relu\"),\n", - " keras.layers.MaxPooling2D(pool_size=(2, 2)),\n", - " keras.layers.Conv2D(64, kernel_size=(3, 3), activation=\"relu\"),\n", - " keras.layers.MaxPooling2D(pool_size=(2, 2)),\n", - " keras.layers.Flatten(),\n", - " keras.layers.Dropout(0.5),\n", - " keras.layers.Dense(num_classes, activation=\"softmax\"),\n", - " ]\n", - " )\n", - " model.compile(\n", - " loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\"\n", - " )\n", - " return model" - ], - "execution_count": 167, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8aJRGbF1Avpq" - }, - "source": [ - "Now let's define a transformer that we will use to reshape our input from the sklearn shape (`(n_samples, 784)`) to the Keras shape (which we will be `(n_samples, 28, 28, 1)`)." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "CzVwr7glB1tq" - }, - "source": [ - "class MultiDimensionalClassifier(KerasClassifier):\n", - "\n", - " @property\n", - " def feature_encoder(self):\n", - " return FunctionTransformer(\n", - " func=lambda X: X.reshape(X.shape[0], *input_shape),\n", - " )" - ], - "execution_count": 168, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "n1CSzViYDFeK" - }, - "source": [ - "clf = MultiDimensionalClassifier(\n", - " model=get_model,\n", - " epochs=15,\n", - " batch_size=128,\n", - " validation_split=0.1,\n", - " random_state=0,\n", - ")" - ], - "execution_count": 169, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FP_Z5NIuL3bB" - }, - "source": [ - "### 3.2 Test" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RjmTibFlEr6-" - }, - "source": [ - "Train and score the model (this takes some time)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NaRVWcnZDRlD", - "outputId": "ed105010-c823-438f-a417-6e0fbf7148c4", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 799 - } - }, - "source": [ - "clf.fit(x_train, y_train)" - ], - "execution_count": 170, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Epoch 1/15\n", - "422/422 [==============================] - 17s 40ms/step - loss: 0.7640 - val_loss: 0.0871\n", - "Epoch 2/15\n", - "422/422 [==============================] - 18s 42ms/step - loss: 0.1219 - val_loss: 0.0600\n", - "Epoch 3/15\n", - "422/422 [==============================] - 17s 41ms/step - loss: 0.0863 - val_loss: 0.0454\n", - "Epoch 4/15\n", - "422/422 [==============================] - 20s 47ms/step - loss: 0.0755 - val_loss: 0.0435\n", - "Epoch 5/15\n", - "422/422 [==============================] - 19s 46ms/step - loss: 0.0645 - val_loss: 0.0438\n", - "Epoch 6/15\n", - "422/422 [==============================] - 18s 42ms/step - loss: 0.0553 - val_loss: 0.0361\n", - "Epoch 7/15\n", - "422/422 [==============================] - 18s 42ms/step - loss: 0.0508 - val_loss: 0.0372\n", - "Epoch 8/15\n", - "422/422 [==============================] - 18s 42ms/step - loss: 0.0512 - val_loss: 0.0342\n", - "Epoch 9/15\n", - "422/422 [==============================] - 18s 42ms/step - loss: 0.0450 - val_loss: 0.0307\n", - "Epoch 10/15\n", - "422/422 [==============================] - 17s 41ms/step - loss: 0.0422 - val_loss: 0.0308\n", - "Epoch 11/15\n", - "422/422 [==============================] - 17s 40ms/step - loss: 0.0378 - val_loss: 0.0321\n", - "Epoch 12/15\n", - "422/422 [==============================] - 17s 40ms/step - loss: 0.0376 - val_loss: 0.0323\n", - "Epoch 13/15\n", - "422/422 [==============================] - 18s 43ms/step - loss: 0.0343 - val_loss: 0.0299\n", - "Epoch 14/15\n", - "422/422 [==============================] - 19s 44ms/step - loss: 0.0358 - val_loss: 0.0304\n", - "Epoch 15/15\n", - "422/422 [==============================] - 18s 43ms/step - loss: 0.0322 - val_loss: 0.0294\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "MultiDimensionalClassifier(\n", - "\tmodel=\n", - "\tbuild_fn=None\n", - "\twarm_start=False\n", - "\trandom_state=0\n", - "\toptimizer=rmsprop\n", - "\tloss=None\n", - "\tmetrics=None\n", - "\tbatch_size=128\n", - "\tverbose=1\n", - "\tcallbacks=None\n", - "\tvalidation_split=0.1\n", - "\tshuffle=True\n", - "\trun_eagerly=False\n", - "\tepochs=15\n", - "\tclass_weight=None\n", - ")" - ] - }, - "metadata": {}, - "execution_count": 
170 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Mw_nD5FdEzd-", - "outputId": "d23b3c3f-337a-4e27-d7ed-161a7c9a12da", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - } - }, - "source": [ - "score = clf.score(x_test, y_test)\n", - "print(f\"Test score (accuracy): {score:.2f}\")" - ], - "execution_count": 171, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "79/79 [==============================] - 1s 14ms/step\n", - "Test score (accuracy): 0.99\n" - ] - } - ] - }, - { - "source": [ - "## 4. Ragged datasets with tf.data.Dataset" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "source": [ - "SciKeras provides a third dependancy injection point that operats on the entire dataset: X, y & sample_weight. This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. One use case for this dependancy injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. We can use this to create a `tf.data.Dataset` of ragged tensors." - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "source": [ - "Note that `dataset_transformer` should accept a single **3 element tuple** as its argument and return value:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 172, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Help on property:\n\n Retrieve a transformer to be applied jointly to the entire\n dataset (X, y & sample_weights).\n \n By default, KerasClassifier implements ClassWeightDataTransformer,\n which embeds class_weight into sample_weight.\n \n You can override this method to provide custom transformations.\n To keep the default class_weight behavior, you can chain your\n transfromer and ClassWeightDataTransformer using a Pipeline.\n \n It MUST accept a 3 element tuple as it's single input argument\n to `fit` and `transform`. `transform` must also output\n a 3 element tuple in the same format.\n The first element corresponds to X, or as an output from the\n transformer, to a `tf.data.Dataset` instance containing\n X, y and optionally sample_weights.\n The second element corresponds to `y`, and may be None\n on the output side.\n The third element is `sample_weights` which may be None\n on the input and output sides.\n \n Note that `inverse_transform` is never used\n and is not required to be implemented.\n \n Returns\n -------\n dataset_transformer\n Transformer implementing the sklearn transformer\n interface.\n\n" - ] - } - ], - "source": [ - "help(KerasClassifier.dataset_transformer)" - ] - }, - { - "source": [ - "When you return a tuple like `(tf.data.Dataset(...), None, None)`, SciKeras will pass the data untouched to `Model.fit` like `Model.fit(x=tf.data.Dataset(...), y=None, sample_weight=None)`.\n", - "\n", - "Let's start by defining our data. We'll have an extra \"feature\" that marks the observation index, but we'll remove it when we deconstruct our data in the transformer." 
- ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 173, - "metadata": {}, - "outputs": [], - "source": [ - "feature_1 = np.random.uniform(size=(10, ))\n", - "feature_2 = np.random.uniform(size=(10, ))\n", - "obs = [0] * 3 + [1] * 2 + [2] * 1 + [3] * 2 + [4] * 2\n", - "\n", - "X = np.column_stack([feature_1, feature_2, obs]).astype(\"float32\")\n", - "\n", - "y = np.array([\"class1\"] * 5 + [\"class2\"] * 5, dtype=str)" - ] - }, - { - "source": [ - "Next, we define our `dataset_transformer`. We will do this by defining a custom forward transformation outside of the Keras model. Note that we do not define an inverse transformation since that is never used.\n", - "Also note that `dataset_transformer` will _always_ be called with `X` (i.e. the first element of the tuple will always be populated), but will be called with `y=None` when used for `predict`. Thus,\n", - "you should check if `y` and `sample_weigh` are None before doing any operations on them." - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 174, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Tuple, Optional\n", - "\n", - "from sklearn.base import BaseEstimator, TransformerMixin\n", - "from tensorflow import RaggedTensor\n", - "from tensorflow.data import Dataset\n", - "\n", - "\n", - "def dataset_transformer(data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[Dataset, None, None]:\n", - " X, y, sample_weights = data\n", - " if y is not None:\n", - " y = y.reshape(-1, 1 if len(y.shape) == 1 else y.shape[1])\n", - " y = y[RaggedTensor.from_value_rowids(y, X[:, -1]).row_starts().numpy()]\n", - " if sample_weights is not None:\n", - " sample_weights = sample_weights.reshape(-1, 1 if len(sample_weights.shape) == 1 else sample_weights.shape[1])\n", - " sample_weights = sample_weights[RaggedTensor.from_value_rowids(sample_weights, X[:, -1]).row_starts().numpy()]\n", - " X = RaggedTensor.from_value_rowids(X[:, :-1], X[:, -1])\n", - " return (Dataset.from_tensor_slices((X, y, sample_weights)), None, None)\n" - ] - }, - { - "source": [ - "Lets quickly test our transformer:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 175, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(,\n", - " None,\n", - " None)" - ] - }, - "metadata": {}, - "execution_count": 175 - } - ], - "source": [ - "data = dataset_transformer((X, y, None))\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 176, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(,\n", - " None,\n", - " None)" - ] - }, - "metadata": {}, - "execution_count": 176 - } - ], - "source": [ - "data = dataset_transformer((X, None, None))\n", - "data" - ] - }, - { - "source": [ - "Our shapes look good, and we can handle the `y=None` case.\n", - "Next, we can add our transormer to our model." - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 177, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.preprocessing import FunctionTransformer\n", - "\n", - "\n", - "class RaggedClassifier(KerasClassifier):\n", - "\n", - " @property\n", - " def dataset_transformer(self):\n", - " return FunctionTransformer(dataset_transformer)" - ] - }, - { - "source": [ - "Now we can define a Model. 
We need some way to handle/flatten our ragged arrays within our model. For this example, we use a custom mean layer, but you could use an Embedding layer, LSTM, etc." - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 178, - "metadata": {}, - "outputs": [], - "source": [ - "from tensorflow import reduce_mean, reshape\n", - "from tensorflow.keras import Sequential, layers\n", - "\n", - "\n", - "class CustomMean(layers.Layer):\n", - "\n", - " def __init__(self, axis=None):\n", - " super(CustomMean, self).__init__()\n", - " self._supports_ragged_inputs = True\n", - " self.axis = axis\n", - "\n", - " def call(self, inputs, **kwargs):\n", - " input_shape = inputs.get_shape()\n", - " return reshape(reduce_mean(inputs, axis=self.axis), (1, *input_shape[1:]))\n", - "\n", - "\n", - "def get_model(meta):\n", - " inp_shape = meta[\"X_shape_\"][1]-1\n", - " model = Sequential([ \n", - " layers.Input(shape=(inp_shape,), ragged=True),\n", - " CustomMean(axis=0),\n", - " layers.Dense(1, activation='sigmoid')\n", - " ])\n", - " return model" - ] - }, - { - "source": [ - "And attatch our model to our classifier wrapper:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 179, - "metadata": {}, - "outputs": [], - "source": [ - "clf = RaggedClassifier(get_model, loss=\"bce\")" - ] - }, - { - "source": [ - "Finally, let's train and predict:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 180, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "5/5 [==============================] - 0s 980us/step - loss: 0.6251\n", - "5/5 [==============================] - 0s 968us/step\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array(['class1', 'class1', 'class1', 'class1', 'class1'], dtype=' \"DatasetTransformer\":\n", - " return self\n", - "\n", - " def transform(self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[Dataset, None, None]:\n", - " if self.class_weight is None:\n", - " return data\n", - " class_weight = self.class_weight\n", - " if isinstance(class_weight, str): # handle \"balanced\"\n", - " class_weight_ = class_weight\n", - " class_weight = defaultdict(lambda: class_weight_)\n", - " X, y, sample_weights = data\n", - " assert sample_weights is None, \"Cannot use class_weight & sample_weights together\"\n", - " if y is not None:\n", - " # y should be a list of arrays, as split up by MultiOutputTransformer\n", - " sample_weights = dict()\n", - " for output_num, (output_name, output_data) in enumerate(zip(self.output_names, y)):\n", - " # class_weight is expected to be indexable by output_number\n", - " # see https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_sample_weight.html\n", - " # Note that it is trivial to change the expected format to match Keras' ({output_name: weights, ...})\n", - " # see https://github.com/keras-team/keras/issues/4735#issuecomment-267473722\n", - " cls_wt_out = class_weight[output_num]\n", - " sample_weights[output_name] = compute_sample_weight(cls_wt_out, output_data)\n", - " return X, y, sample_weights\n" - ] - }, - { - "cell_type": "code", - "execution_count": 184, - "metadata": {}, - "outputs": [], - "source": [ - "def get_model(meta, compile_kwargs):\n", - " inp = keras.layers.Input(shape=(meta[\"n_features_in_\"]))\n", - " x1 = keras.layers.Dense(100, 
activation=\"relu\")(inp)\n", - " out_bin = keras.layers.Dense(1, activation=\"sigmoid\")(x1)\n", - " out_cat = keras.layers.Dense(meta[\"n_classes_\"][1], activation=\"softmax\")(x1)\n", - " model = keras.Model(inputs=inp, outputs=[out_bin, out_cat])\n", - " model.compile(\n", - " loss=[\"binary_crossentropy\", \"sparse_categorical_crossentropy\"],\n", - " optimizer=compile_kwargs[\"optimizer\"]\n", - " )\n", - " return model\n", - "\n", - "\n", - "class CustomClassifier(KerasClassifier):\n", - "\n", - " @property\n", - " def target_encoder(self):\n", - " return MultiOutputTransformer()\n", - " \n", - " @property\n", - " def dataset_transformer(self):\n", - " return DatasetTransformer(\n", - " output_names=self.model_.output_names,\n", - " class_weight=self.class_weight\n", - " )" - ] - }, - { - "source": [ - "Next, we define the data. We'll use `sklearn.datasets.make_blobs` to generate a relatively noisy dataset:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 185, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import make_blobs\n", - "\n", - "\n", - "X, y = make_blobs(centers=3, random_state=0, cluster_std=20)\n", - "# make a binary target for \"is the value of the first class?\"\n", - "y_bin = y == y[0]\n", - "y = np.column_stack([y_bin, y])" - ] - }, - { - "source": [ - "Test the model without specifying class weighting:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 186, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[91 9]\n[28 30 42]\n" - ] - } - ], - "source": [ - "clf = CustomClassifier(get_model, epochs=100, verbose=0, random_state=0)\n", - "clf.fit(X, y)\n", - "y_pred = clf.predict(X)\n", - "(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True)\n", - "print(counts_bin)\n", - "(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True)\n", - "print(counts_cat)" - ] - }, - { - "source": [ - "As you can see, without `class_weight=\"balanced\"`, our classifier only predicts mainly a single class for the first output. Now with `class_weight=\"balanced\"`:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 187, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[57 43]\n[27 27 46]\n" - ] - } - ], - "source": [ - "clf = CustomClassifier(get_model, class_weight=\"balanced\", epochs=100, verbose=0, random_state=0)\n", - "clf.fit(X, y)\n", - "y_pred = clf.predict(X)\n", - "(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True)\n", - "print(counts_bin)\n", - "(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True)\n", - "print(counts_cat)" - ] - }, - { - "source": [ - "Now, we get (mostly) balanced classes. But what if we want to specify our classes manually? You will notice that in when we defined `DatasetTransformer`, we gave it the ability to handle\n", - "a list of class weights. 
For demonstration purposes, we will highly bias towards the second class in each output:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "source": [ - "clf = CustomClassifier(get_model, class_weight=[{0: 0.1, 1: 1}, {0: 0.1, 1: 1, 2: 0.1}], epochs=100, verbose=0, random_state=0)\n", - "clf.fit(X, y)\n", - "y_pred = clf.predict(X)\n", - "(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True)\n", - "print(counts_bin)\n", - "(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True)\n", - "print(counts_cat)" - ], - "cell_type": "code", - "metadata": {}, - "execution_count": 188, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[ 7 93]\n[ 2 98]\n" - ] - } - ] - }, - { - "source": [ - "Or mixing the two methods, because our first output is unbalanced but our second is (presumably) balanced:" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 189, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[57 43]\n[30 25 45]\n" - ] - } - ], - "source": [ - "clf = CustomClassifier(get_model, class_weight=[\"balanced\", None], epochs=100, verbose=0, random_state=0)\n", - "clf.fit(X, y)\n", - "y_pred = clf.predict(X)\n", - "(_, counts_bin) = np.unique(y_pred[:, 0], return_counts=True)\n", - "print(counts_bin)\n", - "(_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True)\n", - "print(counts_cat)" - ] - } - ] -} \ No newline at end of file diff --git a/scikeras/utils/transformers.py b/scikeras/utils/transformers.py index 6a1c951bf..920a1ea45 100644 --- a/scikeras/utils/transformers.py +++ b/scikeras/utils/transformers.py @@ -25,9 +25,16 @@ class TargetReshaper(BaseEstimator, TransformerMixin): Dimensions of y that the transformer was trained on. """ - def fit(self, y: np.ndarray) -> "TargetReshaper": + def fit(self, y: np.ndarray, dummy: None = None) -> "TargetReshaper": """Fit the transformer to a target y. + Parameters + ---------- + y : np.ndarray + The target data to be transformed. + dummy: None + Unused argument, kept for compatibility with sklearn Pipelines. + Returns ------- TargetReshaper @@ -124,7 +131,7 @@ def _type_of_target(self, y: np.ndarray) -> str: target_type = type_of_target(self.categories[0]) return target_type - def fit(self, y: np.ndarray) -> "ClassifierLabelEncoder": + def fit(self, y: np.ndarray, dummy: None = None) -> "ClassifierLabelEncoder": """Fit the estimator to the target y. For all targets, this transforms classes into ordinal numbers. @@ -135,6 +142,8 @@ def fit(self, y: np.ndarray) -> "ClassifierLabelEncoder": ---------- y : np.ndarray The target data to be transformed. + dummy: None + Unused argument, kept for compatibility with sklearn Pipelines. Returns ------- @@ -316,12 +325,19 @@ class RegressorTargetEncoder(BaseEstimator, TransformerMixin): Number of outputs the Keras Model is expected to have. """ - def fit(self, y: np.ndarray) -> "RegressorTargetEncoder": + def fit(self, y: np.ndarray, dummy: None = None) -> "RegressorTargetEncoder": """Fit the transformer to the target y. For RegressorTargetEncoder, this just records the dimensions of y as the expected number of outputs and saves the dtype. + Parameters + ---------- + y : np.ndarray + The target data to be transformed. + dummy: None + Unused argument, kept for compatibility with sklearn Pipelines. 
+ Returns ------- RegressorTargetEncoder @@ -393,12 +409,38 @@ def get_metadata(self): } +class DummyDataTransformer(BaseEstimator, TransformerMixin): + """A dummy transfomer implementing the data_transformer + interface. This is the default data_transformer for BaseWrapper. + """ + + def fit( + self, + data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + dummy: None = None, + ) -> "ClassWeightDataTransformer": + return self + + def transform( + self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]] + ) -> Tuple[np.ndarray, Union[np.ndarray, None], Union[np.ndarray, None]]: + return data + + class ClassWeightDataTransformer(BaseEstimator, TransformerMixin): + """Default dataset_transformer for KerasClassifier. + + This transformer implements handling of the `class_weight` parameter + for single output classifiers. + """ + def __init__(self, class_weight: Optional[Union[str, Dict[int, float]]] = None): self.class_weight = class_weight def fit( - self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]] + self, + data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + dummy: None = None, ) -> "ClassWeightDataTransformer": return self @@ -407,7 +449,7 @@ def transform( ) -> Tuple[np.ndarray, Union[np.ndarray, None], Union[np.ndarray, None]]: X, y, sample_weight = data if self.class_weight is None or y is None: - return data + return (X, y, sample_weight) sample_weight = 1 if sample_weight is None else sample_weight sample_weight *= compute_sample_weight(class_weight=self.class_weight, y=y) - return X, y, sample_weight + return (X, y, sample_weight) diff --git a/scikeras/wrappers.py b/scikeras/wrappers.py index 90d389606..82e0e8974 100644 --- a/scikeras/wrappers.py +++ b/scikeras/wrappers.py @@ -36,6 +36,7 @@ from scikeras.utils.transformers import ( ClassifierLabelEncoder, ClassWeightDataTransformer, + DummyDataTransformer, RegressorTargetEncoder, ) @@ -734,7 +735,7 @@ def dataset_transformer(self): Transformer implementing the sklearn transformer interface. """ - return FunctionTransformer() + return DummyDataTransformer() def fit(self, X, y, sample_weight=None, **kwargs) -> "BaseWrapper": """Constructs a new model with ``model`` & fit the model to ``(X, y)``. 
diff --git a/tests/test_api.py b/tests/test_api.py index e0cc80c00..7edb15b37 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -6,6 +6,7 @@ import numpy as np import pytest +import tensorflow as tf from sklearn.calibration import CalibratedClassifierCV from sklearn.datasets import load_boston, load_digits, load_iris @@ -17,9 +18,8 @@ ) from sklearn.exceptions import NotFittedError from sklearn.model_selection import GridSearchCV, RandomizedSearchCV -from sklearn.pipeline import Pipeline +from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import FunctionTransformer, StandardScaler -from tensorflow.data import Dataset from tensorflow.keras import losses as losses_module from tensorflow.keras import metrics as metrics_module from tensorflow.keras.layers import Conv2D, Dense, Flatten, Input @@ -818,18 +818,21 @@ def test_prebuilt_model(self, wrapper): class TestDatasetTransformer: def test_conversion_to_dataset(self): + """Check that the dataset_transformer + interface can return a tf Dataset + """ inp = Input((1,)) out = Dense(1, activation="sigmoid")(inp) m = Model(inp, out) m.compile(loss="bce") - def tf(X_y_s: Tuple[np.ndarray, np.ndarray, np.ndarray]): - return Dataset.from_tensor_slices(X_y_s), None, None + def dtf(X_y_s: Tuple[np.ndarray, np.ndarray, np.ndarray]): + return (tf.data.Dataset.from_tensor_slices(X_y_s), None, None) class MyWrapper(KerasClassifier): @property def dataset_transformer(self): - return FunctionTransformer(tf) + return FunctionTransformer(dtf) est = MyWrapper(m) X = np.random.random((100, 1)) @@ -837,7 +840,7 @@ def dataset_transformer(self): fit_orig = m.fit def check_fit(**kwargs): - assert isinstance(kwargs["x"], Dataset) + assert isinstance(kwargs["x"], tf.data.Dataset) assert kwargs["y"] is None return fit_orig(**kwargs) @@ -847,3 +850,28 @@ def check_fit(**kwargs): assert y_pred.dtype == y.dtype assert y_pred.shape == y.shape assert set(y_pred).issubset(set(y)) + + def test_pipeline(self): + """Check that the dataset_transformer + interface is compatible with Pipelines + """ + inp = Input((1,)) + out = Dense(1, activation="sigmoid")(inp) + m = Model(inp, out) + m.compile(loss="bce") + + def dtf(X_y_s: Tuple[np.ndarray, np.ndarray, np.ndarray]): + return (tf.data.Dataset.from_tensor_slices(X_y_s), None, None) + + class MyWrapper(KerasClassifier): + @property + def dataset_transformer(self): + t1 = super().dataset_transformer + t2 = FunctionTransformer(dtf) + return make_pipeline(t1, t2) + + est = MyWrapper(m, class_weight="balanced") + X = np.random.random((100, 1)) + y = np.array(["a", "b"] * 50, dtype=str) + + est.fit(X, y) From a3092c28e3adafdecf22166e8192c118f3c6a87e Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 11:54:32 -0600 Subject: [PATCH 07/29] fix undef transformer --- docs/source/notebooks/DataTransformers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index 2ece0fc20..cc79b9fa1 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -694,7 +694,7 @@ class RaggedClassifier(KerasClassifier): @property def dataset_transformer(self): t1 = FunctionTransformer(ragged_transformer) - t2 = ClassWeightDataTransformer(self.class_weight) + t2 = super().dataset_transformer # ClassWeightDataTransformer t3 = FunctionTransformer(dataset_transformer) return make_pipeline(t1, t2, t3) From 
8f9259179490305b485d532775ce345b17071fd2 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 11:56:18 -0600 Subject: [PATCH 08/29] remove unused dummy transformer --- scikeras/utils/transformers.py | 18 ------------------ scikeras/wrappers.py | 3 +-- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/scikeras/utils/transformers.py b/scikeras/utils/transformers.py index 920a1ea45..7f935ba35 100644 --- a/scikeras/utils/transformers.py +++ b/scikeras/utils/transformers.py @@ -409,24 +409,6 @@ def get_metadata(self): } -class DummyDataTransformer(BaseEstimator, TransformerMixin): - """A dummy transfomer implementing the data_transformer - interface. This is the default data_transformer for BaseWrapper. - """ - - def fit( - self, - data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], - dummy: None = None, - ) -> "ClassWeightDataTransformer": - return self - - def transform( - self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]] - ) -> Tuple[np.ndarray, Union[np.ndarray, None], Union[np.ndarray, None]]: - return data - - class ClassWeightDataTransformer(BaseEstimator, TransformerMixin): """Default dataset_transformer for KerasClassifier. diff --git a/scikeras/wrappers.py b/scikeras/wrappers.py index 82e0e8974..90d389606 100644 --- a/scikeras/wrappers.py +++ b/scikeras/wrappers.py @@ -36,7 +36,6 @@ from scikeras.utils.transformers import ( ClassifierLabelEncoder, ClassWeightDataTransformer, - DummyDataTransformer, RegressorTargetEncoder, ) @@ -735,7 +734,7 @@ def dataset_transformer(self): Transformer implementing the sklearn transformer interface. """ - return DummyDataTransformer() + return FunctionTransformer() def fit(self, X, y, sample_weight=None, **kwargs) -> "BaseWrapper": """Constructs a new model with ``model`` & fit the model to ``(X, y)``. 
From fa728c1bd87a8ace5c6c505e083cb50f6e59e213 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 11:58:13 -0600 Subject: [PATCH 09/29] Remove unused import --- scikeras/wrappers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scikeras/wrappers.py b/scikeras/wrappers.py index 90d389606..e798f795f 100644 --- a/scikeras/wrappers.py +++ b/scikeras/wrappers.py @@ -5,7 +5,6 @@ import warnings from collections import defaultdict -from random import sample from typing import Any, Callable, Dict, Iterable, List, Tuple, Type, Union import numpy as np From 6fdea0dbc1d16d29e64f19763b743a8647562401 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 12:53:47 -0600 Subject: [PATCH 10/29] remove empty cell --- docs/source/notebooks/DataTransformers.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index cc79b9fa1..17a6d6ab7 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -846,7 +846,3 @@ print(counts_bin) (_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True) print(counts_cat) ``` - -```python - -``` From 6675889e3c79f23cd190bee040071a21396b5c51 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 13:15:50 -0600 Subject: [PATCH 11/29] Fix typos --- docs/source/notebooks/DataTransformers.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index 17a6d6ab7..816a443e6 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -552,7 +552,7 @@ print(f"Test score (accuracy): {score:.2f}") ## 6. Ragged datasets with tf.data.Dataset -SciKeras provides a third dependency injection point that operats on the entire dataset: X, y & sample_weight. This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. One use case for this dependancy injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. We can use this to create a `tf.data.Dataset` of ragged tensors. +SciKeras provides a third dependency injection point that operates on the entire dataset: X, y & sample_weight. This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. One use case for this dependency injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. We can use this to create a `tf.data.Dataset` of ragged tensors. Note that `dataset_transformer` should accept a single **3 element tuple** as its argument and return value: @@ -562,10 +562,8 @@ help(KerasClassifier.dataset_transformer) The use of a 3 element tuple allows you to chain transformers with this same interface using a Scikit-Learn Pipeline, as you will see below. - When you return a tuple like `(tf.data.Dataset(...), None, None)`, SciKeras will pass the data untouched to `Model.fit` like `Model.fit(x=tf.data.Dataset(...), y=None, sample_weight=None)`. You can process these arguments in any way you like, as long as Keras accepts them, SciKeras will not complain. - Let's start by defining our data. 
We'll have an extra "feature" that marks the observation index, but we'll remove it when we deconstruct our data in the transformer. ```python @@ -617,10 +615,8 @@ data[0] Our shapes look good, and we can handle the `y=None` case. - Because Keras will not accept a RaggedTensor directly, we will need to wrap our entire dataset into a tensorflow `Dataset`. We can do this by adding one more transformation step: - Next, we can add our transormers to our model. We use an sklearn `Pipeline` (generated via `make_pipeline`) to keep ClassWeightDataTransformer operational while implementing our custom transformation. ```python From 5acbd0f8614f95be654bf5b86c36f31b0cd77bc7 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 13:18:07 -0600 Subject: [PATCH 12/29] add comment --- docs/source/notebooks/DataTransformers.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index 816a443e6..70dd89401 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -599,7 +599,8 @@ def ragged_transformer(data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np return (X, y, sample_weights) ``` -In this case, we chose to keep `y` and `sample_weights` as numpy arrays, which will allow us to re-use +In this case, we chose to keep `y` and `sample_weights` as numpy arrays, which will allow us to re-use ClassWeightDataTransformer, +the default `dataset_transformer` for `KerasClassifier`. Lets quickly test our transformer: From 5d9e02bbe4317c2bce8633fb7e5447478368a112 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 13:19:01 -0600 Subject: [PATCH 13/29] print all data --- docs/source/notebooks/DataTransformers.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index 70dd89401..da3ab82cc 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -606,12 +606,12 @@ Lets quickly test our transformer: ```python data = ragged_transformer((X, y, None)) -data[0] +data ``` ```python data = ragged_transformer((X, None, None)) -data[0] +data ``` Our shapes look good, and we can handle the `y=None` case. From 9b43e9c7eda895098aaacb0acd2e7fc87ec58999 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 13:55:33 -0600 Subject: [PATCH 14/29] Update data transformer docs --- docs/source/advanced.rst | 58 +++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index c771d394c..e1b2d3a3f 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -178,11 +178,50 @@ This is basically the same as calling :py:func:`~scikeras.wrappers.BaseWrapper.g Data Transformers ^^^^^^^^^^^^^^^^^ -In some cases, the input actually consists of multiple inputs. E.g., +Keras supports a much wider range of inputs/outputs than Scikit-Learn does. E.g., in a text classification task, you might have an array that contains the integers representing the tokens for each sample, and another -array containing the number of tokens of each sample. SciKeras has you -covered here as well. +array containing the number of tokens of each sample. 
+ +In order to reconcile Keras' expanded input/output support and Scikit-Learn's more +limited options, SciKeras introduces "data transformers". These are really just +dependency injection points where you can declare custom data transformations, +for example to split an array into a list of arrays, join `X` & `y` into a `Dataset`, etc. +In order to keep these transformations in a familiar format, they are implemented as +sklearn-style transformers. You can think of this setup as an sklearn Pipeline: + +.. code-block:: + + ↗ feature_encoder ↘ + your data → sklearn-ecosystem → SciKeras dataset_transformer → Keras + ↘ target_encoder ↗ + + +As you can see, there are 2 stages of data transformations within SciKeras: + +- Target/Feature transformations: + - feature_encoder: Handles transformations to the features (`X`). This can be used + to implement multi-input models. + - target_encoder: Handles transformations to the target (`y`). This can be used + to implement non-int labels (eg: strings) as well as mutli-output models. +- Whole dataset transformations: + - dataset_transformer: This is the last step before passing the data to Keras. + It can be used to implement conversion to a `Dataset`, amongst other things. + +`feature_encoder` and `target_encoder` are run before building the Keras Model, +while `data_transformer` is run after the Model is built. This means that the +former two will not have access to the Model (eg. to get the number of outputs) +but *will* be able to inject data into the model building function (more on this +below). `data_transformer` on the other hand *will* get access to the built Model, +but it cannot pass any data to model building. + +Although you could just implement everything in `dataset_transformer`, +having several distinct dependency injections points allows for more modularity, +for example to keep the default processing of string-encoded labels but convert +the data to a `Dataset` before passing to Keras. + +Multi-input and output models ++++++++++++++++++++++++++++++ Scikit-Learn natively supports multiple outputs, although it technically requires them to be arrays of equal length @@ -208,11 +247,11 @@ type, and implements basic handling of the following cases out of the box: +--------------------------+--------------+----------------+----------------+---------------+ | "binary" | [1, 0, 1] | 1 | 1 or 2 | Yes | +--------------------------+--------------+----------------+----------------+---------------+ -| "mulilabel-indicator" | [[1, 1], | 1 or >1 | 2 per target | Single output | +| "multilabel-indicator" | [[1, 1], | 1 or >1 | 2 per target | Single output | | | | | | | -| | [0, 2], | | | only | +| | [0, 1], | | | only | | | | | | | -| | [1, 1]] | | | | +| | [1, 0]] | | | | +--------------------------+--------------+----------------+----------------+---------------+ | "multiclass-multioutput" | [[1, 1], | >1 | >=2 per target | No | | | | | | | @@ -232,6 +271,13 @@ type, and implements basic handling of the following cases out of the box: If you find that your target is classified as ``"multiclass-multioutput"`` or ``"unknown"``, you will have to implement your own data processing routine. +In addition to converting data, `feature_encoder` and `target_encoder`, allows you to inject data +into your model construction method. This is useful if for example you use `target_encoder` to dynamically +determine how many outputs your model should have based on the data and then use this information to +assign the right number of outputs in your Model. 
To return data from `feature_encoder` or `target_encoder`, +you will need to provide a transformer with a `get_metadata` method, which is expected to return a dictionary +which will be injected into your model building function via the `meta` parameter. + For a complete examples implementing custom data processing, see the examples in the :ref:`tutorials` section. From deb4858523ec12dff10469f70b6bc18ac884d0ee Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 Jan 2021 14:11:47 -0600 Subject: [PATCH 15/29] Finish sentence --- docs/source/advanced.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index e1b2d3a3f..f5d5ab4fc 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -328,6 +328,8 @@ Custom Scorers SciKeras uses :func:`sklearn.metrics.accuracy_score` and :func:`sklearn.metrics.accuracy_score` as the scoring functions for :class:`scikeras.wrappers.KerasClassifier` and :class:`scikeras.wrappers.KerasRegressor` respectively. To override these scoring functions, +override :func:`scikeras.wrappers.KerasClassifier.scorer` +or :func:`scikeras.wrappers.KerasRegressor.scorer`. .. _Keras Callbacks docs: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks From 981e61c40dfe6d6a1bd1713d0eee54ae5fe61131 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 25 Jan 2021 16:13:45 -0600 Subject: [PATCH 16/29] PR feedback --- docs/source/advanced.rst | 175 +++++++++++++++---- docs/source/notebooks/DataTransformers.md | 204 +++++----------------- 2 files changed, 186 insertions(+), 193 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index f5d5ab4fc..994cb3d73 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -186,42 +186,102 @@ array containing the number of tokens of each sample. In order to reconcile Keras' expanded input/output support and Scikit-Learn's more limited options, SciKeras introduces "data transformers". These are really just dependency injection points where you can declare custom data transformations, -for example to split an array into a list of arrays, join `X` & `y` into a `Dataset`, etc. +for example to split an array into a list of arrays, join ``X`` & ``y`` into a ``Dataset``, etc. In order to keep these transformations in a familiar format, they are implemented as sklearn-style transformers. You can think of this setup as an sklearn Pipeline: .. code-block:: - ↗ feature_encoder ↘ - your data → sklearn-ecosystem → SciKeras dataset_transformer → Keras - ↘ target_encoder ↗ + ↗ feature_encoder ↘ + SciKeras.fit(features, labels) dataset_transformer → keras.Model.fit(data) + ↘ target_encoder ↗ -As you can see, there are 2 stages of data transformations within SciKeras: +Within SciKeras, this is roughly implemented as follows: -- Target/Feature transformations: - - feature_encoder: Handles transformations to the features (`X`). This can be used - to implement multi-input models. - - target_encoder: Handles transformations to the target (`y`). This can be used - to implement non-int labels (eg: strings) as well as mutli-output models. -- Whole dataset transformations: - - dataset_transformer: This is the last step before passing the data to Keras. - It can be used to implement conversion to a `Dataset`, amongst other things. +.. 
code-block:: python
+
+    from sklearn.preprocessing import FunctionTransformer
+
+    from scikeras.utils.transformers import (
+        ClassifierLabelEncoder,
+        RegressorTargetEncoder,
+        ClassWeightDataTransformer
+    )
+
+
+    class BaseWrapper:
+
+        @property
+        def target_encoder(self):
+            return ClassifierLabelEncoder(loss=self.loss)
+
+        @property
+        def feature_encoder(self):
+            return FunctionTransformer()
+
+        @property
+        def dataset_transformer(self):
+            return FunctionTransformer()
+
+        def fit(self, X, y, sample_weight):
+            self.target_encoder_ = self.target_encoder
+            self.dataset_transformer_ = self.dataset_transformer
+            self.feature_encoder_ = self.feature_encoder
+            y = self.target_encoder_.fit_transform(y)
+            X = self.feature_encoder_.fit_transform(X)
+            X, y, sample_weight = self.dataset_transformer_.fit_transform((X, y, sample_weight))
+            self.model_.fit(x=X, y=y, sample_weight=sample_weight)  # tf.keras.Model.fit
+            return self
+
+        def predict(self, X):
+            X = self.feature_encoder_.transform(X)
+            X, _, _ = self.dataset_transformer_.transform((X, None, None))
+            y_pred = self.model_.predict(X)
+            return self.target_encoder_.inverse_transform(y_pred)
+
+    class KerasClassifier(BaseWrapper):
+
+        @property
+        def target_encoder(self):
+            return ClassifierLabelEncoder(loss=self.loss)
+
+        @property
+        def dataset_transformer(self):
+            return ClassWeightDataTransformer(class_weight=self.class_weight)
+
+        def predict_proba(self, X):
+            X = self.feature_encoder_.transform(X)
+            X, _, _ = self.dataset_transformer_.transform((X, None, None))
+            y_pred = self.model_.predict(X)
+            return self.target_encoder_.inverse_transform(y_pred, return_proba=True)
+
+
+    class KerasRegressor(BaseWrapper):
+
+        @property
+        def target_encoder(self):
+            return RegressorTargetEncoder()
+
+
+One important thing to note is that ``feature_encoder`` and ``target_encoder``
+are run before building the Keras Model, while ``dataset_transformer`` is run after
+the Model is built. This means that the former two will not have access to the Model
+(e.g. to get the number of outputs) but *will* be able to inject data into the model
+building function (more on this below). ``dataset_transformer``, on the other hand,
+*will* get access to the built Model, but it cannot pass any data to the model building function.
+
+Although you could just implement everything in ``dataset_transformer``,
+having several distinct dependency injection points allows for more modularity,
+for example to keep the default processing of string-encoded labels but convert
+the data to a ``Dataset`` before passing to Keras.
+
+For complete examples implementing custom data processing, see the examples in the
+:ref:`tutorials` section. 
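+
+As a minimal sketch of how these pieces can be chained together (``DatasetClassifier`` and
+``to_dataset`` below are illustrative names, not part of SciKeras), the default
+``dataset_transformer`` of a classifier can be combined with a custom conversion to a
+``tf.data.Dataset`` using :py:func:`sklearn.pipeline.make_pipeline`:
+
+.. code-block:: python
+
+    import tensorflow as tf
+    from sklearn.pipeline import make_pipeline
+    from sklearn.preprocessing import FunctionTransformer
+
+    from scikeras.wrappers import KerasClassifier
+
+
+    def to_dataset(data):
+        # data is the (X, y, sample_weight) tuple; y and sample_weight
+        # may be None, e.g. when called from predict
+        X, y, sample_weight = data
+        if y is None:
+            dataset = tf.data.Dataset.from_tensor_slices(X)
+        elif sample_weight is None:
+            dataset = tf.data.Dataset.from_tensor_slices((X, y))
+        else:
+            dataset = tf.data.Dataset.from_tensor_slices((X, y, sample_weight))
+        # Keras expects datasets to yield batches
+        return dataset.batch(32), None, None
+
+
+    class DatasetClassifier(KerasClassifier):
+
+        @property
+        def dataset_transformer(self):
+            # keep the default class_weight handling, then convert to a Dataset
+            return make_pipeline(super().dataset_transformer, FunctionTransformer(to_dataset))
+
+Because every step accepts and returns the same 3 element tuple, the default behavior
+(``ClassWeightDataTransformer`` in this case) is preserved while the last step decides
+what is ultimately passed to :py:func:`tensorflow.keras.Model.fit`.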
-Multi-input and output models -+++++++++++++++++++++++++++++ +Multi-input and output models via feature_encoder and target_encoder +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Scikit-Learn natively supports multiple outputs, although it technically requires them to be arrays of equal length @@ -229,14 +289,15 @@ requires them to be arrays of equal length Scikit-Learn has no support for multiple inputs. To work around this issue, SciKeras implements a data conversion abstraction in the form of Scikit-Learn style transformers, -one for ``X`` (features) and one for ``y`` (target). -By implementing a custom transformer, you can split a single input ``X`` into multiple inputs -for :py:class:`tensorflow.keras.Model` or perform any other manipulation you need. +one for ``X`` (features) and one for ``y`` (target). These are implemented +via :py:func:`scikeras.wrappers.BaseWrappers.feature_encoder` and +:py:func:`scikeras.wrappers.BaseWrappers.feature_encoder` respectively. + To override the default transformers, simply override :py:func:`scikeras.wrappers.BaseWrappers.target_encoder` or :py:func:`scikeras.wrappers.BaseWrappers.feature_encoder` for ``y`` and ``X`` respectively. -SciKeras uses :py:func:`sklearn.utils.multiclass.type_of_target` to categorize the target +By default, SciKeras uses :py:func:`sklearn.utils.multiclass.type_of_target` to categorize the target type, and implements basic handling of the following cases out of the box: +--------------------------+--------------+----------------+----------------+---------------+ @@ -268,18 +329,60 @@ type, and implements basic handling of the following cases out of the box: | | [.2, .9]] | | | | +--------------------------+--------------+----------------+----------------+---------------+ -If you find that your target is classified as ``"multiclass-multioutput"`` or ``"unknown"``, you will have to -implement your own data processing routine. +The supported cases are handled by the default implementation of ``target_encoder``. +The default implementations are available for use as :py:class:`scikeras.utils.transformers.ClassifierLabelEncoder` +and :py:class:`scikeras.utils.transformers.RegressorTargetEncoder` for +:py:class:`scikeras.wrappers.KerasClassifier` and :py:class:`scikeras.wrappers.KerasRegressor` respectively. + +As per the table above, if you find that your target is classified as +``"multiclass-multioutput"`` or ``"unknown"``, you will have to implement your own data processing routine. + +Whole dataset manipulation via data_transformer ++++++++++++++++++++++++++++++++++++++++++++++++ + +This is the last step before passing the data to Keras, and it allows for the greatest +degree of customization because SciKeras does not make any assumptions about the output data +and passes it directly to :py:func:`tensorflow.keras.Model.fit`. +Its signature is ``dataset_transformer.fit_transform((X, y, sample_weight))``, +that is, a 3 element tuple corresponding to the ``x``, ``y`` and ``sample_weight`` +arguments in :py:func:`tensorflow.keras.Model.fit`. +The output must be a 3 element tuple as well, and it will be passed untouched +to :py:func:`tensorflow.keras.Model.fit`, so that the second and/or third +elements are allowed to be ``None``, but the first must always have a value. -In addition to converting data, `feature_encoder` and `target_encoder`, allows you to inject data -into your model construction method. 
This is useful if for example you use `target_encoder` to dynamically +get_metadata method ++++++++++++++++++++ + +In addition to converting data, ``feature_encoder`` and ``target_encoder``, allows you to inject data +into your model construction method. This is useful if for example you use ``target_encoder`` to dynamically determine how many outputs your model should have based on the data and then use this information to -assign the right number of outputs in your Model. To return data from `feature_encoder` or `target_encoder`, -you will need to provide a transformer with a `get_metadata` method, which is expected to return a dictionary -which will be injected into your model building function via the `meta` parameter. +assign the right number of outputs in your Model. To return data from ``feature_encoder`` or ``target_encoder``, +you will need to provide a transformer with a ``get_metadata`` method, which is expected to return a dictionary +which will be injected into your model building function via the ``meta`` parameter. -For a complete examples implementing custom data processing, see the examples in the -:ref:`tutorials` section. +For example, if you wanted to create a calculated parameter called ``my_param_``: + +.. code-block::python + + class MultiOutputTransformer(BaseEstimator, TransformerMixin): + def get_metadata(self): + return {"my_param_": "foobarbaz"} + + class MultiOutputClassifier(KerasClassifier): + + @property + def target_encoder(self): + return MultiOutputTransformer(...) + + def get_model(meta): + print(f"Got: {meta['my_param_']}") + + clf = MultiOutputClassifier(model=get_model) + clf.fit(X, y) # prints 'Got: foobarbaz' + print(clf.my_param_) # prints 'foobarbaz' + +Note that it is best practice to end your parameter names with a single underscore, +which allows sklearn to know which parameters are stateful and which are stateless. Routed parameters ----------------- diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index da3ab82cc..52f265536 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -25,26 +25,26 @@ Keras support many types of input and output data formats, including: * Multiple outputs * Higher-dimensional tensors -In this notebook, we explore how to reconcile this functionality with the sklearn ecosystem via SciKeras data transformer interface. +This notebook walks through an example of the different data transformations and how SciKeras bridges Keras and Scikit-learn. +It may be helpful to have a general understanding of the dataflow before tackling these examples, which is available in +the [data transformer docs][docs]. ## Table of contents * [1. Setup](#1.-Setup) -* [2. Data transformer interface](#2.-Data-transformer-interface) - * [2.1 get_metadata method](#2.1-get_metadata-method) -* [3. Multiple outputs](#3.-Multiple-outputs) +* [2. Multiple outputs](#2.-Multiple-outputs) + * [2.1 Define Keras Model](#2.1-Define-Keras-Model) + * [2.2 Define output data transformer](#2.2-Define-output-data-transformer) + * [2.3 Test classifier](#2.3-Test-classifier) +* [3. Multiple inputs](#3-multiple-inputs) * [3.1 Define Keras Model](#3.1-Define-Keras-Model) - * [3.2 Define output data transformer](#3.2-Define-output-data-transformer) - * [3.3 Test classifier](#3.3-Test-classifier) -* [4. Multiple inputs](#4-multiple-inputs) + * [3.2 Define data transformer](#3.2-Define-data-transformer) + * [3.3 Test regressor](#3.3-Test-regressor) +* [4. 
Multidimensional inputs with MNIST dataset](#4.-Multidimensional-inputs-with-MNIST-dataset) * [4.1 Define Keras Model](#4.1-Define-Keras-Model) - * [4.2 Define data transformer](#4.2-Define-data-transformer) - * [4.3 Test regressor](#4.3-Test-regressor) -* [5. Multidimensional inputs with MNIST dataset](#5.-Multidimensional-inputs-with-MNIST-dataset) - * [5.1 Define Keras Model](#5.1-Define-Keras-Model) - * [5.2 Test](#5.2-Test) -* [6. Ragged datasets with tf.data.Dataset](#6.-Ragged-datasets-with-tf.data.Dataset) -* [7. Multi-output class_weight](#7.-Multi-output-class_weight) + * [4.2 Test](#4.2-Test) +* [5. Ragged datasets with tf.data.Dataset](#5.-Ragged-datasets-with-tf.data.Dataset) +* [6. Multi-output class_weight](#6.-Multi-output-class_weight) ## 1. Setup @@ -70,106 +70,7 @@ from scikeras.wrappers import KerasClassifier, KerasRegressor from tensorflow import keras ``` -## 2. Data transformer interface - -SciKeras enables advanced Keras use cases by providing an interface to convert sklearn compliant data to whatever format your Keras model requires within SciKeras, right before passing said data to the Keras model. - -This interface is implemented in the form of two sklearn transformers, one for the features (`X`) and one for the target (`y`). SciKeras loads these transformers via the `target_encoder` and `feature_encoder` methods. - -By default, SciKeras implements `target_encoder` for both KerasClassifier and KerasRegressor to facilitate common types of tasks in sklearn. The default implementations are `scikeras.utils.transformers.ClassifierLabelEncoder` and `scikeras.utils.transformers.RegressorTargetEncoder` for KerasClassifier and KerasRegressor respectively. Information on the types of tasks that these default transformers are able to perform can be found in the [SciKeras docs](https://scikeras.readthedocs.io/en/latest/advanced.html#data-transformers). - -Below is an outline of the inner workings of the data transfomer interfaces to help understand when they are called: - -```python -if False: # avoid executing pseudocode - from scikeras.utils.transformers import ( - ClassifierLabelEncoder, - RegressorTargetEncoder, - ) - - - class BaseWrapper: - def fit(self, X, y): - self.target_encoder_ = self.target_encoder - self.feature_encoder_ = self.feature_encoder - y = self.target_encoder_.fit_transform(y) - X = self.feature_encoder_.fit_transform(X) - self.model_.fit(X, y) - return self - - def predict(self, X): - X = self.feature_encoder_.transform(X) - y_pred = self.model_.predict(X) - return self.target_encoder_.inverse_transform(y_pred) - - class KerasClassifier(BaseWrapper): - - @property - def target_encoder(self): - return ClassifierLabelEncoder(loss=self.loss) - - def predict_proba(self, X): - X = self.feature_encoder_.transform(X) - y_pred = self.model_.predict(X) - return self.target_encoder_.inverse_transform(y_pred, return_proba=True) - - - class KerasRegressor(BaseWrapper): - - @property - def target_encoder(self): - return RegressorTargetEncoder() -``` - -To substitute your own data transformation routine, you must subclass the wrappers and override one of the encoder defining functions. You will have access to all attributes of the wrappers, and you can pass these to your transformer, like we do above with `loss`. - -```python -from sklearn.base import BaseEstimator, TransformerMixin -``` - -```python -if False: # avoid executing pseudocode - - class MultiOutputTransformer(BaseEstimator, TransformerMixin): - ... 
- - - class MultiOutputClassifier(KerasClassifier): - - @property - def target_encoder(self): - return MultiOutputTransformer(...) -``` - -### 2.1 get_metadata method - -SciKeras recognized an optional `get_metadata` on the transformers. `get_metadata` is expected to return a dicionary of with key strings and arbitrary values. SciKeras will set add these items to the wrappers namespace and make them available to your model building function via the `meta` keyword argument: - -```python -if False: # avoid executing pseudocode - - class MultiOutputTransformer(BaseEstimator, TransformerMixin): - def get_metadata(self): - return {"my_param_": "foobarbaz"} - - - class MultiOutputClassifier(KerasClassifier): - - @property - def target_encoder(self): - return MultiOutputTransformer(...) - - - def get_model(meta): - print(f"Got: {meta['my_param_']}") - - - clf = MultiOutputClassifier(model=get_model) - clf.fit(X, y) # Got: foobarbaz - print(clf.my_param_) # foobarbaz -``` - -## 3. Multiple outputs +## 2. Multiple outputs Keras makes it straight forward to define models with multiple outputs, that is a Model with multiple sets of fully-connected heads at the end of the network. This functionality is only available in the Functional Model and subclassed Model definition modes, and is not available when using Sequential. @@ -177,7 +78,7 @@ In practice, the main thing about Keras models with multiple outputs that you ne Note that "multiple outputs" in Keras has a slightly different meaning than "multiple outputs" in sklearn. Many tasks that would be considered "multiple output" tasks in sklearn can be mapped to a single "output" in Keras with multiple units. This notebook specifically focuses on the cases that require multiple distinct Keras outputs. -### 3.1 Define Keras Model +### 2.1 Define Keras Model Here we define a simple perceptron that has two outputs, corresponding to one binary classification taks and one multiclass classification task. For example, one output might be "image has car" (binary) and the other might be "color of car in image" (multiclass). @@ -229,7 +130,7 @@ Our data transormer's job will be to convert from a single numpy array (which is We will structure our data on the sklearn side by column-stacking our list of arrays. This works well in this case since we have the same number of datapoints in each array. -### 3.2 Define output data transformer +### 2.2 Define output data transformer Let's go ahead and protoype this data transformer: @@ -289,7 +190,7 @@ class MultiOutputTransformer(BaseEstimator, TransformerMixin): Note that in addition to the usual `transform` and `inverse_transform` methods, we implement the `get_metadata` method to return the `n_classes_` attribute. -Lets test our transformer with the same dataset we previoulsy used to test our model: +Lets test our transformer with the same dataset we previously used to test our model: ```python tf = MultiOutputTransformer() @@ -331,7 +232,7 @@ class MultiOutputClassifier(KerasClassifier): return np.mean([accuracy_score(y_bin, y_pred_bin), accuracy_score(y_cat, y_pred_cat)]) ``` -### 3.3 Test classifier +### 2.3 Test classifier ```python from sklearn.preprocessing import StandardScaler @@ -347,27 +248,23 @@ clf = MultiOutputClassifier(model=get_clf_model, verbose=0, random_state=0) clf.fit(X, y_sklearn).score(X, y_sklearn) ``` -## 4. Multiple inputs +## 3. Multiple inputs The process for multiple inputs is similar, but instead of overriding the transformer in `target_encoder` we override `feature_encoder`. 
-```python -if False: - from sklearn.base import BaseEstimator, TransformerMixin - - - class MultiOutputTransformer(BaseEstimator, TransformerMixin): - ... +```python .noeval +class MultiOutputTransformer(BaseEstimator, TransformerMixin): + ... - class MultiOutputClassifier(KerasClassifier): +class MultiOutputClassifier(KerasClassifier): - @property - def feature_encoder(self): - return MultiInputTransformer(...) + @property + def feature_encoder(self): + return MultiInputTransformer(...) ``` -### 4.1 Define Keras Model +### 3.1 Define Keras Model Let's define a Keras **regression** Model with 2 inputs: @@ -411,7 +308,7 @@ r2_score(y, y_pred) Having verified that our model builds without errors and accepts the inputs types we expect, we move onto integrating a transformer into our SciKeras model. -### 4.2 Define data transformer +### 3.2 Define data transformer Just like for overriding `target_encoder`, we just need to define a sklearn transformer and drop it into our SciKeras wrapper. Since we hardcoded the input shapes into our model and do not rely on any transformer-generated metadata, we can simply use `sklearn.preprocessing.FunctionTransformer`: @@ -431,7 +328,7 @@ class MultiInputRegressor(KerasRegressor): Note that we did **not** implement `inverse_transform` (that is, we did not pass an `inverse_func` argument to `FunctionTransformer`) because features are never converted back to their original form. -### 4.3 Test regressor +### 3.3 Test regressor ```python reg = MultiInputRegressor(model=get_reg_model, verbose=0, random_state=0) @@ -441,7 +338,7 @@ X_sklearn = np.column_stack(X) reg.fit(X_sklearn, y).score(X_sklearn, y) ``` -## 5. Multidimensional inputs with MNIST dataset +## 4. Multidimensional inputs with MNIST dataset In this example, we look at how we can use SciKeras to process the MNIST dataset. The dataset is composed of 60,000 images of digits, each of which is a 2D 28x28 image. @@ -487,7 +384,7 @@ print(np.min(x_train), np.max(x_train)) # scaled 0-1 Of course, in this case, we could have just as easily used numpy functions to scale our data, but we use `MinMaxScaler` to demonstrate use of the sklearn ecosystem. -### 5.1 Define Keras Model +### 4.1 Define Keras Model Next we will define our Keras model (adapted from [keras.io](https://keras.io/examples/vision/mnist_convnet/)): @@ -537,7 +434,7 @@ clf = MultiDimensionalClassifier( ) ``` -### 5.2 Test +### 4.2 Test Train and score the model (this takes some time) @@ -550,26 +447,18 @@ score = clf.score(x_test, y_test) print(f"Test score (accuracy): {score:.2f}") ``` -## 6. Ragged datasets with tf.data.Dataset +## 5. Ragged datasets with tf.data.Dataset SciKeras provides a third dependency injection point that operates on the entire dataset: X, y & sample_weight. This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. One use case for this dependency injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. We can use this to create a `tf.data.Dataset` of ragged tensors. -Note that `dataset_transformer` should accept a single **3 element tuple** as its argument and return value: - -```python -help(KerasClassifier.dataset_transformer) -``` - -The use of a 3 element tuple allows you to chain transformers with this same interface using a Scikit-Learn Pipeline, as you will see below. 
- -When you return a tuple like `(tf.data.Dataset(...), None, None)`, SciKeras will pass the data untouched to `Model.fit` like `Model.fit(x=tf.data.Dataset(...), y=None, sample_weight=None)`. You can process these arguments in any way you like, as long as Keras accepts them, SciKeras will not complain. +Note that `dataset_transformer` should accept a single **3 element tuple** as its argument and return value; more details on this are in the [docs][docs]. Let's start by defining our data. We'll have an extra "feature" that marks the observation index, but we'll remove it when we deconstruct our data in the transformer. ```python feature_1 = np.random.uniform(size=(10, )) feature_2 = np.random.uniform(size=(10, )) -obs = [0] * 3 + [1] * 2 + [2] * 1 + [3] * 2 + [4] * 2 +obs = [0, 0, 0, 1, 1, 2, 3, 3, 4, 4] X = np.column_stack([feature_1, feature_2, obs]).astype("float32") @@ -583,7 +472,6 @@ you should check if `y` and `sample_weigh` are None before doing any operations ```python from typing import Tuple, Optional -from sklearn.base import BaseEstimator, TransformerMixin import tensorflow as tf @@ -712,7 +600,7 @@ y_pred = clf.predict(X) y_pred ``` -## 7. Multi-output class_weight +## 6. Multi-output class_weight In this example, we will use `dataset_transformer` to support multi-output class weights. We will re-use our `MultiOutputTransformer` from our previous example to split the output, then we will create `sample_weights` from `class_weight` @@ -743,14 +631,14 @@ class DatasetTransformer(BaseEstimator, TransformerMixin): assert sample_weights is None, "Cannot use class_weight & sample_weights together" if y is not None: # y should be a list of arrays, as split up by MultiOutputTransformer - sample_weights = dict() - for output_num, (output_name, output_data) in enumerate(zip(self.output_names, y)): - # class_weight is expected to be indexable by output_number - # see https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_sample_weight.html - # Note that it is trivial to change the expected format to match Keras' ({output_name: weights, ...}) - # see https://github.com/keras-team/keras/issues/4735#issuecomment-267473722 - cls_wt_out = class_weight[output_num] - sample_weights[output_name] = compute_sample_weight(cls_wt_out, output_data) + sample_weights = { + compute_sample_weight(class_weight[output_num], output_data) + for output_num, (output_name, output_data) in enumerate(zip(self.output_names, y)) + } + # Note: class_weight is expected to be indexable by output_number in sklearn + # see https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_sample_weight.html + # It is trivial to change the expected format to match Keras' ({output_name: weights, ...}) + # see https://github.com/keras-team/keras/issues/4735#issuecomment-267473722 return X, y, sample_weights ``` @@ -843,3 +731,5 @@ print(counts_bin) (_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True) print(counts_cat) ``` + +[docs]: https://www.adriangb.com/scikeras/refs/heads/master/advanced.html#data-transformers "SciKeras Data Transformer Docs" From 0d55306b5552e09c9b4ebb797209b1a61bf92daa Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 25 Jan 2021 16:32:02 -0600 Subject: [PATCH 17/29] fix error --- docs/source/notebooks/DataTransformers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index 
52f265536..9b20ca874 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -632,7 +632,7 @@ class DatasetTransformer(BaseEstimator, TransformerMixin): if y is not None: # y should be a list of arrays, as split up by MultiOutputTransformer sample_weights = { - compute_sample_weight(class_weight[output_num], output_data) + output_name: compute_sample_weight(class_weight[output_num], output_data) for output_num, (output_name, output_data) in enumerate(zip(self.output_names, y)) } # Note: class_weight is expected to be indexable by output_number in sklearn From 3cf1ed513718dd53b5a8a1882ca5835564f0816f Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 25 Jan 2021 17:08:48 -0600 Subject: [PATCH 18/29] use embedded links, ref links seem to be broken --- docs/source/notebooks/DataTransformers.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index 9b20ca874..cc46cda10 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -27,7 +27,7 @@ Keras support many types of input and output data formats, including: This notebook walks through an example of the different data transformations and how SciKeras bridges Keras and Scikit-learn. It may be helpful to have a general understanding of the dataflow before tackling these examples, which is available in -the [data transformer docs][docs]. +the [data transformer docs](https://www.adriangb.com/scikeras/refs/heads/master/advanced.html#data-transformers). ## Table of contents @@ -451,7 +451,7 @@ print(f"Test score (accuracy): {score:.2f}") SciKeras provides a third dependency injection point that operates on the entire dataset: X, y & sample_weight. This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. One use case for this dependency injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. We can use this to create a `tf.data.Dataset` of ragged tensors. -Note that `dataset_transformer` should accept a single **3 element tuple** as its argument and return value; more details on this are in the [docs][docs]. +Note that `dataset_transformer` should accept a single **3 element tuple** as its argument and return value; more details on this are in the [docs](https://www.adriangb.com/scikeras/refs/heads/master/advanced.html#data-transformers). Let's start by defining our data. We'll have an extra "feature" that marks the observation index, but we'll remove it when we deconstruct our data in the transformer. @@ -731,5 +731,3 @@ print(counts_bin) (_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True) print(counts_cat) ``` - -[docs]: https://www.adriangb.com/scikeras/refs/heads/master/advanced.html#data-transformers "SciKeras Data Transformer Docs" From a198eb31ec478352ab2ccbd7b8274b256579fec9 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 25 Jan 2021 17:48:05 -0600 Subject: [PATCH 19/29] spacing --- docs/source/advanced.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 994cb3d73..f1db2c2ab 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -346,6 +346,7 @@ and passes it directly to :py:func:`tensorflow.keras.Model.fit`. 
Its signature is ``dataset_transformer.fit_transform((X, y, sample_weight))``, that is, a 3 element tuple corresponding to the ``x``, ``y`` and ``sample_weight`` arguments in :py:func:`tensorflow.keras.Model.fit`. + The output must be a 3 element tuple as well, and it will be passed untouched to :py:func:`tensorflow.keras.Model.fit`, so that the second and/or third elements are allowed to be ``None``, but the first must always have a value. From 047d430096cd06d6143728a0daf7bbbd1e23eee9 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 25 Jan 2021 17:52:12 -0600 Subject: [PATCH 20/29] fix code block --- docs/source/advanced.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index f1db2c2ab..94892b591 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -199,7 +199,7 @@ sklearn-style transformers. You can think of this setup as an sklearn Pipeline: Within SciKeras, this is roughly implemented as follows: -.. code-block::python +.. code:: python from sklearn.preprocessing import FunctionTransformer From 54cfc43f98ac29087b208b15a709ac843c6485f9 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 27 Jan 2021 15:59:32 -0600 Subject: [PATCH 21/29] PR feedback --- docs/source/advanced.rst | 92 +++++++++------------------------------- 1 file changed, 19 insertions(+), 73 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 94892b591..3a412f996 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -201,36 +201,16 @@ Within SciKeras, this is roughly implemented as follows: .. code:: python - from sklearn.preprocessing import FunctionTransformer - - from scikeras.utils.transformers import ( - ClassifierLabelEncoder, - RegressorTargetEncoder, - ClassWeightDataTransformer - ) - - - class BaseWrapper: - - @property - def target_encoder(self): - return ClassifierLabelEncoder(loss=self.loss) - - @property - def feature_encoder(self): - return FunctionTransformer() - - @property - def dataset_transformer(self): - return FunctionTransformer() + class PseudoBaseWrapper: def fit(self, X, y, sample_weight): - self.target_encoder_ = self.target_encoder - self.dataset_transformer_ = self.feature_encoder - self.feature_encoder_ = self.feature_encoder - y = self.target_encoder_.fit_transform(y) - X = self.feature_encoder_.fit_transform(X) - X, y, sample_weight = self.dataset_transformer_.fit_transform((X, y, sample_weight)) + self.target_encoder_ = self.target_encoder.fit(X) + X = self.feature_encoder_.transform(X) + self.feature_encoder_ = self.feature_encoder.fit(y) + y = self.target_encoder_.transform(y) + self.model_ = self._build_keras_model() + self.dataset_transformer_ = self.dataset_transformer.fit((X, y, sample_weight)) + X, y, sample_weight = self.dataset_transformer_.transform((X, y, sample_weight)) self.model_.fit(x=X, y=y, sample_weight=sample_weight) # tf.keras.Model.fit return self @@ -240,42 +220,22 @@ Within SciKeras, this is roughly implemented as follows: y_pred = self.model_.predict(X) return self.target_encoder_.inverse_transform(y_pred) - class KerasClassifier(BaseWrapper): - - @property - def target_encoder(self): - return ClassifierLabelEncoder(loss=self.loss) - - @property - def dataset_transformer(self): - return ClassWeightDataTransformer(class_weight=self.class_weight) - - def predict_proba(self, X): - X = 
self.feature_encoder_.transform(X) - X, _, _ = self.dataset_transformer_.fit_transform((X, None, None)) - y_pred = self.model_.predict(X) - return self.target_encoder_.inverse_transform(y_pred, return_proba=True) - - - class KerasRegressor(BaseWrapper): - - @property - def target_encoder(self): - return RegressorTargetEncoder() +``dataset_transformer`` is the last step before passing the data to Keras, and it allows for the greatest +degree of customization because SciKeras does not make any assumptions about the output data +and passes it directly to :py:func:`tensorflow.keras.Model.fit`. +Its signature is ``dataset_transformer.fit_transform((X, y, sample_weight))``, +that is, a 3 element tuple corresponding to the ``x``, ``y`` and ``sample_weight`` +arguments in :py:func:`tensorflow.keras.Model.fit`. -One important thing to note is that ``feature_encoder`` and ``target_encoder`` -are run before building the Keras Model, while ``data_transformer`` is run after -the Model is built. This means that the former two will not have access to the Model -(eg. to get the number of outputs) but *will* be able to inject data into the model building -function (more on this below). On the other hand, -``data_transformer`` *will* get access to the built Model, but it cannot pass any data to model building -function. +The output must be a 3 element tuple as well, and it will be passed untouched +to :py:func:`tensorflow.keras.Model.fit`, so that the second and/or third +elements are allowed to be ``None``, but the first must always have a value. -Although you could just implement everything in ``dataset_transformer``, +Although you could implement *all* data transformations in a single ``dataset_transformer``, having several distinct dependency injections points allows for more modularity, for example to keep the default processing of string-encoded labels but convert -the data to a ``Dataset`` before passing to Keras. +the data to a :py:func:`tensorflow.data.Dataset` before passing to Keras. For a complete examples implementing custom data processing, see the examples in the :ref:`tutorials` section. @@ -337,20 +297,6 @@ and :py:class:`scikeras.utils.transformers.RegressorTargetEncoder` for As per the table above, if you find that your target is classified as ``"multiclass-multioutput"`` or ``"unknown"``, you will have to implement your own data processing routine. -Whole dataset manipulation via data_transformer -+++++++++++++++++++++++++++++++++++++++++++++++ - -This is the last step before passing the data to Keras, and it allows for the greatest -degree of customization because SciKeras does not make any assumptions about the output data -and passes it directly to :py:func:`tensorflow.keras.Model.fit`. -Its signature is ``dataset_transformer.fit_transform((X, y, sample_weight))``, -that is, a 3 element tuple corresponding to the ``x``, ``y`` and ``sample_weight`` -arguments in :py:func:`tensorflow.keras.Model.fit`. - -The output must be a 3 element tuple as well, and it will be passed untouched -to :py:func:`tensorflow.keras.Model.fit`, so that the second and/or third -elements are allowed to be ``None``, but the first must always have a value. 
- get_metadata method +++++++++++++++++++ From d03248ff0f53a73bb4085179461d19e274bb9b31 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 27 Jan 2021 16:18:19 -0600 Subject: [PATCH 22/29] use code block for signature --- docs/source/advanced.rst | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 3a412f996..7b9ec1c44 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -224,13 +224,24 @@ Within SciKeras, this is roughly implemented as follows: ``dataset_transformer`` is the last step before passing the data to Keras, and it allows for the greatest degree of customization because SciKeras does not make any assumptions about the output data and passes it directly to :py:func:`tensorflow.keras.Model.fit`. -Its signature is ``dataset_transformer.fit_transform((X, y, sample_weight))``, -that is, a 3 element tuple corresponding to the ``x``, ``y`` and ``sample_weight`` -arguments in :py:func:`tensorflow.keras.Model.fit`. +Its signature is: + +.. code:: python + + from sklearn.base import BaseEstimator, TransformerMixin + + class DatasetTransformer(BaseEstimator, TransformerMixin): + def fit(self, data, dummy=None) -> "DatasetTransformer": + X, y, sample_weight = data # sample_weight might be None + ... + return self + + def transform(self, data): # return a valid input for keras.Model.fit + X, y, sample_weight = data # y and/or sample_weight might be None + ... + return (X, y, sample_weight) # option 1 + return (tensorflow_dataset, None, None) # option 2 -The output must be a 3 element tuple as well, and it will be passed untouched -to :py:func:`tensorflow.keras.Model.fit`, so that the second and/or third -elements are allowed to be ``None``, but the first must always have a value. Although you could implement *all* data transformations in a single ``dataset_transformer``, having several distinct dependency injections points allows for more modularity, From 87452ff428cf6a8e812091c0417f4a38732653ee Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 27 Jan 2021 17:50:10 -0600 Subject: [PATCH 23/29] remove dummy parameter --- docs/source/advanced.rst | 2 +- docs/source/notebooks/DataTransformers.md | 6 ++++-- scikeras/utils/transformers.py | 16 ++++------------ 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 7b9ec1c44..68e939858 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -231,7 +231,7 @@ Its signature is: from sklearn.base import BaseEstimator, TransformerMixin class DatasetTransformer(BaseEstimator, TransformerMixin): - def fit(self, data, dummy=None) -> "DatasetTransformer": + def fit(self, data) -> "DatasetTransformer": X, y, sample_weight = data # sample_weight might be None ... 
return self diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index 0891e203e..279df737d 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -524,7 +524,8 @@ class RaggedClassifier(KerasClassifier): t1 = FunctionTransformer(ragged_transformer) t2 = super().dataset_transformer # ClassWeightDataTransformer t3 = FunctionTransformer(dataset_transformer) - return make_pipeline(t1, t2, t3) + t4 = "passthrough" # see https://scikit-learn.org/stable/modules/compose.html#pipeline-chaining-estimators + return make_pipeline(t1, t2, t3, t4) ``` Now we can define a Model. We need some way to handle/flatten our ragged arrays within our model. For this example, we use a custom mean layer, but you could use an Embedding layer, LSTM, etc. @@ -580,7 +581,8 @@ class RaggedClassifier(KerasClassifier): t1 = FunctionTransformer(ragged_transformer) t2 = super().dataset_transformer # ClassWeightDataTransformer t3 = FunctionTransformer(dataset_transformer) - return make_pipeline(t1, t2, t3) + t4 = "passthrough" # see https://scikit-learn.org/stable/modules/compose.html#pipeline-chaining-estimators + return make_pipeline(t1, t2, t3, t4) def _keras_build_fn(self): inp_shape = self.X_shape_[1] - 1 diff --git a/scikeras/utils/transformers.py b/scikeras/utils/transformers.py index 7caf0fb86..521d0934f 100644 --- a/scikeras/utils/transformers.py +++ b/scikeras/utils/transformers.py @@ -25,15 +25,13 @@ class TargetReshaper(BaseEstimator, TransformerMixin): Dimensions of y that the transformer was trained on. """ - def fit(self, y: np.ndarray, dummy: None = None) -> "TargetReshaper": + def fit(self, y: np.ndarray) -> "TargetReshaper": """Fit the transformer to a target y. Parameters ---------- y : np.ndarray The target data to be transformed. - dummy: None - Unused argument, kept for compatibility with sklearn Pipelines. Returns ------- @@ -131,7 +129,7 @@ def _type_of_target(self, y: np.ndarray) -> str: target_type = type_of_target(self.categories[0]) return target_type - def fit(self, y: np.ndarray, dummy: None = None) -> "ClassifierLabelEncoder": + def fit(self, y: np.ndarray) -> "ClassifierLabelEncoder": """Fit the estimator to the target y. For all targets, this transforms classes into ordinal numbers. @@ -142,8 +140,6 @@ def fit(self, y: np.ndarray, dummy: None = None) -> "ClassifierLabelEncoder": ---------- y : np.ndarray The target data to be transformed. - dummy: None - Unused argument, kept for compatibility with sklearn Pipelines. Returns ------- @@ -325,7 +321,7 @@ class RegressorTargetEncoder(BaseEstimator, TransformerMixin): Number of outputs the Keras Model is expected to have. """ - def fit(self, y: np.ndarray, dummy: None = None) -> "RegressorTargetEncoder": + def fit(self, y: np.ndarray) -> "RegressorTargetEncoder": """Fit the transformer to the target y. For RegressorTargetEncoder, this just records the dimensions @@ -335,8 +331,6 @@ def fit(self, y: np.ndarray, dummy: None = None) -> "RegressorTargetEncoder": ---------- y : np.ndarray The target data to be transformed. - dummy: None - Unused argument, kept for compatibility with sklearn Pipelines. 
Returns ------- @@ -418,9 +412,7 @@ def __init__(self, class_weight: Optional[Union[str, Dict[int, float]]] = None): self.class_weight = class_weight def fit( - self, - data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], - dummy: None = None, + self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], ) -> "ClassWeightDataTransformer": return self From 3f8f9b426d45deb2312ce50c1f336beb12a4a5bd Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 31 Jan 2021 00:05:06 -0800 Subject: [PATCH 24/29] re-add dummy --- scikeras/utils/transformers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scikeras/utils/transformers.py b/scikeras/utils/transformers.py index 521d0934f..0a5d213a2 100644 --- a/scikeras/utils/transformers.py +++ b/scikeras/utils/transformers.py @@ -412,7 +412,9 @@ def __init__(self, class_weight: Optional[Union[str, Dict[int, float]]] = None): self.class_weight = class_weight def fit( - self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + self, + data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + dummy: None = None, ) -> "ClassWeightDataTransformer": return self From fd62b825d2ccf55fcadade08c0270813d1f3c422 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:59:48 -0600 Subject: [PATCH 25/29] Use dicts, add more examples --- docs/source/notebooks/DataTransformers.md | 266 ++++++++++++++++++---- scikeras/utils/transformers.py | 15 +- scikeras/wrappers.py | 24 +- tests/test_api.py | 31 ++- 4 files changed, 274 insertions(+), 62 deletions(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index fa7459e9f..5d05f20e9 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -5,8 +5,8 @@ jupyter: text_representation: extension: .md format_name: markdown - format_version: '1.2' - jupytext_version: 1.9.1 + format_version: '1.3' + jupytext_version: 1.10.1 kernelspec: display_name: Python 3 language: python @@ -438,7 +438,7 @@ clf = MultiDimensionalClassifier( Train and score the model (this takes some time) ```python -clf.fit(x_train, y_train) +_ = clf.fit(x_train, y_train) ``` ```python @@ -448,9 +448,13 @@ print(f"Test score (accuracy): {score:.2f}") ## 5. Ragged datasets with tf.data.Dataset -SciKeras provides a third dependency injection point that operates on the entire dataset: X, y & sample_weight. This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. One use case for this dependency injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. We can use this to create a `tf.data.Dataset` of ragged tensors. +SciKeras provides a third dependency injection point that operates on the entire dataset: X, y & sample_weight. +This `dataset_transformer` is applied after `target_transformer` and `feature_transformer`. +One use case for this dependency injection point is to transform data from tabular/array-like to the `tf.data.Dataset` format, which only requires iteration. +We can use this to create a `tf.data.Dataset` of ragged tensors. 
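As a minimal sketch of this pattern (the names `pack_into_dataset` and `DatasetClassifier`, and the `.batch(32)` call, are illustrative assumptions rather than SciKeras API), you can keep the default `dataset_transformer` and chain a `FunctionTransformer` after it that packs the already-encoded data into a `tf.data.Dataset`, mirroring the helper used in `tests/test_api.py` in this patch series:

```python
from typing import Any, Dict

import tensorflow as tf
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

from scikeras.wrappers import KerasClassifier


def pack_into_dataset(data: Dict[str, Any]) -> Dict[str, Any]:
    # x/y/sample_weight have already been processed by feature_encoder/target_encoder;
    # remove them from the kwargs and replace x with a single Dataset
    x = data.pop("x")
    y = data.pop("y", None)
    sample_weight = data.pop("sample_weight", None)
    pieces = tuple(p for p in (x, y, sample_weight) if p is not None)
    dataset = tf.data.Dataset.from_tensor_slices(pieces if len(pieces) > 1 else pieces[0])
    # Model.fit/predict will not batch a tf.data.Dataset for you, so batch it here
    data["x"] = dataset.batch(32)
    return data


class DatasetClassifier(KerasClassifier):  # illustrative subclass name

    @property
    def dataset_transformer(self):
        # keep ClassWeightDataTransformer (the default) and then convert to a Dataset
        return make_pipeline(super().dataset_transformer, FunctionTransformer(pack_into_dataset))
```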
-Note that `dataset_transformer` should accept a single **3 element tuple** as its argument and return value; more details on this are in the [docs](https://www.adriangb.com/scikeras/refs/heads/master/advanced.html#data-transformers). +Note that `dataset_transformer` should accept a single single dictionary as its argument to `transform` and `fit`, and return a single dictionary as well. +More details on this are in the [docs](https://www.adriangb.com/scikeras/refs/heads/master/advanced.html#data-transformers). Let's start by defining our data. We'll have an extra "feature" that marks the observation index, but we'll remove it when we deconstruct our data in the transformer. @@ -469,47 +473,64 @@ Also note that `dataset_transformer` will _always_ be called with `X` (i.e. the you should check if `y` and `sample_weigh` are None before doing any operations on them. ```python -from typing import Tuple, Optional +from typing import Dict, Any import tensorflow as tf -def ragged_transformer(data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[tf.RaggedTensor, None, None]: - X, y, sample_weights = data +def ragged_transformer(data: Dict[str, Any]) -> Dict[str, Any]: + x, y, sample_weight = data["x"], data.get("y", None), data.get("sample_weight", None) if y is not None: y = y.reshape(-1, 1 if len(y.shape) == 1 else y.shape[1]) - y = y[tf.RaggedTensor.from_value_rowids(y, X[:, -1]).row_starts().numpy()] - if sample_weights is not None: - sample_weights = sample_weights.reshape(-1, 1 if len(sample_weights.shape) == 1 else sample_weights.shape[1]) - sample_weights = sample_weights[tf.RaggedTensor.from_value_rowids(sample_weights, X[:, -1]).row_starts().numpy()] - X = tf.RaggedTensor.from_value_rowids(X[:, :-1], X[:, -1]) - return (X, y, sample_weights) -``` - -In this case, we chose to keep `y` and `sample_weights` as numpy arrays, which will allow us to re-use ClassWeightDataTransformer, + y = y[tf.RaggedTensor.from_value_rowids(y, x[:, -1]).row_starts().numpy()] + if sample_weight is not None: + sample_weight = sample_weight.reshape(-1, 1 if len(sample_weight.shape) == 1 else sample_weight.shape[1]) + sample_weight = sample_weight[tf.RaggedTensor.from_value_rowids(sample_weight, x[:, -1]).row_starts().numpy()] + x = tf.RaggedTensor.from_value_rowids(x[:, :-1], x[:, -1]) + data["x"] = x + if "y" in data: + data["y"] = y + if "sample_weight" in data: + data["sample_weight"] = sample_weight + return data +``` + +In this case, we chose to keep `y` and `sample_weight` as numpy arrays, which will allow us to re-use ClassWeightDataTransformer, the default `dataset_transformer` for `KerasClassifier`. Lets quickly test our transformer: ```python -data = ragged_transformer((X, y, None)) -data +data = ragged_transformer(dict(x=X, y=y, sample_weight=None)) +print(type(data["x"])) +print(data["x"].shape) ``` +And the `y=None` case: + ```python -data = ragged_transformer((X, None, None)) -data +data = ragged_transformer(dict(x=X, y=None, sample_weight=None)) +print(type(data["x"])) +print(data["x"].shape) ``` -Our shapes look good, and we can handle the `y=None` case. +Everything looks good! Because Keras will not accept a RaggedTensor directly, we will need to wrap our entire dataset into a tensorflow `Dataset`. We can do this by adding one more transformation step: Next, we can add our transormers to our model. We use an sklearn `Pipeline` (generated via `make_pipeline`) to keep ClassWeightDataTransformer operational while implementing our custom transformation. 
```python -def dataset_transformer(data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[tf.data.Dataset, None, None]: - return (tf.data.Dataset.from_tensor_slices(data), None, None) +def dataset_transformer(data: Dict[str, Any]) -> Dict[str, Any]: + x_y_s = data["x"], data.get("y", None), data.get("sample_weight", None) + data["x"] = tf.data.Dataset.from_tensor_slices(x_y_s) + # don't blindly assign y & sw; if being called from + # predict they should not just be None, they should not be present at all! + if "y" in data: + data["y"] = None + if "sample_weight" in data: + data["sample_weight"] = None + return data ``` ```python @@ -603,7 +624,8 @@ y_pred ## 6. Multi-output class_weight -In this example, we will use `dataset_transformer` to support multi-output class weights. We will re-use our `MultiOutputTransformer` from our previous example to split the output, then we will create `sample_weights` from `class_weight` +In this example, we will use `dataset_transformer` to support multi-output class weights. +We will re-use our `MultiOutputTransformer` from our previous example to split the output, then we will create `sample_weight` from `class_weight`. ```python from collections import defaultdict @@ -614,25 +636,24 @@ from sklearn.utils.class_weight import compute_sample_weight class DatasetTransformer(BaseEstimator, TransformerMixin): - def __init__(self, output_names, class_weight=None): - self.class_weight = class_weight + def __init__(self, output_names): self.output_names = output_names - def fit(self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> "DatasetTransformer": + def fit(self, data: Dict[str, Any]) -> "DatasetTransformer": return self - def transform(self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]) -> Tuple[np.ndarray, Union[np.ndarray, None], Union[np.ndarray, None]]: - if self.class_weight is None: + def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: + class_weight = data.get("class_weight", None) + if class_weight is None: return data - class_weight = self.class_weight if isinstance(class_weight, str): # handle "balanced" class_weight_ = class_weight class_weight = defaultdict(lambda: class_weight_) - X, y, sample_weights = data - assert sample_weights is None, "Cannot use class_weight & sample_weights together" + y, sample_weight = data.get("y", None), data.get("sample_weight", None) + assert sample_weight is None, "Cannot use class_weight & sample_weight together" if y is not None: # y should be a list of arrays, as split up by MultiOutputTransformer - sample_weights = { + sample_weight = { output_name: compute_sample_weight(class_weight[output_num], output_data) for output_num, (output_name, output_data) in enumerate(zip(self.output_names, y)) } @@ -640,10 +661,11 @@ class DatasetTransformer(BaseEstimator, TransformerMixin): # see https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_sample_weight.html # It is trivial to change the expected format to match Keras' ({output_name: weights, ...}) # see https://github.com/keras-team/keras/issues/4735#issuecomment-267473722 - return X, y, sample_weights -``` + data["sample_weight"] = sample_weight + data["class_weight"] = None + return data + -```python def get_model(meta, compile_kwargs): inp = keras.layers.Input(shape=(meta["n_features_in_"])) x1 = keras.layers.Dense(100, activation="relu")(inp) @@ -667,7 +689,6 @@ class CustomClassifier(KerasClassifier): def dataset_transformer(self): return 
DatasetTransformer( output_names=self.model_.output_names, - class_weight=self.class_weight ) ``` @@ -731,3 +752,172 @@ print(counts_bin) (_, counts_cat) = np.unique(y_pred[:, 1], return_counts=True) print(counts_cat) ``` + +## 6. Custom validation dataset + +Although `dataset_transformer` is primarily designed for data transformations, because it returns valid `**kwargs` to fit it can be used for other advanced use cases. +In this example, we use `dataset_transformer` to implement a custom test/train split for Keras' internal validation. We'll use sklearn's +`train_test_split`, but this could be implemented via an arbitrary user function, eg. to ensure balanced class distribution. + +```python +from sklearn.model_selection import train_test_split + + +def get_clf(meta: Dict[str, Any]): + inp = keras.layers.Input(shape=(meta["n_features_in_"],)) + x1 = keras.layers.Dense(100, activation="relu")(inp) + out = keras.layers.Dense(1, activation="sigmoid")(x1) + return keras.Model(inputs=inp, outputs=out) + + +class CustomSplit(BaseEstimator, TransformerMixin): + + def __init__(self, test_size: float): + self.test_size = test_size + + def fit(self, data: Dict[str, Any]) -> "CustomSplit": + return self + + def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.test_size == 0: + return data + x, y, sw = data["x"], data.get("y", None), data.get("sample_weight", None) + if y is None: + return data + if sw is None: + x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=self.test_size, stratify=y) + validation_data = (x_val, y_val) + sw_train = None + else: + x_train, x_val, y_train, y_val, sw_train, sw_val = train_test_split(x, y, sw, test_size=self.test_size, stratify=y) + validation_data = (x_val, y_val, sw_val) + data["validation_data"] = validation_data + data["x"], data["y"], data["sample_weight"] = x_train, y_train, sw_train + return data + + +class CustomClassifier(KerasClassifier): + + @property + def dataset_transformer(self): + return CustomSplit(test_size=self.validation_split) +``` + +And now lets test with a toy dataset. We specifically choose to make the target strings to show +that with this approach, we can preserve all of the nice data pre-processing that SciKeras does +for us, while still being able to split the final data before passing it to Keras. + +```python +y = np.array(["a"] * 900 + ["b"] * 100) +X = np.array([0] * 900 + [1] * 100).reshape(-1, 1) +``` + +To get a base measurment to compare against, we'll run first with KerasClassifier as a benchmark. + +```python +clf = KerasClassifier( + get_clf, + loss="bce", + metrics=["binary_accuracy"], + verbose=False, + validation_split=0.1, + shuffle=False, + random_state=0, + epochs=10 +) + +clf.fit(X, y) +print(f"binary_accuracy = {clf.history_['binary_accuracy'][-1]}") +print(f"val_binary_accuracy = {clf.history_['val_binary_accuracy'][-1]}") +``` + +We see that we get near zero validation accuracy. Because one of our classes was only found in the tail end of our dataset and we specified `validation_split=0.1`, we validated with a class we had never seen before. + +We could specify `shuffle=True` (this is actually the default), but for highly imbalanced classes, this may not be as good as stratified splitting. + +So lets test our new `CustomClassifier`. 
+ +```python +clf = CustomClassifier( + get_clf, + loss="bce", + metrics=["binary_accuracy"], + verbose=False, + validation_split=0.1, + shuffle=False, + random_state=0, + epochs=10 +) + +clf.fit(X, y) +print(f"binary_accuracy = {clf.history_['binary_accuracy'][-1]}") +print(f"val_binary_accuracy = {clf.history_['val_binary_accuracy'][-1]}") +``` + +Much better! + + +## 7. Dynamically setting batch_size + + +In this tutorial, we use the `data_transformer` interface to implement a dynamic batch_size, similar to sklearn's [MLPClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html). We will implement `batch_size` as `batch_size=min(200, n_samples)`. + +```python +from sklearn.model_selection import train_test_split + + +def check_batch_size(x): + """Check the batch_size used in training. + """ + bs = x.shape[0] + if bs is not None: + print(f"batch_size={bs}") + return x + + +def get_clf(meta: Dict[str, Any]): + inp = keras.layers.Input(shape=(meta["n_features_in_"],)) + x1 = keras.layers.Dense(100, activation="relu")(inp) + x2 = keras.layers.Lambda(check_batch_size)(x1) + out = keras.layers.Dense(1, activation="sigmoid")(x2) + return keras.Model(inputs=inp, outputs=out) + + +class DynamicBatch(BaseEstimator, TransformerMixin): + + def fit(self, data: Dict[str, Any]) -> "DynamicBatch": + return self + + def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: + n_samples = data["x"].shape[0] + data["batch_size"] = min(200, n_samples) + return data + + +class DynamicBatchClassifier(KerasClassifier): + + @property + def dataset_transformer(self): + return DynamicBatch() +``` + +Since this is happening inside SciKeras, this will work even if we are doing cross validation (which adjusts the split according to `cv`). 
+ +```python +from sklearn.model_selection import cross_val_score + +clf = DynamicBatchClassifier( + get_clf, + loss="bce", + verbose=False, + random_state=0 +) + +_ = cross_val_score(clf, X, y, cv=6) # note: 1000 / 6 = 167 +``` + +But if we train with larger inputs, we can hit the cap of 200 we set: + +```python +_ = cross_val_score(clf, X, y, cv=5) +``` diff --git a/scikeras/utils/transformers.py b/scikeras/utils/transformers.py index c3773390d..826c56a76 100644 --- a/scikeras/utils/transformers.py +++ b/scikeras/utils/transformers.py @@ -408,18 +408,15 @@ def __init__(self, class_weight: Optional[Union[str, Dict[int, float]]] = None): self.class_weight = class_weight def fit( - self, - data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], - dummy: None = None, + self, data: Dict[str, Any], dummy: None = None ) -> "ClassWeightDataTransformer": return self - def transform( - self, data: Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]] - ) -> Tuple[np.ndarray, Union[np.ndarray, None], Union[np.ndarray, None]]: - X, y, sample_weight = data + def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: + y, sample_weight = data.get("y", None), data.get("sample_weight", None) if self.class_weight is None or y is None: - return (X, y, sample_weight) + return data sample_weight = 1 if sample_weight is None else sample_weight sample_weight *= compute_sample_weight(class_weight=self.class_weight, y=y) - return (X, y, sample_weight) + data["sample_weight"] = sample_weight + return data diff --git a/scikeras/wrappers.py b/scikeras/wrappers.py index 5ade5ff59..455e4e46a 100644 --- a/scikeras/wrappers.py +++ b/scikeras/wrappers.py @@ -139,6 +139,7 @@ class BaseWrapper(BaseEstimator): "validation_split", "shuffle", "sample_weight", + "class_weight", "initial_epoch", "validation_steps", "validation_batch_size", @@ -478,16 +479,21 @@ def _fit_keras_model( # collect parameters params = self.get_params() fit_args = route_params(params, destination="fit", pass_filter=self._fit_kwargs) - fit_args["sample_weight"] = sample_weight fit_args["epochs"] = initial_epoch + epochs fit_args["initial_epoch"] = initial_epoch fit_args.update(kwargs) + fit_args["x"] = X + fit_args["y"] = y + fit_args["sample_weight"] = sample_weight + + fit_args = self.dataset_transformer_.transform(fit_args) + if self._random_state is not None: with TFRandomState(self._random_state): - hist = self.model_.fit(x=X, y=y, **fit_args) + hist = self.model_.fit(**fit_args) else: - hist = self.model_.fit(x=X, y=y, **fit_args) + hist = self.model_.fit(**fit_args) if not warm_start or not hasattr(self, "history_") or initial_epoch == 0: self.history_ = defaultdict(list) @@ -808,7 +814,9 @@ def _initialize( self.model_ = self._build_keras_model() - self.dataset_transformer_ = self.dataset_transformer.fit((X, y, sample_weight)) + self.dataset_transformer_ = self.dataset_transformer.fit( + dict(x=X, y=y, sample_weight=sample_weight) + ) dataset_meta = getattr(self.dataset_transformer_, "get_metadata", dict)() vars(self).update(**dataset_meta) @@ -882,8 +890,6 @@ def _fit( y = self.target_encoder_.transform(y) self._check_model_compatibility(y) - X, y, sample_weight = self.dataset_transformer_.transform((X, y, sample_weight)) - self._fit_keras_model( X, y, @@ -948,7 +954,6 @@ def _predict_raw(self, X, **kwargs): # pre process X X = self.feature_encoder_.transform(X) - X, _, _ = self.dataset_transformer_.transform((X, None, None)) # filter kwargs and get attributes for predict params = self.get_params() @@ -956,9 +961,12 @@ 
def _predict_raw(self, X, **kwargs): params, destination="predict", pass_filter=self._predict_kwargs ) pred_args.update(kwargs) + pred_args["x"] = X + + pred_args = self.dataset_transformer_.transform(pred_args) # predict with Keras model - y_pred = self.model_.predict(X, **pred_args) + y_pred = self.model_.predict(**pred_args) return y_pred diff --git a/tests/test_api.py b/tests/test_api.py index 9396ecb57..9b9b0006c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -815,13 +815,21 @@ def test_conversion_to_dataset(self): m = Model(inp, out) m.compile(loss="bce") - def dtf(X_y_s: Tuple[np.ndarray, np.ndarray, np.ndarray]): - return (tf.data.Dataset.from_tensor_slices(X_y_s), None, None) + def transform(fit_kwargs: Dict[str, Any]): + x = fit_kwargs.pop("x") + y = fit_kwargs.pop("y") if "y" in fit_kwargs else None + sample_weight = ( + fit_kwargs.pop("sample_weight") + if "sample_weight" in fit_kwargs + else None + ) + fit_kwargs["x"] = tf.data.Dataset.from_tensor_slices((x, y, sample_weight)) + return fit_kwargs class MyWrapper(KerasClassifier): @property def dataset_transformer(self): - return FunctionTransformer(dtf) + return FunctionTransformer(transform) est = MyWrapper(m) X = np.random.random((100, 1)) @@ -830,7 +838,8 @@ def dataset_transformer(self): def check_fit(**kwargs): assert isinstance(kwargs["x"], tf.data.Dataset) - assert kwargs["y"] is None + assert "y" not in kwargs + assert "sample_weight" not in kwargs return fit_orig(**kwargs) with patch.object(m, "fit", new=check_fit): @@ -849,14 +858,22 @@ def test_pipeline(self): m = Model(inp, out) m.compile(loss="bce") - def dtf(X_y_s: Tuple[np.ndarray, np.ndarray, np.ndarray]): - return (tf.data.Dataset.from_tensor_slices(X_y_s), None, None) + def transform(fit_kwargs: Dict[str, Any]): + x = fit_kwargs.pop("x") + y = fit_kwargs.pop("y") if "y" in fit_kwargs else None + sample_weight = ( + fit_kwargs.pop("sample_weight") + if "sample_weight" in fit_kwargs + else None + ) + fit_kwargs["x"] = tf.data.Dataset.from_tensor_slices((x, y, sample_weight)) + return fit_kwargs class MyWrapper(KerasClassifier): @property def dataset_transformer(self): t1 = super().dataset_transformer - t2 = FunctionTransformer(dtf) + t2 = FunctionTransformer(transform) return make_pipeline(t1, t2) est = MyWrapper(m, class_weight="balanced") From 6eee3c4a19b28694758e02aab27af9222ff82db8 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 16 Feb 2021 00:07:39 -0600 Subject: [PATCH 26/29] fix broken test --- scikeras/utils/transformers.py | 1 + scikeras/wrappers.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scikeras/utils/transformers.py b/scikeras/utils/transformers.py index 826c56a76..f1b7f8542 100644 --- a/scikeras/utils/transformers.py +++ b/scikeras/utils/transformers.py @@ -419,4 +419,5 @@ def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: sample_weight = 1 if sample_weight is None else sample_weight sample_weight *= compute_sample_weight(class_weight=self.class_weight, y=y) data["sample_weight"] = sample_weight + data["class_weight"] = None return data diff --git a/scikeras/wrappers.py b/scikeras/wrappers.py index 455e4e46a..e535e4612 100644 --- a/scikeras/wrappers.py +++ b/scikeras/wrappers.py @@ -817,8 +817,6 @@ def _initialize( self.dataset_transformer_ = self.dataset_transformer.fit( dict(x=X, y=y, sample_weight=sample_weight) ) - dataset_meta = getattr(self.dataset_transformer_, "get_metadata", dict)() - vars(self).update(**dataset_meta) return X, y, 
sample_weight From 5ca7da811e52fdd878fca4db3f8e485525a7498f Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 16 Feb 2021 00:33:36 -0600 Subject: [PATCH 27/29] update docs --- docs/source/advanced.rst | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 68e939858..f99b37965 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -209,21 +209,24 @@ Within SciKeras, this is roughly implemented as follows: self.feature_encoder_ = self.feature_encoder.fit(y) y = self.target_encoder_.transform(y) self.model_ = self._build_keras_model() - self.dataset_transformer_ = self.dataset_transformer.fit((X, y, sample_weight)) - X, y, sample_weight = self.dataset_transformer_.transform((X, y, sample_weight)) + fit_kwargs = dict(x=X, y=y, sample_weight=sample_weight) + self.dataset_transformer_ = self.dataset_transformer.fit(fit_kwargs) + fit_kwargs = self.dataset_transformer_.transform(fit_kwargs) self.model_.fit(x=X, y=y, sample_weight=sample_weight) # tf.keras.Model.fit return self def predict(self, X): X = self.feature_encoder_.transform(X) - X, _, _ = self.dataset_transformer_.fit_transform((X, None, None)) - y_pred = self.model_.predict(X) + predict_kwargs = dict(x=X) + predict_kwargs = self.dataset_transformer_.fit_transform(predict_kwargs) + y_pred = self.model_.predict(**predict_kwargs) return self.target_encoder_.inverse_transform(y_pred) ``dataset_transformer`` is the last step before passing the data to Keras, and it allows for the greatest degree of customization because SciKeras does not make any assumptions about the output data and passes it directly to :py:func:`tensorflow.keras.Model.fit`. + Its signature is: .. code:: python @@ -231,17 +234,26 @@ Its signature is: from sklearn.base import BaseEstimator, TransformerMixin class DatasetTransformer(BaseEstimator, TransformerMixin): - def fit(self, data) -> "DatasetTransformer": - X, y, sample_weight = data # sample_weight might be None + def fit(self, data: Dict[str, Any]) -> "DatasetTransformer": + assert data.keys() == {"x", "y", "sample_weight"} # fixed keys ... return self def transform(self, data): # return a valid input for keras.Model.fit - X, y, sample_weight = data # y and/or sample_weight might be None + # data includes x, y, sample_weight + assert "x" in data # "x" is always a keys + if "y" in data: + # called from fit + else: + # called from predict + # as well as other Model.fit or Model.predict arguments + assert "batch_size" in data ... - return (X, y, sample_weight) # option 1 - return (tensorflow_dataset, None, None) # option 2 + return data + +You can modify ``data`` in-place within ``transoform`` but **must** still return +it. 
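As a minimal, runnable sketch of this contract (the class name is illustrative and not part of SciKeras; only the ``dataset_transformer`` hook itself is), a transformer can tell the two call paths apart by the presence of the ``y`` key and pass everything else through unchanged:

.. code:: python

    from typing import Any, Dict

    from sklearn.base import BaseEstimator, TransformerMixin


    class PassthroughDatasetTransformer(BaseEstimator, TransformerMixin):
        """Minimal dataset_transformer obeying the dict-in/dict-out contract."""

        def fit(self, data: Dict[str, Any], dummy=None) -> "PassthroughDatasetTransformer":
            # fit receives a dict built from x, y and sample_weight (values may be None);
            # the extra ``dummy`` argument keeps it usable inside an sklearn Pipeline
            return self

        def transform(self, data: Dict[str, Any]) -> Dict[str, Any]:
            called_from_fit = "y" in data  # predict/score never passes a "y" key
            if not called_from_fit:
                # keys here are valid tf.keras.Model.predict kwargs (e.g. "x", "verbose")
                return data
            # keys here are valid tf.keras.Model.fit kwargs; mutate or add entries
            # as needed (e.g. data["batch_size"] = 32) and return the dict
            return data

Hooking it in is the same property override used for the other encoders: return this transformer, or a pipeline ending with it, from ``dataset_transformer``.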
Although you could implement *all* data transformations in a single ``dataset_transformer``, having several distinct dependency injections points allows for more modularity, From 5bd222ed7cbf33cbd318c2509620a9f233b6a7ae Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 16 Feb 2021 09:07:54 -0600 Subject: [PATCH 28/29] add clarifying comment in docs --- docs/source/advanced.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index f99b37965..c045599ab 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -227,7 +227,8 @@ Within SciKeras, this is roughly implemented as follows: degree of customization because SciKeras does not make any assumptions about the output data and passes it directly to :py:func:`tensorflow.keras.Model.fit`. -Its signature is: +It accepts a dict of valid Keras ``**kwargs`` and is expected to return a dict +of valid Keras ``**kwargs``: .. code:: python @@ -255,6 +256,10 @@ Its signature is: You can modify ``data`` in-place within ``transoform`` but **must** still return it. +When called from ``fit`` or ``initialize``, you will get and return keys that are valid +``**kwargs`` to ``tf.keras.Model.fit``. When being called from ``predict`` or ``score`` +you will get and return keys that are valid ``**kwargs`` to ``tf.keras.Model.predict``. + Although you could implement *all* data transformations in a single ``dataset_transformer``, having several distinct dependency injections points allows for more modularity, for example to keep the default processing of string-encoded labels but convert From f56068758aa99f793c46e9a85ddd496f253545f6 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 16 Feb 2021 09:18:02 -0600 Subject: [PATCH 29/29] update TOC --- docs/source/notebooks/DataTransformers.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/notebooks/DataTransformers.md b/docs/source/notebooks/DataTransformers.md index 5d05f20e9..6cb32148d 100644 --- a/docs/source/notebooks/DataTransformers.md +++ b/docs/source/notebooks/DataTransformers.md @@ -45,6 +45,9 @@ the [data transformer docs](https://www.adriangb.com/scikeras/refs/heads/master/ * [4.2 Test](#4.2-Test) * [5. Ragged datasets with tf.data.Dataset](#5.-Ragged-datasets-with-tf.data.Dataset) * [6. Multi-output class_weight](#6.-Multi-output-class_weight) +* [7. Custom validation dataset](#6.-Custom-validation-dataset) +* [8. Dynamically setting batch_size](#6.-Dynamically-setting-batch_size) + ## 1. Setup @@ -753,7 +756,7 @@ print(counts_bin) print(counts_cat) ``` -## 6. Custom validation dataset +## 7. Custom validation dataset Although `dataset_transformer` is primarily designed for data transformations, because it returns valid `**kwargs` to fit it can be used for other advanced use cases. In this example, we use `dataset_transformer` to implement a custom test/train split for Keras' internal validation. We'll use sklearn's @@ -857,7 +860,7 @@ print(f"val_binary_accuracy = {clf.history_['val_binary_accuracy'][-1]}") Much better! -## 7. Dynamically setting batch_size +## 8. Dynamically setting batch_size In this tutorial, we use the `data_transformer` interface to implement a dynamic batch_size, similar to sklearn's [MLPClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html). 
We will implement `batch_size` as `batch_size=min(200, n_samples)`.
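Reduced to its core, and assuming the dict-based `dataset_transformer` interface from the earlier patches in this series (this condenses the `DynamicBatch` transformer defined above; the class name here is illustrative), the rule is a single assignment in `transform`, and it applies to both `fit` and `predict` since both accept a `batch_size` argument:

```python
from typing import Any, Dict

from sklearn.base import BaseEstimator, TransformerMixin


class MinBatchSize(BaseEstimator, TransformerMixin):
    """Set batch_size to min(200, n_samples), like sklearn's MLPClassifier."""

    def fit(self, data: Dict[str, Any], dummy=None) -> "MinBatchSize":
        return self

    def transform(self, data: Dict[str, Any]) -> Dict[str, Any]:
        n_samples = data["x"].shape[0]
        data["batch_size"] = min(200, n_samples)  # never request a batch larger than the data
        return data
```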