diff --git a/examples/model_examples/scikit-learn/Hamilton for ML dataflows.ipynb b/examples/model_examples/scikit-learn/Hamilton for ML dataflows.ipynb new file mode 100644 index 000000000..1326ea202 --- /dev/null +++ b/examples/model_examples/scikit-learn/Hamilton for ML dataflows.ipynb @@ -0,0 +1,547 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "47ed8323-e689-464c-83ec-1ee98d2c2585", + "metadata": {}, + "source": [ + "# Hamilton for ML dataflows\n", + "\n", + "#### Requirements:\n", + "\n", + "- Install dependencies (listed in `requirements.txt`)\n", + "\n", + "More details [here](https://github.com/DAGWorks-Inc/hamilton/blob/main/examples/model_examples/scikit-learn/README.md#using-hamilton-for-ml-dataflows).\n", + "\n", + "***\n", + "\n", + "Uncomment and run the cell below if you are in a Google Colab environment. It will:\n", + "1. Mount google drive. You will be asked to authenticate and give permissions.\n", + "2. Change directory to google drive.\n", + "3. Make a directory \"hamilton-tutorials\"\n", + "4. Change directory to it.\n", + "5. Clone this repository to your google drive\n", + "6. Move your current directory to the scikit-learn model example\n", + "7. Install requirements.\n", + "\n", + "This means that any modifications will be saved, and you won't lose them if you close your browser." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5e12e1c-a8b2-477a-a9ff-6257ab587734", + "metadata": {}, + "outputs": [], + "source": [ + "## 1. Mount google drive\n", + "# from google.colab import drive\n", + "# drive.mount('/content/drive')\n", + "## 2. Change directory to google drive.\n", + "# %cd /content/drive/MyDrive\n", + "## 3. Make a directory \"hamilton-tutorials\"\n", + "# !mkdir hamilton-tutorials\n", + "## 4. Change directory to it.\n", + "# %cd hamilton-tutorials\n", + "## 5. Clone this repository to your google drive\n", + "# !git clone https://github.com/DAGWorks-Inc/hamilton/\n", + "## 6. 
Move your current directory to the scikit-learn model example\n", + "# %cd hamilton/examples/model_examples/scikit-learn\n", + "## 7. Install requirements.\n", + "# %pip install -r requirements.txt\n", + "# clear_output() # optionally clear outputs\n", + "# To check your current working directory you can type `!pwd` in a cell and run it." + ] + }, + { + "cell_type": "markdown", + "id": "9115ca99-cb3b-4dc3-8218-fa26b00d2199", + "metadata": {}, + "source": [ + "***\n", + "Here we have a simple example showing how you can write an ML training and evaluation workflow with Hamilton. \n", + "***" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "04fa1ff7-74f7-4193-9e1f-c17d9e68efc5", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Example script showing how one might set up a generic model training pipeline that is quickly configurable.\n", + "\"\"\"\n", + "\n", + "import digit_loader\n", + "import iris_loader\n", + "import my_train_evaluate_logic\n", + "\n", + "from hamilton import base, driver\n", + "\n", + "\n", + "def get_data_loader(data_set: str):\n", + " \"\"\"Returns the module to load that will procure data -- the data loaders all have to define the same functions.\"\"\"\n", + " if data_set == \"iris\":\n", + " return iris_loader\n", + " elif data_set == \"digits\":\n", + " return digit_loader\n", + " else:\n", + " raise ValueError(f\"Unknown data_name {data_set}.\")\n", + "\n", + "\n", + "def get_model_config(model_type: str) -> dict:\n", + " \"\"\"Returns model type specific configuration\"\"\"\n", + " if model_type == \"svm\":\n", + " return {\"clf\": \"svm\", \"gamma\": 0.001}\n", + " elif model_type == \"logistic\":\n", + " return {\"clf\": \"logistic\", \"penalty\": \"l2\"}\n", + " else:\n", + " raise ValueError(f\"Unsupported model {model_type}.\")" + ] + }, + { + "cell_type": "markdown", + "id": "88ccbc7c-f265-47fa-a921-f26ac3ed7094", + "metadata": {}, + "source": [ + "***\n", + "For the purpose of this experiment, let's apply the following 
configuration:\n", + "\n", + "- `_data_set` = 'digits'\n", + "- `_model_type` = 'logistic'\n", + "\n", + "More details [here](https://github.com/DAGWorks-Inc/hamilton/blob/main/examples/model_examples/scikit-learn/README.md).\n", + "***" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9e5e7282-8286-4055-847f-adb168420da0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Note: Hamilton collects completely anonymous data about usage. This will help us improve Hamilton over time. See https://github.com/dagworks-inc/hamilton#usage-analytics--data-privacy for details.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "classification_report :\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.99 0.99 91\n", + " 1 0.92 0.95 0.94 84\n", + " 2 0.98 1.00 0.99 83\n", + " 3 0.99 0.98 0.98 81\n", + " 4 0.95 0.99 0.97 95\n", + " 5 0.98 0.94 0.96 97\n", + " 6 0.97 0.98 0.97 85\n", + " 7 0.98 0.98 0.98 96\n", + " 8 0.91 0.90 0.91 96\n", + " 9 0.96 0.93 0.94 91\n", + "\n", + " accuracy 0.96 899\n", + " macro avg 0.96 0.96 0.96 899\n", + "weighted avg 0.96 0.96 0.96 899\n", + "\n", + "confusion_matrix :\n", + " [[90 0 0 0 1 0 0 0 0 0]\n", + " [ 0 80 0 0 1 0 1 0 2 0]\n", + " [ 0 0 83 0 0 0 0 0 0 0]\n", + " [ 0 0 0 79 0 0 0 1 0 1]\n", + " [ 0 1 0 0 94 0 0 0 0 0]\n", + " [ 0 1 0 1 1 91 0 1 0 2]\n", + " [ 0 0 0 0 0 0 83 0 2 0]\n", + " [ 0 0 0 0 1 0 0 94 0 1]\n", + " [ 0 5 2 0 0 1 2 0 86 0]\n", + " [ 0 0 0 0 1 1 0 0 4 85]]\n", + "fit_clf :\n", + " LogisticRegression()\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/flaviassantos/github/hamilton/venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + } + ], + "source": [ + "_data_set = 'digits' # the data set to load\n", + "_model_type = 'logistic' # the model type to fit and evaluate with\n", + "\n", + "dag_config = {\n", + " \"test_size_fraction\": 0.5,\n", + " \"shuffle_train_test_split\": True,\n", + "}\n", + "# augment config\n", + "dag_config.update(get_model_config(_model_type))\n", + "# get module with functions to load data\n", + "data_module = get_data_loader(_data_set)\n", + "# set the desired result container we want\n", + "adapter = base.DefaultAdapter()\n", + "\"\"\"\n", + "What's cool about this, is that by simply changing the `dag_config` and the `data_module` we can\n", + "reuse the logic in the `my_train_evaluate_logic` module very easily for different contexts and purposes if we\n", + "want to set up a generic model fitting and prediction dataflow!\n", + "E.g. if we want to support a new data set, then we just need to add a new data loading module.\n", + "E.g. 
if we want to support a new model type, then we just need to add a single conditional function\n", + " to my_train_evaluate_logic.\n", + "\"\"\"\n", + "dr = driver.Driver(dag_config, data_module, my_train_evaluate_logic, adapter=adapter)\n", + "# ensure you have done \"pip install \"sf-hamilton[visualization]\"\" for the following to work:\n", + "# dr.visualize_execution(['classification_report', 'confusion_matrix', 'fit_clf'],\n", + "# f'./model_dag_{_data_set}_{_model_type}.dot', {\"format\": \"png\"})\n", + "results = dr.execute([\"classification_report\", \"confusion_matrix\", \"fit_clf\"])\n", + "for k, v in results.items():\n", + " print(k, \":\\n\", v)" + ] + }, + { + "cell_type": "markdown", + "id": "2035065c-c409-4c21-bd11-733e74623226", + "metadata": {}, + "source": [ + "***\n", + "Here is the graph of execution for the digits data set:\n", + "***" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "17522217-8a09-46da-8b8d-0ba97d278bdc", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "confusion_matrix\n", + "\n", + "confusion_matrix\n", + "\n", + "\n", + "\n", + "classification_report\n", + "\n", + "classification_report\n", + "\n", + "\n", + "\n", + "y_train\n", + "\n", + "y_train\n", + "\n", + "\n", + "\n", + "fit_clf\n", + "\n", + "fit_clf\n", + "\n", + "\n", + "\n", + "y_train->fit_clf\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_test\n", + "\n", + "y_test\n", + "\n", + "\n", + "\n", + "y_test_with_labels\n", + "\n", + "y_test_with_labels\n", + "\n", + "\n", + "\n", + "y_test->y_test_with_labels\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "feature_matrix\n", + "\n", + "feature_matrix\n", + "\n", + "\n", + "\n", + "train_test_split_func\n", + "\n", + "train_test_split_func\n", + "\n", + "\n", + "\n", + "feature_matrix->train_test_split_func\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "target\n", + "\n", + "target\n", + "\n", + "\n", 
+ "\n", + "target->train_test_split_func\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "test_size_fraction\n", + "\n", + "Input: test_size_fraction\n", + "\n", + "\n", + "\n", + "test_size_fraction->train_test_split_func\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_test\n", + "\n", + "X_test\n", + "\n", + "\n", + "\n", + "predicted_output\n", + "\n", + "predicted_output\n", + "\n", + "\n", + "\n", + "X_test->predicted_output\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predicted_output_with_labels\n", + "\n", + "predicted_output_with_labels\n", + "\n", + "\n", + "\n", + "predicted_output_with_labels->confusion_matrix\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predicted_output_with_labels->classification_report\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "train_test_split_func->y_train\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "train_test_split_func->y_test\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "train_test_split_func->X_test\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_train\n", + "\n", + "X_train\n", + "\n", + "\n", + "\n", + "train_test_split_func->X_train\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "fit_clf->predicted_output\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "target_names\n", + "\n", + "target_names\n", + "\n", + "\n", + "\n", + "target_names->predicted_output_with_labels\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "target_names->y_test_with_labels\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_test_with_labels->confusion_matrix\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_test_with_labels->classification_report\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "prefit_clf\n", + "\n", + "prefit_clf\n", + "\n", + "\n", + "\n", + "prefit_clf->fit_clf\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "penalty\n", + "\n", + "Input: penalty\n", + "\n", + "\n", + "\n", + "penalty->prefit_clf\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_train->fit_clf\n", + "\n", + "\n", + "\n", + "\n", + "\n", + 
"shuffle_train_test_split\n", + "\n", + "Input: shuffle_train_test_split\n", + "\n", + "\n", + "\n", + "shuffle_train_test_split->train_test_split_func\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "digit_data\n", + "\n", + "digit_data\n", + "\n", + "\n", + "\n", + "digit_data->feature_matrix\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "digit_data->target\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "digit_data->target_names\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predicted_output->predicted_output_with_labels\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dr.visualize_execution(['classification_report', 'confusion_matrix', 'fit_clf'],\n", + " f'./model_dag_{_data_set}_{_model_type}.dot', {\"format\": \"png\"})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hamilton", + "language": "python", + "name": "hamilton" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}