diff --git a/.gitignore b/.gitignore index 5934c80c..682e8e30 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,8 @@ MANIFEST # Setuptools SCM doubleml/_version.py + +# Virtual environment +.venv +venv/ +env/ diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 0d10dea5..efb0a868 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -16,11 +16,12 @@ from doubleml.utils._sensitivity import _compute_sensitivity_bias from doubleml.utils._tune_optuna import OPTUNA_GLOBAL_SETTING_KEYS, TUNE_ML_MODELS_DOC, resolve_optuna_cv from doubleml.utils.gain_statistics import gain_statistics +from doubleml.utils.maketables_mixin import MakeTablesMixin _implemented_data_backends = ["DoubleMLData", "DoubleMLClusterData", "DoubleMLDIDData", "DoubleMLSSMData", "DoubleMLRDDData"] -class DoubleML(SampleSplittingMixin, ABC): +class DoubleML(MakeTablesMixin, SampleSplittingMixin, ABC): """Double Machine Learning.""" def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, double_sample_splitting=False): diff --git a/doubleml/utils/maketables_mixin.py b/doubleml/utils/maketables_mixin.py new file mode 100644 index 00000000..a26695e7 --- /dev/null +++ b/doubleml/utils/maketables_mixin.py @@ -0,0 +1,188 @@ +""" +MakeTables Mixin for DoubleML Models. + +This module provides a mixin class that adds MakeTables plug-in support to DoubleML models. +The mixin implements the three required attributes for MakeTables compatibility: +- __maketables_coef_table__: Returns coefficient table as DataFrame +- __maketables_stat__: Returns model statistics by key +- __maketables_depvar__: Returns dependent variable name + +This enables zero-coupling integration with MakeTables - DoubleML never imports maketables, +but models automatically work with it when users have maketables installed. +""" + +import numpy as np +import pandas as pd + + +class MakeTablesMixin: + """ + Mixin class for MakeTables plug-in support. 
+ + This mixin adds three properties and one method that enable DoubleML models to automatically work + with the MakeTables package for creating publication-ready regression tables. + + The plug-in format uses duck typing - MakeTables automatically detects these + attributes when present, without requiring any imports or dependencies. + + Attributes + ---------- + __maketables_coef_table__ : pd.DataFrame (property) + Coefficient table with columns 'b' (estimates), 'se' (standard errors), + 'p' (p-values), 't' (t-statistics), 'ci95l', 'ci95u' (95% CI bounds). + + __maketables_depvar__ : str (property) + Name of the dependent variable. + + __maketables_default_stat_keys__ : list (property) + Default statistics to display in tables. + + Methods + ------- + __maketables_stat__(key) + Return model statistic by key (e.g., 'N' for number of observations). + + Examples + -------- + >>> import numpy as np + >>> from doubleml import DoubleMLPLR + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 + >>> from sklearn.ensemble import RandomForestRegressor + >>> # Fit a DoubleML model + >>> np.random.seed(3141) + >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> ml_m = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> obj_dml_data = make_plr_CCDDHNR2018(alpha=0.5, n_obs=500, dim_x=20) + >>> dml_plr = DoubleMLPLR(obj_dml_data, ml_g, ml_m).fit() + >>> # Access maketables attributes + >>> coef_table = dml_plr.__maketables_coef_table__ + >>> n_obs = dml_plr.__maketables_stat__('N') + >>> depvar = dml_plr.__maketables_depvar__ + """ + + @property + def __maketables_coef_table__(self) -> pd.DataFrame: + """ + Return coefficient table with all required and optional columns for MakeTables. + + Returns a pandas DataFrame with coefficient estimates, standard errors, p-values, + t-statistics, and 95% confidence intervals. 
The DataFrame index matches the + treatment variable names from the fitted model. + + Returns + ------- + pd.DataFrame + Coefficient table with columns: + - 'b': coefficient estimates (required) + - 'se': standard errors (required) + - 'p': p-values (required) + - 't': t-statistics (optional) + - 'ci95l': lower 95% confidence interval bound (optional) + - 'ci95u': upper 95% confidence interval bound (optional) + + Notes + ----- + - Returns empty DataFrame with correct columns if model is unfitted or all coefficients are NaN + - Index is set to match the summary table index (treatment variable names) + - Handles edge cases gracefully without raising errors + """ + # Handle unfitted model + if not hasattr(self, "coef") or self.coef is None: + return pd.DataFrame(columns=["b", "se", "t", "p", "ci95l", "ci95u"]) + + # Handle NaN coefficients (model fitted but no valid estimates) + if np.isnan(self.coef).all(): + return pd.DataFrame(columns=["b", "se", "t", "p", "ci95l", "ci95u"]) + + # Get 95% confidence intervals + ci = self.confint(level=0.95) + + # Build coefficient table with required and optional columns + coef_table = pd.DataFrame( + { + "b": self.coef, # Required: coefficient estimates + "se": self.se, # Required: standard errors + "p": self.pval, # Required: p-values + "t": self.t_stat, # Optional: t-statistics + "ci95l": ci.iloc[:, 0], # Optional: lower 95% CI bound + "ci95u": ci.iloc[:, 1], # Optional: upper 95% CI bound + } + ) + + # Set index to match summary table (handles treatment variable names) + if hasattr(self, "summary") and self.summary is not None and len(self.summary) > 0: + coef_table.index = self.summary.index + + return coef_table + + def __maketables_stat__(self, key: str): + """ + Return model statistic by key. + + Parameters + ---------- + key : str + The statistic key to retrieve. 
Common keys include: + - 'N': number of observations + - 'r2': R-squared (not applicable for DoubleML) + - 'adj_r2': adjusted R-squared (not applicable for DoubleML) + - 'aic': Akaike Information Criterion (not applicable for DoubleML) + - 'bic': Bayesian Information Criterion (not applicable for DoubleML) + - 'll': log-likelihood (not applicable for DoubleML) + + Returns + ------- + float, int, or None + The requested statistic value, or None if not available or not applicable. + + Notes + ----- + DoubleML focuses on causal inference, not prediction, so traditional model fit + statistics like R-squared, AIC, and BIC are not applicable and will return None. + Currently only 'N' (number of observations) is supported. + + """ + stats_map = { + "N": self.n_obs if hasattr(self, "n_obs") else None, + } + return stats_map.get(key, None) + + @property + def __maketables_depvar__(self) -> str: + """ + Return the name of the dependent variable. + + Returns + ------- + str + Name of the dependent (outcome) variable. Defaults to "Y" if not available. + + Notes + ----- + Retrieves the dependent variable name from the DoubleMLData object's y_col attribute. + Falls back to "Y" if the attribute is not available. + + """ + if hasattr(self, "_dml_data") and hasattr(self._dml_data, "y_col"): + return self._dml_data.y_col + return "Y" # Fallback + + @property + def __maketables_default_stat_keys__(self) -> list: + """ + Return default statistics to display in MakeTables output. + + Returns + ------- + list + List of statistic keys to display by default. For DoubleML models, + this is ['N'] (number of observations). + + Notes + ----- + This is an optional attribute that helps MakeTables know which statistics + to include in the table by default. Users can override this when calling + ETable() by specifying the model_stats parameter. 
+ + """ + return ["N"] diff --git a/maketables_demo.ipynb b/maketables_demo.ipynb new file mode 100644 index 00000000..3774ac98 --- /dev/null +++ b/maketables_demo.ipynb @@ -0,0 +1,1292 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DoubleML + MakeTables Integration Demo\n", + "\n", + "This notebook demonstrates the MakeTables integration with DoubleML, showing how to create publication-ready regression tables with beautiful HTML and LaTeX output." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from maketables import ETable\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "\n", + "import doubleml as dml\n", + "\n", + "# Set random seed for reproducibility\n", + "np.random.seed(42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1: Basic PLR Model\n", + "\n", + "Let's start with a simple Partially Linear Regression (PLR) model estimating the effect of education on income." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data shape: (1000, 12)\n", + "\n", + "First few rows:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
X1X2X3X4X5X6X7X8X9X10incomeeducation
00.496714-0.1382640.6476891.523030-0.234153-0.2341371.5792130.767435-0.4694740.542560-0.146270-0.471617
1-0.463418-0.4657300.241962-1.913280-1.724918-0.562288-1.0128310.314247-0.908024-1.412304-0.823369-0.676927
21.465649-0.2257760.067528-1.424748-0.5443830.110923-1.1509940.375698-0.600639-0.2916940.5200760.067710
3-0.6017071.852278-0.013497-1.0577110.822545-1.2208440.208864-1.959670-1.3281860.1968610.5050310.365248
40.7384670.171368-0.115648-0.301104-1.478522-0.719844-0.4606391.0571220.343618-1.7630402.1125351.617822
\n", + "
" + ], + "text/plain": [ + " X1 X2 X3 X4 X5 X6 X7 \\\n", + "0 0.496714 -0.138264 0.647689 1.523030 -0.234153 -0.234137 1.579213 \n", + "1 -0.463418 -0.465730 0.241962 -1.913280 -1.724918 -0.562288 -1.012831 \n", + "2 1.465649 -0.225776 0.067528 -1.424748 -0.544383 0.110923 -1.150994 \n", + "3 -0.601707 1.852278 -0.013497 -1.057711 0.822545 -1.220844 0.208864 \n", + "4 0.738467 0.171368 -0.115648 -0.301104 -1.478522 -0.719844 -0.460639 \n", + "\n", + " X8 X9 X10 income education \n", + "0 0.767435 -0.469474 0.542560 -0.146270 -0.471617 \n", + "1 0.314247 -0.908024 -1.412304 -0.823369 -0.676927 \n", + "2 0.375698 -0.600639 -0.291694 0.520076 0.067710 \n", + "3 -1.959670 -1.328186 0.196861 0.505031 0.365248 \n", + "4 1.057122 0.343618 -1.763040 2.112535 1.617822 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Generate synthetic data\n", + "n = 1000\n", + "p = 10\n", + "\n", + "X = np.random.normal(size=(n, p))\n", + "education = 0.5 * X[:, 0] + 0.3 * X[:, 1] + np.random.normal(size=n)\n", + "income = 0.8 * education + X[:, 2] + 0.5 * X[:, 3] + np.random.normal(size=n)\n", + "\n", + "df = pd.DataFrame(\n", + " np.column_stack((X, income, education)),\n", + " columns=[f\"X{i+1}\" for i in range(p)] + [\"income\", \"education\"]\n", + ")\n", + "\n", + "print(f\"Data shape: {df.shape}\")\n", + "print(\"\\nFirst few rows:\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DoubleML Summary:\n", + " coef std err t P>|t| 2.5 % 97.5 %\n", + "education 0.830282 0.032441 25.593745 1.790892e-144 0.766699 0.893865\n" + ] + } + ], + "source": [ + "# Prepare data for DoubleML\n", + "dml_data = dml.DoubleMLData(df, \"income\", \"education\")\n", + "\n", + "# Fit PLR model\n", + "ml_l = LinearRegression()\n", + "ml_m = LinearRegression()\n", + "\n", + "dml_plr = dml.DoubleMLPLR(dml_data, 
ml_l, ml_m, n_folds=5, score=\"partialling out\")\n", + "dml_plr.fit()\n", + "\n", + "# Show standard DoubleML summary\n", + "print(\"DoubleML Summary:\")\n", + "print(dml_plr.summary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect MakeTables Attributes\n", + "\n", + "The model now has special `__maketables_*` attributes that MakeTables uses to create tables:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coefficient Table (__maketables_coef_table__):\n", + " b se p t ci95l ci95u\n", + "education 0.830282 0.032441 1.790892e-144 25.593745 0.766699 0.893865\n", + "\n", + "Sample Size: 1000\n", + "Dependent Variable: income\n", + "Default Statistics: ['N']\n" + ] + } + ], + "source": [ + "# Coefficient table\n", + "print(\"Coefficient Table (__maketables_coef_table__):\")\n", + "print(dml_plr.__maketables_coef_table__)\n", + "\n", + "print(f\"\\nSample Size: {dml_plr.__maketables_stat__('N')}\")\n", + "print(f\"Dependent Variable: {dml_plr.__maketables_depvar__}\")\n", + "print(f\"Default Statistics: {dml_plr.__maketables_default_stat_keys__}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Table with MakeTables\n", + "\n", + "Now let's create a publication-ready table using MakeTables:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " income\n", + "
(1)
coef
education0.830***
(0.032)
stats
Observations1,000
Significance levels: * p < 0.1, ** p < 0.05, *** p < 0.01. Format of coefficient cell: Coefficient (Std. Error)
\n", + "\n", + "
\n", + " " + ], + "text/latex": [ + "\\begin{threeparttable}\n", + "\\begingroup\n", + "\\renewcommand\\cellalign{t}\n", + "\\renewcommand\\arraystretch{1}\n", + "\\setlength{\\tabcolsep}{3pt}\n", + "\\begin{tabularx}{\\linewidth}{@{}>{\\raggedright\\arraybackslash}l>{\\centering\\arraybackslash}X}\n", + "\\toprule\n", + " & \\multicolumn{1}{c}{income} \\\\\n", + "\\cmidrule(lr){2-2}\n", + " & (1) \\\\\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "education & \\makecell{0.830*** \\\\ (0.032)} \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "Observations & 1,000 \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\bottomrule\n", + "\\end{tabularx}\n", + "\\endgroup\n", + "\\noindent\\begin{minipage}{\\linewidth}\\smallskip\\footnotesize\n", + "Significance levels: * p < 0.1, ** p < 0.05, *** p < 0.01. Format of coefficient cell: Coefficient (Std. Error)\\end{minipage}\n", + "\n", + "\\end{threeparttable}" + ], + "text/plain": [ + ".DualOutput at 0x128881f70>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create table\n", + "table = ETable([dml_plr], show_se=True, model_stats=['N'])\n", + "\n", + "table" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LaTeX Table Code:\n", + "\\begin{threeparttable}\n", + "\\begingroup\n", + "\\renewcommand\\cellalign{t}\n", + "\\renewcommand\\arraystretch{1}\n", + "\\setlength{\\tabcolsep}{3pt}\n", + "\\begin{tabularx}{\\linewidth}{@{}>{\\raggedright\\arraybackslash}l>{\\centering\\arraybackslash}X}\n", + "\\toprule\n", + " & \\multicolumn{1}{c}{income} \\\\\n", + "\\cmidrule(lr){2-2}\n", + " & (1) \\\\\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "education & \\makecell{0.830*** \\\\ (0.032)} \\\\\n", + 
"\\addlinespace[0.5ex]\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "Observations & 1,000 \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\bottomrule\n", + "\\end{tabularx}\n", + "\\endgroup\n", + "\\noindent\\begin{minipage}{\\linewidth}\\smallskip\\footnotesize\n", + "Significance levels: * p < 0.1, ** p < 0.05, *** p < 0.01. Format of coefficient cell: Coefficient (Std. Error)\\end{minipage}\n", + "\n", + "\\end{threeparttable}\n" + ] + } + ], + "source": [ + "# Display LaTeX output\n", + "print(\"LaTeX Table Code:\")\n", + "print(table.make('tex'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 2: Comparing Multiple Models\n", + "\n", + "One of the strengths of MakeTables is easily comparing multiple models side-by-side." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 1 (Education effect):\n", + " coef std err t P>|t| 2.5 % 97.5 %\n", + "education 0.620113 0.032109 19.31289 4.184858e-83 0.557181 0.683045\n", + "\n", + "Model 2 (Experience effect):\n", + " coef std err t P>|t| 2.5 % 97.5 %\n", + "experience 0.42868 0.032385 13.237065 5.360313e-40 0.365207 0.492153\n" + ] + } + ], + "source": [ + "# Generate data with two treatments\n", + "np.random.seed(43)\n", + "n = 1000\n", + "p = 8\n", + "\n", + "X = np.random.normal(size=(n, p))\n", + "education = 0.5 * X[:, 0] + 0.2 * X[:, 1] + np.random.normal(size=n)\n", + "experience = 0.3 * X[:, 2] + 0.4 * X[:, 3] + np.random.normal(size=n)\n", + "income = 0.6 * education + 0.4 * experience + X[:, 4] + np.random.normal(size=n)\n", + "\n", + "df2 = pd.DataFrame(\n", + " np.column_stack((X, income, education, experience)),\n", + " columns=[f\"X{i+1}\" for i in range(p)] + [\"income\", \"education\", \"experience\"]\n", + ")\n", + "\n", + "# Fit separate models for each treatment\n", + "dml_data_edu = dml.DoubleMLData(df2, \"income\", \"education\")\n", + 
"dml_data_exp = dml.DoubleMLData(df2, \"income\", \"experience\")\n", + "\n", + "dml_edu = dml.DoubleMLPLR(dml_data_edu, LinearRegression(), LinearRegression(), n_folds=5)\n", + "dml_exp = dml.DoubleMLPLR(dml_data_exp, LinearRegression(), LinearRegression(), n_folds=5)\n", + "\n", + "dml_edu.fit()\n", + "dml_exp.fit()\n", + "\n", + "print(\"Model 1 (Education effect):\")\n", + "print(dml_edu.summary)\n", + "print(\"\\nModel 2 (Experience effect):\")\n", + "print(dml_exp.summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
Comparison of Treatment Effects on Income
\n", + "  \n", + " \n", + " income\n", + "
\n", + " Education Model\n", + " \n", + " Experience Model\n", + "
(1)(2)
coef
education0.620***
(0.032)
experience0.429***
(0.032)
stats
Observations1,0001,000
Significance levels: * p < 0.1, ** p < 0.05, *** p < 0.01. Format of coefficient cell: Coefficient (Std. Error)
\n", + "\n", + "
\n", + " " + ], + "text/latex": [ + "\\begin{table}[htbp]\n", + "\\centering\n", + "\\caption{Comparison of Treatment Effects on Income}\n", + "\\smallskip\n", + "\\begin{threeparttable}\n", + "\\begingroup\n", + "\\renewcommand\\cellalign{t}\n", + "\\renewcommand\\arraystretch{1}\n", + "\\setlength{\\tabcolsep}{3pt}\n", + "\\begin{tabularx}{\\linewidth}{@{}>{\\raggedright\\arraybackslash}l>{\\centering\\arraybackslash}X>{\\centering\\arraybackslash}X}\n", + "\\toprule\n", + " & \\multicolumn{2}{c}{income} \\\\\n", + "\\cmidrule(lr){2-3}\n", + " & \\multicolumn{1}{c}{Education Model} & \\multicolumn{1}{c}{Experience Model} \\\\\n", + "\\cmidrule(lr){2-2} \\cmidrule(lr){3-3}\n", + " & (1) & (2) \\\\\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "education & \\makecell{0.620*** \\\\ (0.032)} & \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\addlinespace[0.5ex]\n", + "experience & & \\makecell{0.429*** \\\\ (0.032)} \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "Observations & 1,000 & 1,000 \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\bottomrule\n", + "\\end{tabularx}\n", + "\\endgroup\n", + "\\noindent\\begin{minipage}{\\linewidth}\\smallskip\\footnotesize\n", + "Significance levels: * p < 0.1, ** p < 0.05, *** p < 0.01. Format of coefficient cell: Coefficient (Std. 
Error)\\end{minipage}\n", + "\n", + "\\end{threeparttable}\n", + "\\end{table}" + ], + "text/plain": [ + ".DualOutput at 0x11f511250>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create comparison table\n", + "comparison_table = ETable(\n", + " [dml_edu, dml_exp],\n", + " show_se=True,\n", + " model_stats=['N'],\n", + " model_heads=['Education Model', 'Experience Model'],\n", + " caption='Comparison of Treatment Effects on Income'\n", + ")\n", + "\n", + "comparison_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 3: Binary Treatment (IRM Model)\n", + "\n", + "Let's demonstrate with a binary treatment using the Interactive Regression Model (IRM)." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IRM Summary:\n", + " coef std err t P>|t| 2.5 % 97.5 %\n", + "treatment 0.635458 0.070924 8.959683 3.256131e-19 0.496449 0.774467\n" + ] + } + ], + "source": [ + "# Generate data with binary treatment\n", + "np.random.seed(44)\n", + "n = 1000\n", + "p = 8\n", + "\n", + "X = np.random.normal(size=(n, p))\n", + "propensity = 1 / (1 + np.exp(-0.5 * X[:, 0] - 0.3 * X[:, 1]))\n", + "treatment = (np.random.uniform(size=n) < propensity).astype(float)\n", + "outcome = 0.7 * treatment + X[:, 2] + 0.5 * X[:, 3] + np.random.normal(size=n)\n", + "\n", + "df_irm = pd.DataFrame(\n", + " np.column_stack((X, outcome, treatment)),\n", + " columns=[f\"X{i+1}\" for i in range(p)] + [\"outcome\", \"treatment\"]\n", + ")\n", + "\n", + "# Fit IRM model\n", + "dml_data_irm = dml.DoubleMLData(df_irm, \"outcome\", \"treatment\")\n", + "\n", + "dml_irm = dml.DoubleMLIRM(\n", + " dml_data_irm,\n", + " LinearRegression(),\n", + " LogisticRegression(max_iter=1000),\n", + " n_folds=5,\n", + 
" score=\"ATE\"\n", + ")\n", + "dml_irm.fit()\n", + "\n", + "print(\"IRM Summary:\")\n", + "print(dml_irm.summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
Average Treatment Effect (ATE) Estimation
\n", + " outcome\n", + "
(1)
coef
treatment0.635***
(0.071)
stats
Observations1,000
Significance levels: * p < 0.1, ** p < 0.05, *** p < 0.01. Format of coefficient cell: Coefficient (Std. Error)
\n", + "\n", + "
\n", + " " + ], + "text/latex": [ + "\\begin{table}[htbp]\n", + "\\centering\n", + "\\caption{Average Treatment Effect (ATE) Estimation}\n", + "\\smallskip\n", + "\\begin{threeparttable}\n", + "\\begingroup\n", + "\\renewcommand\\cellalign{t}\n", + "\\renewcommand\\arraystretch{1}\n", + "\\setlength{\\tabcolsep}{3pt}\n", + "\\begin{tabularx}{\\linewidth}{@{}>{\\raggedright\\arraybackslash}l>{\\centering\\arraybackslash}X}\n", + "\\toprule\n", + " & \\multicolumn{1}{c}{outcome} \\\\\n", + "\\cmidrule(lr){2-2}\n", + " & (1) \\\\\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "treatment & \\makecell{0.635*** \\\\ (0.071)} \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "Observations & 1,000 \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\bottomrule\n", + "\\end{tabularx}\n", + "\\endgroup\n", + "\\noindent\\begin{minipage}{\\linewidth}\\smallskip\\footnotesize\n", + "Significance levels: * p < 0.1, ** p < 0.05, *** p < 0.01. Format of coefficient cell: Coefficient (Std. Error)\\end{minipage}\n", + "\n", + "\\end{threeparttable}\n", + "\\end{table}" + ], + "text/plain": [ + ".DualOutput at 0x1296bdc10>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create table for IRM model\n", + "irm_table = ETable(\n", + " [dml_irm],\n", + " show_se=True,\n", + " model_stats=['N'],\n", + " caption='Average Treatment Effect (ATE) Estimation'\n", + ")\n", + "\n", + "irm_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 4: Customized Table Formatting\n", + "\n", + "MakeTables allows extensive customization of table appearance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create table with custom formatting\n", + "custom_table = ETable(\n", + " [dml_plr],\n", + " coef_fmt=\"b:.3f \\n [ci95l:.3f, ci95u:.3f]\", # Show CI instead of SE\n", + " model_stats=['N'],\n", + " caption='Custom Formatted Table with Confidence Intervals',\n", + " notes='95% confidence intervals shown in brackets.'\n", + ")\n", + "\n", + "# Display HTML with custom styling\n", + "display(custom_table.make('html', gt_style={'table_font_size': '14px'}))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
Table with t-statistics
\n", + " income\n", + "
(1)
coef
education0.8303
(25.59)
stats
Observations1,000
t-statistics shown in parentheses.
\n", + "\n", + "
\n", + " " + ], + "text/latex": [ + "\\begin{table}[htbp]\n", + "\\centering\n", + "\\caption{Table with t-statistics}\n", + "\\smallskip\n", + "\\begin{threeparttable}\n", + "\\begingroup\n", + "\\renewcommand\\cellalign{t}\n", + "\\renewcommand\\arraystretch{1}\n", + "\\setlength{\\tabcolsep}{3pt}\n", + "\\begin{tabularx}{\\linewidth}{@{}>{\\raggedright\\arraybackslash}l>{\\centering\\arraybackslash}X}\n", + "\\toprule\n", + " & \\multicolumn{1}{c}{income} \\\\\n", + "\\cmidrule(lr){2-2}\n", + " & (1) \\\\\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "education & \\makecell{0.8303 \\\\ (25.59)} \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\midrule\n", + "\\addlinespace[1ex]\n", + "Observations & 1,000 \\\\\n", + "\\addlinespace[0.5ex]\n", + "\\bottomrule\n", + "\\end{tabularx}\n", + "\\endgroup\n", + "\\noindent\\begin{minipage}{\\linewidth}\\smallskip\\footnotesize\n", + "t-statistics shown in parentheses.\\end{minipage}\n", + "\n", + "\\end{threeparttable}\n", + "\\end{table}" + ], + "text/plain": [ + ".DualOutput at 0x128881f70>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Another example: showing t-statistics\n", + "t_stat_table = ETable(\n", + " [dml_plr],\n", + " coef_fmt=\"b:.4f \\n (t:.2f)\", # Show t-stat instead of SE\n", + " model_stats=['N'],\n", + " caption='Table with t-statistics',\n", + " notes='t-statistics shown in parentheses.'\n", + ")\n", + "\n", + "t_stat_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 5: Multiple Treatments in One Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fit model with multiple treatments\n", + "dml_data_multi = dml.DoubleMLData(df2, \"income\", [\"education\", \"experience\"])\n", + "\n", + "dml_multi = dml.DoubleMLPLR(\n", + " 
dml_data_multi,\n", + " LinearRegression(),\n", + " LinearRegression(),\n", + " n_folds=5\n", + ")\n", + "dml_multi.fit()\n", + "\n", + "print(\"Multi-treatment Summary:\")\n", + "print(dml_multi.summary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create table\n", + "multi_table = ETable(\n", + " [dml_multi],\n", + " show_se=True,\n", + " model_stats=['N'],\n", + " caption='Joint Estimation of Multiple Treatment Effects',\n", + " labels={'education': 'Years of Education', 'experience': 'Years of Experience'}\n", + ")\n", + "\n", + "multi_table.make('html')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving Tables\n", + "\n", + "You can save tables to files for use in your papers/presentations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save as LaTeX\n", + "table.save('tex', 'table_results.tex')\n", + "print(\"✅ Saved to table_results.tex\")\n", + "\n", + "# Save as HTML\n", + "table.save('html', 'table_results.html')\n", + "print(\"✅ Saved to table_results.html\")\n", + "\n", + "# Save as Word document\n", + "table.save('docx', 'table_results.docx')\n", + "print(\"✅ Saved to table_results.docx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook demonstrated:\n", + "\n", + "1. **Basic Integration**: DoubleML models automatically work with MakeTables\n", + "2. **Model Comparison**: Easy side-by-side comparison of multiple models\n", + "3. **Different Model Types**: Works with PLR, IRM, and other DoubleML models\n", + "4. **Customization**: Flexible formatting options for coefficients and statistics\n", + "5. **Multiple Treatments**: Handles models with multiple treatment variables\n", + "6. 
**Export Options**: Save to LaTeX, HTML, Word, or Typst formats\n", + "\n", + "### Key Advantages\n", + "\n", + "- **Zero Coupling**: DoubleML doesn't depend on MakeTables\n", + "- **Automatic Detection**: MakeTables finds the special attributes automatically\n", + "- **Publication Ready**: Beautiful tables suitable for papers and presentations\n", + "- **Flexible**: Extensive customization options available" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}