From 70d67ad539a7a15fac8dbb968cf4c8d445c49e9d Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:15:31 +0200 Subject: [PATCH 01/40] fix RDDData (finally...) --- doubleml/data/rdd_data.py | 2 +- doubleml/rdd/tests/conftest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/data/rdd_data.py b/doubleml/data/rdd_data.py index f19a4fa0..16f9e1c0 100644 --- a/doubleml/data/rdd_data.py +++ b/doubleml/data/rdd_data.py @@ -261,7 +261,7 @@ def _check_disjoint_sets_score_col(self): def _set_score_var(self): """Set the score variable array.""" if hasattr(self, "_data") and self.score_col in self.data.columns: - self._score = self.data.loc[:, [self.score_col]] + self._score = self.data.loc[:, self.score_col] def __str__(self): """String representation.""" diff --git a/doubleml/rdd/tests/conftest.py b/doubleml/rdd/tests/conftest.py index 75c9272b..9d13deaf 100644 --- a/doubleml/rdd/tests/conftest.py +++ b/doubleml/rdd/tests/conftest.py @@ -35,7 +35,7 @@ def _predict_dummy(data: DoubleMLRDDData, cutoff, alpha, n_rep, p, fs_specificat msg = "rdrobust is not installed. Please install it using 'pip install DoubleML[rdd]'" raise ImportError(msg) - rdrobust_model = rdrobust.rdrobust(y=data.y, x=data.s, c=cutoff, level=100 * (1 - alpha), p=p) + rdrobust_model = rdrobust.rdrobust(y=data.y, x=data.score, c=cutoff, level=100 * (1 - alpha), p=p) reference = { "model": rdrobust_model, From a322e359d5dc2e257fbb35c65f7e976569d337d7 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:31:26 +0200 Subject: [PATCH 02/40] adjsut RDD Class --- doubleml/rdd/rdd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 195fbba4..045789c3 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -22,8 +22,8 @@ class RDFlex: Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + obj_dml_data : :class:`DoubleMLRDDData` object + The :class:`DoubleMLRDDData` object providing the data and specifying the variables for the causal model. ml_g : estimator implementing ``fit()`` and ``predict()`` A machine learner implementing ``fit()`` and ``predict()`` methods and support ``sample_weights`` (e.g. From 0a9b3c7e32948aff252dc51a972c90425bdb521d Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:31:43 +0200 Subject: [PATCH 03/40] adjust DID classes --- doubleml/did/did.py | 12 ++++++------ doubleml/did/did_cs.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doubleml/did/did.py b/doubleml/did/did.py index 170535ea..580d805e 100644 --- a/doubleml/did/did.py +++ b/doubleml/did/did.py @@ -4,7 +4,7 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target -from doubleml.data.base_data import DoubleMLData +from doubleml.data.did_data import DoubleMLDIDData from doubleml.double_ml import DoubleML from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming @@ -17,8 +17,8 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + obj_dml_data : :class:`DoubleMLDIDData` object + The :class:`DoubleMLDIDData` object providing the data and specifying the variables for the causal model. ml_g : estimator implementing ``fit()`` and ``predict()`` A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. @@ -71,7 +71,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5) >>> ml_m = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5) >>> data = make_did_SZ2020(n_obs=500, return_type='DataFrame') - >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd') + >>> obj_dml_data = dml.DoubleMLDIDData(data, 'y', 'd') >>> dml_did_obj = dml.DoubleMLDID(obj_dml_data, ml_g, ml_m) >>> dml_did_obj.fit().summary coef std err t P>|t| 2.5 % 97.5 % @@ -176,9 +176,9 @@ def _initialize_ml_nuisance_params(self): self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner} def _check_data(self, obj_dml_data): - if not isinstance(obj_dml_data, DoubleMLData): + if not isinstance(obj_dml_data, DoubleMLDIDData): raise TypeError( - "For repeated outcomes the data must be of DoubleMLData type. " + "For repeated outcomes the data must be of DoubleMLDIDData type. " f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." ) if obj_dml_data.z_cols is not None: diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index bd7d59dd..38cc4952 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -17,8 +17,8 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + obj_dml_data : :class:`DoubleMLDIDData` object + The :class:`DoubleMLDIDData` object providing the data and specifying the variables for the causal model. ml_g : estimator implementing ``fit()`` and ``predict()`` A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. @@ -71,7 +71,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5) >>> ml_m = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5) >>> data = make_did_SZ2020(n_obs=500, cross_sectional_data=True, return_type='DataFrame') - >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd', t_col='t') + >>> obj_dml_data = dml.DoubleMLDIDData(data, 'y', 'd', t_col='t') >>> dml_did_obj = dml.DoubleMLDIDCS(obj_dml_data, ml_g, ml_m) >>> dml_did_obj.fit().summary coef std err t P>|t| 2.5 % 97.5 % From 37f11dced954198a6e455f1b8a182ec08c3a28a3 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:32:08 +0200 Subject: [PATCH 04/40] Adjust unit tests for DID --- doubleml/did/tests/test_did.py | 4 ++-- doubleml/did/tests/test_did_binary_exceptions.py | 2 +- doubleml/did/tests/test_did_binary_tune.py | 2 +- doubleml/did/tests/test_did_binary_vs_did_panel.py | 2 +- doubleml/did/tests/test_did_binary_vs_did_two_period.py | 2 +- doubleml/did/tests/test_did_cs.py | 4 ++-- doubleml/did/tests/test_did_cs_binary_exceptions.py | 2 +- doubleml/did/tests/test_did_cs_binary_tune.py | 2 +- doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py | 2 +- .../did/tests/test_did_cs_binary_vs_did_cs_two_period.py | 2 +- doubleml/did/tests/test_did_tune.py | 2 +- doubleml/did/tests/test_return_types.py | 6 +++--- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/doubleml/did/tests/test_did.py b/doubleml/did/tests/test_did.py index 90d53a95..79feb110 100644 --- a/doubleml/did/tests/test_did.py +++ b/doubleml/did/tests/test_did.py @@ -57,7 +57,7 @@ def dml_did_fixture(generate_data_did, learner, score, in_sample_normalization, np.random.seed(3141) n_obs = len(y) all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d) np.random.seed(3141) dml_did_obj = dml.DoubleMLDID( @@ -182,7 +182,7 @@ def test_dml_did_experimental(generate_data_did, in_sample_normalization, learne ml_m = clone(learner[1]) np.random.seed(3141) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d) np.random.seed(3141) dml_did_obj_without_ml_m = dml.DoubleMLDID( diff --git a/doubleml/did/tests/test_did_binary_exceptions.py b/doubleml/did/tests/test_did_binary_exceptions.py index c7aa2395..78c09a94 100644 --- a/doubleml/did/tests/test_did_binary_exceptions.py +++ b/doubleml/did/tests/test_did_binary_exceptions.py @@ -85,7 +85,7 @@ def test_check_data_exceptions(): # Test 1: Data has to be DoubleMLPanelData invalid_data_types = [ - dml.data.DoubleMLData(df, y_col="Col_0", d_cols="Col_1"), + dml.data.DoubleMLDIDData(df, y_col="Col_0", d_cols="Col_1"), ] for invalid_data in invalid_data_types: diff --git a/doubleml/did/tests/test_did_binary_tune.py b/doubleml/did/tests/test_did_binary_tune.py index a817223f..0962aa5b 100644 --- a/doubleml/did/tests/test_did_binary_tune.py +++ b/doubleml/did/tests/test_did_binary_tune.py @@ -64,7 +64,7 @@ def dml_did_fixture(generate_data_did_binary, learner_g, learner_m, score, in_sa n_obs = df_panel.shape[0] all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=df_panel["d"]) - obj_dml_data = dml.DoubleMLData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) + obj_dml_data = dml.DoubleMLDIDData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) # Set machine learning methods for m & g ml_g = clone(learner_g) diff --git a/doubleml/did/tests/test_did_binary_vs_did_panel.py b/doubleml/did/tests/test_did_binary_vs_did_panel.py index 426b413c..2eddccaf 100644 --- a/doubleml/did/tests/test_did_binary_vs_did_panel.py +++ b/doubleml/did/tests/test_did_binary_vs_did_panel.py @@ -79,7 +79,7 @@ def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normaliza dml_did_binary_obj.fit() df_wide = dml_did_binary_obj.data_subset.copy() - dml_data = dml.data.DoubleMLData(df_wide, y_col="y_diff", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"]) + dml_data = dml.data.DoubleMLDIDData(df_wide, y_col="y_diff", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"]) dml_did_obj = dml.DoubleMLDID( dml_data, **dml_args, diff --git a/doubleml/did/tests/test_did_binary_vs_did_two_period.py b/doubleml/did/tests/test_did_binary_vs_did_two_period.py index 0db2a752..74575664 100644 --- a/doubleml/did/tests/test_did_binary_vs_did_two_period.py +++ b/doubleml/did/tests/test_did_binary_vs_did_two_period.py @@ -56,7 +56,7 @@ def dml_did_binary_vs_did_fixture(generate_data_did_binary, learner, score, in_s n_obs = df_panel.shape[0] all_smpls = draw_smpls(n_obs, n_folds) - obj_dml_data = dml.DoubleMLData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) + obj_dml_data = dml.DoubleMLDIDData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) # Set machine learning methods for m & g ml_g = clone(learner[0]) diff --git a/doubleml/did/tests/test_did_cs.py b/doubleml/did/tests/test_did_cs.py index ae633588..bc8e2da6 100644 --- a/doubleml/did/tests/test_did_cs.py +++ b/doubleml/did/tests/test_did_cs.py @@ -59,7 +59,7 @@ def dml_did_cs_fixture(generate_data_did_cs, learner, score, in_sample_normaliza n_obs = len(y) all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d + 2 * t) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, t=t) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d, t=t) np.random.seed(3141) dml_did_cs_obj = dml.DoubleMLDIDCS( @@ -185,7 +185,7 @@ def test_dml_did_cs_experimental(generate_data_did_cs, in_sample_normalization, ml_m = clone(learner[1]) np.random.seed(3141) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, t=t) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d, t=t) np.random.seed(3141) dml_did_obj_without_ml_m = dml.DoubleMLDIDCS( diff --git a/doubleml/did/tests/test_did_cs_binary_exceptions.py b/doubleml/did/tests/test_did_cs_binary_exceptions.py index b506da2d..e8d33939 100644 --- a/doubleml/did/tests/test_did_cs_binary_exceptions.py +++ b/doubleml/did/tests/test_did_cs_binary_exceptions.py @@ -85,7 +85,7 @@ def test_check_data_exceptions(): # Test 1: Data has to be DoubleMLPanelData invalid_data_types = [ - dml.data.DoubleMLData(df, y_col="Col_0", d_cols="Col_1"), + dml.data.DoubleMLDIDData(df, y_col="Col_0", d_cols="Col_1"), ] for invalid_data in invalid_data_types: diff --git a/doubleml/did/tests/test_did_cs_binary_tune.py b/doubleml/did/tests/test_did_cs_binary_tune.py index 0bd2c6ab..59db23dd 100644 --- a/doubleml/did/tests/test_did_cs_binary_tune.py +++ b/doubleml/did/tests/test_did_cs_binary_tune.py @@ -63,7 +63,7 @@ def dml_did_fixture(generate_data_did_binary, learner_g, learner_m, score, in_sa dml_panel_data = dml.data.DoubleMLPanelData( df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] ) - obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + obj_dml_data = dml.DoubleMLDIDData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) n_obs = df.shape[0] strata = df["d"] + 2 * df["t"] # only valid since it values are binary diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py index 8fab2615..da7db085 100644 --- a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py @@ -76,7 +76,7 @@ def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normaliza dml_did_binary_obj.fit() df_subset = dml_did_binary_obj.data_subset.copy() - dml_data = dml.data.DoubleMLData( + dml_data = dml.data.DoubleMLDIDData( df_subset, y_col="y", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"], t_col="t_indicator" ) dml_did_obj = dml.DoubleMLDIDCS( diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py index 73e6b827..b9e267ce 100644 --- a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py @@ -55,7 +55,7 @@ def dml_did_cs_binary_vs_did_cs_fixture(generate_data_did_binary, learner, score dml_panel_data = dml.data.DoubleMLPanelData( df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] ) - obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + obj_dml_data = dml.DoubleMLDIDData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) n_obs = df.shape[0] all_smpls = draw_smpls(n_obs, n_folds) diff --git a/doubleml/did/tests/test_did_tune.py b/doubleml/did/tests/test_did_tune.py index 16ec2ee8..25899301 100644 --- a/doubleml/did/tests/test_did_tune.py +++ b/doubleml/did/tests/test_did_tune.py @@ -65,7 +65,7 @@ def dml_did_fixture(generate_data_did, learner_g, learner_m, score, in_sample_no ml_m = clone(learner_m) np.random.seed(3141) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d) dml_did_obj = dml.DoubleMLDID( obj_dml_data, ml_g, diff --git a/doubleml/did/tests/test_return_types.py b/doubleml/did/tests/test_return_types.py index 37105c3e..531a9706 100644 --- a/doubleml/did/tests/test_return_types.py +++ b/doubleml/did/tests/test_return_types.py @@ -3,7 +3,7 @@ import pytest from sklearn.linear_model import Lasso, LogisticRegression -from doubleml.data import DoubleMLData, DoubleMLPanelData +from doubleml.data import DoubleMLDIDData, DoubleMLPanelData from doubleml.did import DoubleMLDID, DoubleMLDIDBinary, DoubleMLDIDCS, DoubleMLDIDCSBinary from doubleml.did.datasets import make_did_CS2021, make_did_cs_CS2021, make_did_SZ2020 from doubleml.utils._check_return_types import ( @@ -37,8 +37,8 @@ (x, y, d, t) = make_did_SZ2020(n_obs=N_OBS, cross_sectional_data=True, return_type="array") binary_outcome = np.random.binomial(n=1, p=0.5, size=N_OBS) -datasets["did_binary_outcome"] = DoubleMLData.from_arrays(x, binary_outcome, d) -datasets["did_cs_binary_outcome"] = DoubleMLData.from_arrays(x, binary_outcome, d, t=t) +datasets["did_binary_outcome"] = DoubleMLDIDData.from_arrays(x, binary_outcome, d) +datasets["did_cs_binary_outcome"] = DoubleMLDIDData.from_arrays(x, binary_outcome, d, t=t) dml_objs = [ (DoubleMLDID(datasets["did"], Lasso(), LogisticRegression(), **dml_args), DoubleMLDID), From 7be2d8f84a67fb2bfae1b33fc09583d0eb3d27da Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:32:18 +0200 Subject: [PATCH 05/40] Adjust RDD unit tests --- doubleml/rdd/tests/test_rdd_exceptions.py | 8 ++++---- doubleml/rdd/tests/test_rdd_return_types.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doubleml/rdd/tests/test_rdd_exceptions.py b/doubleml/rdd/tests/test_rdd_exceptions.py index 6abf901e..71670793 100644 --- a/doubleml/rdd/tests/test_rdd_exceptions.py +++ b/doubleml/rdd/tests/test_rdd_exceptions.py @@ -6,7 +6,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.linear_model import Lasso, LogisticRegression -from doubleml import DoubleMLData +from doubleml import DoubleMLRDDData from doubleml.rdd import RDFlex from doubleml.rdd.datasets import make_simple_rdd_data @@ -17,7 +17,7 @@ columns=["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])], ) -dml_data = DoubleMLData(df, y_col="y", d_cols="d", s_col="score") +dml_data = DoubleMLRDDData(df, y_col="y", d_cols="d", s_col="score") ml_g = Lasso() ml_m = LogisticRegression() @@ -58,8 +58,8 @@ def predict_proba(self, X): @pytest.mark.ci_rdd def test_rdd_exception_data(): - # DoubleMLData - msg = r"The data must be of DoubleMLData type. \[\] of type was passed." + # DoubleMLRDDData + msg = r"The data must be of DoubleMLRDDData type. \[\] of type was passed." with pytest.raises(TypeError, match=msg): _ = RDFlex([], ml_g) diff --git a/doubleml/rdd/tests/test_rdd_return_types.py b/doubleml/rdd/tests/test_rdd_return_types.py index 13248afd..56f2bfe4 100644 --- a/doubleml/rdd/tests/test_rdd_return_types.py +++ b/doubleml/rdd/tests/test_rdd_return_types.py @@ -15,7 +15,7 @@ np.column_stack((data["Y"], data["D"], data["score"], data["X"])), columns=["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])], ) -dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", s_col="score") +dml_data = dml.DoubleMLRDDData(df, y_col="y", d_cols="d", s_col="score") def _assert_return_types(dml_obj): From cbb3818c5ae0c4399d363cd2c8c2f3c2af34e54b Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:46:39 +0200 Subject: [PATCH 06/40] minor changes in high lvl unit tests --- doubleml/tests/test_exceptions.py | 4 ++-- doubleml/tests/test_return_types.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 7839d7c4..665e9a0b 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -75,10 +75,10 @@ def test_doubleml_exception_data(): _ = DoubleMLCVAR(DummyDataClass(pd.DataFrame(np.zeros((100, 10)))), ml_g, ml_m, treatment=1) with pytest.raises(TypeError, match=msg): _ = DoubleMLQTE(DummyDataClass(pd.DataFrame(np.zeros((100, 10)))), ml_g, ml_m) - msg = "For repeated outcomes the data must be of DoubleMLData type." + msg = "For repeated outcomes the data must be of DoubleMLDIDData type." with pytest.raises(TypeError, match=msg): _ = DoubleMLDID(DummyDataClass(pd.DataFrame(np.zeros((100, 10)))), ml_g, ml_m) - msg = "For repeated cross sections the data must be of DoubleMLData type. " + msg = "For repeated cross sections the data must be of DoubleMLDIDData type. " with pytest.raises(TypeError, match=msg): _ = DoubleMLDIDCS(DummyDataClass(pd.DataFrame(np.zeros((100, 10)))), ml_g, ml_m) diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py index fdb680f3..4b914f65 100644 --- a/doubleml/tests/test_return_types.py +++ b/doubleml/tests/test_return_types.py @@ -10,7 +10,7 @@ DoubleMLAPO, DoubleMLData, DoubleMLCVAR, - DoubleMLData, + DoubleMLDIDData, DoubleMLDID, DoubleMLDIDCS, DoubleMLFramework, @@ -38,8 +38,8 @@ dml_data_did_cs = make_did_SZ2020(n_obs=n_obs, cross_sectional_data=True) (x, y, d, t) = make_did_SZ2020(n_obs=n_obs, cross_sectional_data=True, return_type="array") binary_outcome = np.random.binomial(n=1, p=0.5, size=n_obs) -dml_data_did_binary_outcome = DoubleMLData.from_arrays(x, binary_outcome, d) -dml_data_did_cs_binary_outcome = DoubleMLData.from_arrays(x, binary_outcome, d, t=t) +dml_data_did_binary_outcome = DoubleMLDIDData.from_arrays(x, binary_outcome, d) +dml_data_did_cs_binary_outcome = DoubleMLDIDData.from_arrays(x, binary_outcome, d, t=t) dml_data_ssm = make_ssm_data(n_obs=n_obs) dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso()) From fb4f440ef96a9616a9206d0e1b3245c3d8ec6ee6 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 10:16:55 +0200 Subject: [PATCH 07/40] minor changes in high lvl unit tests --- doubleml/tests/test_exceptions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 665e9a0b..3d664e9d 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -42,6 +42,7 @@ dml_data_irm = make_irm_data(n_obs=n) dml_data_iivm = make_iivm_data(n_obs=n) +dml_data_iivm_did = DoubleMLDIDData.from_arrays(dml_data_iivm.data, y_col="y", d_cols="d", t_col="t") dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) dml_data_did = make_did_SZ2020(n_obs=n) dml_data_did_cs = make_did_SZ2020(n_obs=n, cross_sectional_data=True) @@ -236,7 +237,7 @@ def test_doubleml_exception_data(): # DID with IV msg = r"Incompatible data. z have been set as instrumental variable\(s\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLDID(dml_data_iivm, Lasso(), LogisticRegression()) + _ = DoubleMLDID(dml_data_iivm_did, Lasso(), LogisticRegression()) msg = ( "Incompatible data. To fit an DID model with DML exactly one binary variable with values 0 and 1 " "needs to be specified as treatment variable." From cc5a1107b20a8c41551fb64ec6d9c989b9fdc91e Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 10:36:41 +0200 Subject: [PATCH 08/40] fix rdd unit tests --- doubleml/rdd/tests/test_rdd_classifier.py | 2 +- doubleml/rdd/tests/test_rdd_default_values.py | 2 +- doubleml/rdd/tests/test_rdd_exceptions.py | 2 +- doubleml/rdd/tests/test_rdd_return_types.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doubleml/rdd/tests/test_rdd_classifier.py b/doubleml/rdd/tests/test_rdd_classifier.py index 199fe327..1103b957 100644 --- a/doubleml/rdd/tests/test_rdd_classifier.py +++ b/doubleml/rdd/tests/test_rdd_classifier.py @@ -18,7 +18,7 @@ np.column_stack((data["Y_bin"], data["D"], data["score"], data["X"])), columns=["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])], ) -dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", s_col="score") +dml_data = dml.DoubleMLRDDData(df, y_col="y", d_cols="d", score_col="score") @pytest.mark.ci_rdd diff --git a/doubleml/rdd/tests/test_rdd_default_values.py b/doubleml/rdd/tests/test_rdd_default_values.py index 2f0657f1..b2fdcf29 100644 --- a/doubleml/rdd/tests/test_rdd_default_values.py +++ b/doubleml/rdd/tests/test_rdd_default_values.py @@ -15,7 +15,7 @@ np.column_stack((data["Y"], data["D"], data["score"], data["X"])), columns=["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])], ) -dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", s_col="score") +dml_data = dml.DoubleMLRDDData(df, y_col="y", d_cols="d", score_col="score") def _assert_resampling_default_settings(dml_obj): diff --git a/doubleml/rdd/tests/test_rdd_exceptions.py b/doubleml/rdd/tests/test_rdd_exceptions.py index 71670793..c6b89221 100644 --- a/doubleml/rdd/tests/test_rdd_exceptions.py +++ b/doubleml/rdd/tests/test_rdd_exceptions.py @@ -17,7 +17,7 @@ columns=["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])], ) -dml_data = DoubleMLRDDData(df, y_col="y", d_cols="d", s_col="score") +dml_data = DoubleMLRDDData(df, y_col="y", d_cols="d", score_col="score") ml_g = Lasso() ml_m = LogisticRegression() diff --git a/doubleml/rdd/tests/test_rdd_return_types.py b/doubleml/rdd/tests/test_rdd_return_types.py index 56f2bfe4..f7e02427 100644 --- a/doubleml/rdd/tests/test_rdd_return_types.py +++ b/doubleml/rdd/tests/test_rdd_return_types.py @@ -15,7 +15,7 @@ np.column_stack((data["Y"], data["D"], data["score"], data["X"])), columns=["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])], ) -dml_data = dml.DoubleMLRDDData(df, y_col="y", d_cols="d", s_col="score") +dml_data = dml.DoubleMLRDDData(df, y_col="y", d_cols="d", score_col="score") def _assert_return_types(dml_obj): From 80a890ed44d4c5f934b146d8747f3a3aa9c0bcc3 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 10:36:52 +0200 Subject: [PATCH 09/40] fix exception unit test --- doubleml/tests/test_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 3d664e9d..2bc925f9 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -42,7 +42,7 @@ dml_data_irm = make_irm_data(n_obs=n) dml_data_iivm = make_iivm_data(n_obs=n) -dml_data_iivm_did = DoubleMLDIDData.from_arrays(dml_data_iivm.data, y_col="y", d_cols="d", t_col="t") +dml_data_iivm_did = DoubleMLDIDData(dml_data_iivm.data, y_col="y", d_cols="d") dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) dml_data_did = make_did_SZ2020(n_obs=n) dml_data_did_cs = make_did_SZ2020(n_obs=n, cross_sectional_data=True) From 0207b67b62639ff396e965b3d56f0cbba62312f9 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 10:41:56 +0200 Subject: [PATCH 10/40] fix unit tests for cluster variables (kwd arg instead of positional arg) --- doubleml/tests/test_nonlinear_cluster.py | 2 +- doubleml/tests/test_sensitivity_cluster.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doubleml/tests/test_nonlinear_cluster.py b/doubleml/tests/test_nonlinear_cluster.py index 6f19b511..76f595ed 100644 --- a/doubleml/tests/test_nonlinear_cluster.py +++ b/doubleml/tests/test_nonlinear_cluster.py @@ -32,7 +32,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True) +obj_dml_oneway_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" diff --git a/doubleml/tests/test_sensitivity_cluster.py b/doubleml/tests/test_sensitivity_cluster.py index 5b6a7f1e..19b25482 100644 --- a/doubleml/tests/test_sensitivity_cluster.py +++ b/doubleml/tests/test_sensitivity_cluster.py @@ -17,7 +17,7 @@ (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = dml.DoubleMData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True) +obj_dml_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021( N, @@ -29,7 +29,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True) +obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" From 987f8b316e42a182dc734bbd6ee6b59cdfdcd6c6 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 12:52:10 +0200 Subject: [PATCH 11/40] update checks for correct data backend type --- doubleml/irm/apo.py | 1 + doubleml/irm/apos.py | 2 +- doubleml/irm/cvar.py | 1 + doubleml/irm/iivm.py | 1 + doubleml/irm/irm.py | 1 + doubleml/irm/lpq.py | 1 + doubleml/irm/pq.py | 1 + doubleml/irm/qte.py | 3 +-- doubleml/irm/ssm.py | 1 + doubleml/plm/pliv.py | 1 + doubleml/plm/plr.py | 1 + doubleml/rdd/rdd.py | 1 + 12 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index e8c75172..5bfb6aa6 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -102,6 +102,7 @@ def __init__( self._treated = self._dml_data.d == self._treatment_level self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data valid_scores = ["APO"] _check_score(self.score, valid_scores, allow_callable=False) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index c272d0b4..2ef147f1 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -36,8 +36,8 @@ def __init__( draw_sample_splitting=True, ): self._dml_data = obj_dml_data - self._is_cluster_data = obj_dml_data.is_cluster_data self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data self._all_treatment_levels = np.unique(self._dml_data.d) diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py index 57347dce..c0da2616 100644 --- a/doubleml/irm/cvar.py +++ b/doubleml/irm/cvar.py @@ -117,6 +117,7 @@ def __init__( self._normalize_ipw = normalize_ipw self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data valid_score = ["CVaR"] _check_score(self.score, valid_score, allow_callable=False) _check_quantile(self.quantile) diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index 6b8c3c0a..a54694e1 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -142,6 +142,7 @@ def __init__( super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data valid_scores = ["LATE"] _check_score(self.score, valid_scores, allow_callable=True) diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 10f6377c..f4f8b73f 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -138,6 +138,7 @@ def __init__( super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data valid_scores = ["ATE", "ATTE"] _check_score(self.score, valid_scores, allow_callable=True) diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py index f46fb38c..74d3d59f 100644 --- a/doubleml/irm/lpq.py +++ b/doubleml/irm/lpq.py @@ -125,6 +125,7 @@ def __init__( self._normalize_ipw = normalize_ipw self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data valid_score = ["LPQ"] _check_score(self.score, valid_score, allow_callable=False) diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py index d0425845..36069422 100644 --- a/doubleml/irm/pq.py +++ b/doubleml/irm/pq.py @@ -132,6 +132,7 @@ def __init__( self._normalize_ipw = normalize_ipw self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data valid_score = ["PQ"] _check_score(self.score, valid_score, allow_callable=False) diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index de25b3ef..cd03e434 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -124,9 +124,8 @@ def __init__( _check_score(self.score, valid_scores, allow_callable=False) # check data - self._is_cluster_data = False - self._is_cluster_data = obj_dml_data.is_cluster_data self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data # initialize framework which is constructed after the fit method is called self._framework = None diff --git a/doubleml/irm/ssm.py b/doubleml/irm/ssm.py index 2c8479a7..7bf75e9a 100644 --- a/doubleml/irm/ssm.py +++ b/doubleml/irm/ssm.py @@ -124,6 +124,7 @@ def __init__( _check_trimming(self._trimming_rule, self._trimming_threshold) self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data _check_score(self.score, ["missing-at-random", "nonignorable"]) # for both score function stratification by d and s is viable diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py index 52cb796d..48f75260 100644 --- a/doubleml/plm/pliv.py +++ b/doubleml/plm/pliv.py @@ -108,6 +108,7 @@ def __init__( super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data self.partialX = True self.partialZ = False self._check_score(self.score) diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index 4a57dfcb..99a0722a 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -93,6 +93,7 @@ def __init__( super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) self._check_data(self._dml_data) + self._is_cluster_data = self._dml_data.is_cluster_data valid_scores = ["IV-type", "partialling out"] _check_score(self.score, valid_scores, allow_callable=True) diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 045789c3..68b52f93 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -114,6 +114,7 @@ def __init__( self._check_data(obj_dml_data, cutoff) self._dml_data = obj_dml_data + self._is_cluster_data = self._dml_data.is_cluster_data self._score = self._dml_data.score - cutoff self._cutoff = cutoff From 45b1c356de214eff3b1bbb6f153d0b6203f1e3f5 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 12:52:17 +0200 Subject: [PATCH 12/40] adjust unit tests --- doubleml/irm/tests/test_ssm_exceptions.py | 9 +++++++-- doubleml/tests/test_exceptions.py | 8 ++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/doubleml/irm/tests/test_ssm_exceptions.py b/doubleml/irm/tests/test_ssm_exceptions.py index ee67dbec..6df76908 100644 --- a/doubleml/irm/tests/test_ssm_exceptions.py +++ b/doubleml/irm/tests/test_ssm_exceptions.py @@ -22,6 +22,7 @@ class DummyDataClass(DoubleMLBaseData): def __init__(self, data): DoubleMLBaseData.__init__(self, data) + self.is_cluster_data = False @property def n_coefs(self): @@ -30,11 +31,15 @@ def n_coefs(self): @pytest.mark.ci def test_ssm_exception_data(): - msg = "The data must be of DoubleMLData type." + msg = ( + r"The data must be of DoubleMLData or DoubleMLClusterData or DoubleMLDIDData or DoubleMLSSMData or " + r"DoubleMLRDDData type\. Empty DataFrame\nColumns: \[\]\nIndex: \[\] of type " + r" was passed\." + ) with pytest.raises(TypeError, match=msg): _ = DoubleMLSSM(pd.DataFrame(), ml_g, ml_pi, ml_m) - msg = "The data must be of DoubleMLData type." + msg = "The data must be of DoubleMLSSMData type." with pytest.raises(TypeError, match=msg): _ = DoubleMLSSM(DummyDataClass(pd.DataFrame(np.zeros((100, 10)))), ml_g, ml_pi, ml_m) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 2bc925f9..d679afb5 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -42,7 +42,7 @@ dml_data_irm = make_irm_data(n_obs=n) dml_data_iivm = make_iivm_data(n_obs=n) -dml_data_iivm_did = DoubleMLDIDData(dml_data_iivm.data, y_col="y", d_cols="d") +dml_data_iivm_did = DoubleMLDIDData(dml_data_iivm.data, y_col="y", d_cols="d", z_cols="z") dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) dml_data_did = make_did_SZ2020(n_obs=n) dml_data_did_cs = make_did_SZ2020(n_obs=n, cross_sectional_data=True) @@ -246,16 +246,16 @@ def test_doubleml_exception_data(): df_irm["d"] = df_irm["d"] * 2 with pytest.raises(ValueError, match=msg): # non-binary D for DID - _ = DoubleMLDID(DoubleMLData(df_irm, "y", "d"), Lasso(), LogisticRegression()) + _ = DoubleMLDID(DoubleMLDIDData(df_irm, "y", "d"), Lasso(), LogisticRegression()) df_irm = dml_data_irm.data.copy() with pytest.raises(ValueError, match=msg): # multiple D for DID - _ = DoubleMLDID(DoubleMLData(df_irm, "y", ["d", "X1"]), Lasso(), LogisticRegression()) + _ = DoubleMLDID(DoubleMLDIDData(df_irm, "y", ["d", "X1"]), Lasso(), LogisticRegression()) # DIDCS with IV msg = r"Incompatible data. z have been set as instrumental variable\(s\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLDIDCS(dml_data_iivm, Lasso(), LogisticRegression()) + _ = DoubleMLDIDCS(dml_data_iivm_did, Lasso(), LogisticRegression()) # DIDCS treatment exceptions msg = ( From 7c2775085b3626ab5bc0c4bb6e80b4c4483e1d9b Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 13:00:21 +0200 Subject: [PATCH 13/40] adjust unit tests --- doubleml/irm/tests/test_apo_exceptions.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py index cb267a98..5991ee5e 100644 --- a/doubleml/irm/tests/test_apo_exceptions.py +++ b/doubleml/irm/tests/test_apo_exceptions.py @@ -22,7 +22,11 @@ @pytest.mark.ci def test_apo_exception_data(): - msg = "The data must be of DoubleMLData type." + msg = ( + r"The data must be of DoubleMLData or DoubleMLClusterData or DoubleMLDIDData or DoubleMLSSMData or " + r"DoubleMLRDDData type\. Empty DataFrame\nColumns: \[\]\nIndex: \[\] of type " + r" was passed\." + ) with pytest.raises(TypeError, match=msg): _ = DoubleMLAPO(pd.DataFrame(), ml_g, ml_m, treatment_level=0) From 025b75e8d37da92e76d034aefb3da967a6cbf57f Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 13:51:21 +0200 Subject: [PATCH 14/40] adjust unit tests --- doubleml/rdd/tests/test_rdd_exceptions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doubleml/rdd/tests/test_rdd_exceptions.py b/doubleml/rdd/tests/test_rdd_exceptions.py index c6b89221..835cd0f2 100644 --- a/doubleml/rdd/tests/test_rdd_exceptions.py +++ b/doubleml/rdd/tests/test_rdd_exceptions.py @@ -67,12 +67,12 @@ def test_rdd_exception_data(): msg = "Incompatible data. Score variable has not been set. " with pytest.raises(ValueError, match=msg): tmp_dml_data = copy.deepcopy(dml_data) - tmp_dml_data._s_col = None + tmp_dml_data._score_col = None _ = RDFlex(tmp_dml_data, ml_g) msg = "Incompatible data. Score variable has to be continuous. " with pytest.raises(ValueError, match=msg): tmp_dml_data = copy.deepcopy(dml_data) - tmp_dml_data._s = tmp_dml_data._d + tmp_dml_data._score = tmp_dml_data._d _ = RDFlex(tmp_dml_data, ml_g) # existing instruments @@ -128,7 +128,7 @@ def test_rdd_warning_treatment_assignment(): ) with pytest.warns(UserWarning, match=msg): tmp_dml_data = copy.deepcopy(dml_data) - tmp_dml_data._s = -1.0 * tmp_dml_data._s + tmp_dml_data._score = -1.0 * tmp_dml_data._score _ = RDFlex(tmp_dml_data, ml_g, ml_m, fuzzy=True) @@ -169,7 +169,7 @@ def test_rdd_exception_learner(): ) with pytest.warns(UserWarning, match=msg): tmp_dml_data = copy.deepcopy(dml_data) - tmp_dml_data._data["sharp_d"] = tmp_dml_data.s >= 0 + tmp_dml_data._data["sharp_d"] = tmp_dml_data.score >= 0 tmp_dml_data.d_cols = "sharp_d" _ = RDFlex(tmp_dml_data, ml_g, ml_m, fuzzy=False) From 270ed209661ebd60b825be637e38b25df35bedd3 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 15:16:26 +0200 Subject: [PATCH 15/40] Potential fix for code scanning alert no. 419: Unused import Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- doubleml/data/did_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py index 414cdc5b..5a44e7c6 100644 --- a/doubleml/data/did_data.py +++ b/doubleml/data/did_data.py @@ -5,7 +5,7 @@ from doubleml.data.base_data import DoubleMLData from doubleml.utils._estimation import _assure_2d_array -from sklearn.utils.validation import check_array, check_consistent_length, column_or_1d +from sklearn.utils.validation import check_consistent_length, column_or_1d from sklearn.utils.multiclass import type_of_target From c129395c92d72dc933fe8e09409c90add6c99cf1 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 15:17:14 +0200 Subject: [PATCH 16/40] Potential fix for code scanning alert no. 414: Unused local variable Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- doubleml/datasets/fetch_401K.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doubleml/datasets/fetch_401K.py b/doubleml/datasets/fetch_401K.py index 05a97fe7..bc7e49d5 100644 --- a/doubleml/datasets/fetch_401K.py +++ b/doubleml/datasets/fetch_401K.py @@ -40,7 +40,6 @@ def fetch_401K(return_type="DoubleMLData", polynomial_features=False): Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. doi:`10.1111/ectj.12097 `_. """ - _array_alias = _get_array_alias() _data_frame_alias = _get_data_frame_alias() _dml_data_alias = _get_dml_data_alias() From a76d4a7d43ef544f685f917ddfd61298c398cfdf Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 15:17:41 +0200 Subject: [PATCH 17/40] Potential fix for code scanning alert no. 415: Unused local variable Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- doubleml/datasets/fetch_bonus.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doubleml/datasets/fetch_bonus.py b/doubleml/datasets/fetch_bonus.py index 155100c3..7dda3045 100644 --- a/doubleml/datasets/fetch_bonus.py +++ b/doubleml/datasets/fetch_bonus.py @@ -42,7 +42,6 @@ def fetch_bonus(return_type="DoubleMLData", polynomial_features=False): Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. doi:`10.1111/ectj.12097 `_. """ - _array_alias = _get_array_alias() _data_frame_alias = _get_data_frame_alias() _dml_data_alias = _get_dml_data_alias() From 4b9a81b2cd428e9a809f927fe73146487298b661 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 15:17:50 +0200 Subject: [PATCH 18/40] Potential fix for code scanning alert no. 421: Explicit returns mixed with implicit (fall through) returns Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- doubleml/did/datasets/dgp_did_SZ2020.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doubleml/did/datasets/dgp_did_SZ2020.py b/doubleml/did/datasets/dgp_did_SZ2020.py index eb150bbf..e3ceb962 100644 --- a/doubleml/did/datasets/dgp_did_SZ2020.py +++ b/doubleml/did/datasets/dgp_did_SZ2020.py @@ -238,3 +238,4 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty return DoubleMLDIDData(data, y_col="y", d_cols="d", x_cols=z_cols, t_col="t") else: raise ValueError("Invalid return_type.") + return None From 1ffcbc620318641f3ec7ca24460e235ea3d29290 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 15:18:34 +0200 Subject: [PATCH 19/40] Update doubleml/utils/_check_return_types.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- doubleml/utils/_check_return_types.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py index 6852c405..b73e2e04 100644 --- a/doubleml/utils/_check_return_types.py +++ b/doubleml/utils/_check_return_types.py @@ -60,6 +60,8 @@ def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_fold assert isinstance(dml_obj.psi, np.ndarray) assert dml_obj.psi.shape == score_dim + assert isinstance(dml_obj.psi_deriv, np.ndarray) + assert dml_obj.psi_deriv.shape == score_dim is_nonlinear = isinstance(dml_obj, NonLinearScoreMixin) if is_nonlinear: for score_element in dml_obj._score_element_names: From 7a531bffa47d7360a0ffe45b1309a9c06b894bde Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 15:20:47 +0200 Subject: [PATCH 20/40] Potential fix for code scanning alert no. 424: Unused import Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- doubleml/data/did_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py index 5a44e7c6..9ee8f085 100644 --- a/doubleml/data/did_data.py +++ b/doubleml/data/did_data.py @@ -4,7 +4,7 @@ from sklearn.utils import assert_all_finite from doubleml.data.base_data import DoubleMLData -from doubleml.utils._estimation import _assure_2d_array +# Line removed as `_assure_2d_array` is unused. from sklearn.utils.validation import check_consistent_length, column_or_1d from sklearn.utils.multiclass import type_of_target From ca8377c921ac331e8db82d29accf3de5a4e00ee5 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 2 Jul 2025 15:36:20 +0200 Subject: [PATCH 21/40] formatting issues --- doubleml/data/did_data.py | 6 ++---- doubleml/data/panel_data.py | 4 ++-- doubleml/data/rdd_data.py | 5 +++-- doubleml/datasets/fetch_401K.py | 3 ++- doubleml/datasets/fetch_bonus.py | 3 ++- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py index 9ee8f085..a37b8fdf 100644 --- a/doubleml/data/did_data.py +++ b/doubleml/data/did_data.py @@ -1,12 +1,10 @@ import io + import pandas as pd -from sklearn.utils.validation import check_array from sklearn.utils import assert_all_finite +from sklearn.utils.validation import check_consistent_length, column_or_1d from doubleml.data.base_data import DoubleMLData -# Line removed as `_assure_2d_array` is unused. -from sklearn.utils.validation import check_consistent_length, column_or_1d -from sklearn.utils.multiclass import type_of_target class DoubleMLDIDData(DoubleMLData): diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index 204a1bf1..00c8030e 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -107,7 +107,7 @@ def __init__( force_all_x_finite=force_all_x_finite, force_all_d_finite=False, ) - + # reset index to ensure a simple RangeIndex self.data.reset_index(drop=True, inplace=True) @@ -232,7 +232,7 @@ def g_col(self): """ The treatment variable indicating the time of treatment exposure. """ - return self._d_cols[0] + return self._d_cols[0] @ DoubleMLData.d_cols.setter diff --git a/doubleml/data/rdd_data.py b/doubleml/data/rdd_data.py index 16f9e1c0..6bf4a830 100644 --- a/doubleml/data/rdd_data.py +++ b/doubleml/data/rdd_data.py @@ -1,5 +1,5 @@ import io -import numpy as np + import pandas as pd from sklearn.utils.validation import check_array @@ -30,7 +30,8 @@ class DoubleMLRDDData(DoubleMLData): x_cols : None, str or list The covariates. If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor score variable ``score_col`` are used as covariates. + treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor score variable ``score_col`` are + used as covariates. Default is ``None``. z_cols : None, str or list diff --git a/doubleml/datasets/fetch_401K.py b/doubleml/datasets/fetch_401K.py index bc7e49d5..6d99589e 100644 --- a/doubleml/datasets/fetch_401K.py +++ b/doubleml/datasets/fetch_401K.py @@ -3,6 +3,7 @@ """ import pandas as pd + from doubleml import DoubleMLData @@ -42,7 +43,7 @@ def fetch_401K(return_type="DoubleMLData", polynomial_features=False): """ _data_frame_alias = _get_data_frame_alias() _dml_data_alias = _get_dml_data_alias() - + url = "https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta" raw_data = pd.read_stata(url) diff --git a/doubleml/datasets/fetch_bonus.py b/doubleml/datasets/fetch_bonus.py index 7dda3045..7d803414 100644 --- a/doubleml/datasets/fetch_bonus.py +++ b/doubleml/datasets/fetch_bonus.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures + from doubleml import DoubleMLData @@ -44,7 +45,7 @@ def fetch_bonus(return_type="DoubleMLData", polynomial_features=False): """ _data_frame_alias = _get_data_frame_alias() _dml_data_alias = _get_dml_data_alias() - + url = "https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat" raw_data = pd.read_csv(url, sep=r"\s+") From 0428bb00591049216c4370984eb55100cb4db30a Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 1 Sep 2025 08:15:19 +0200 Subject: [PATCH 22/40] formatting --- doubleml/data/did_data.py | 3 +-- doubleml/data/panel_data.py | 3 +-- doubleml/data/ssm_data.py | 7 ++++--- doubleml/data/tests/test_cluster_data.py | 7 +++++-- doubleml/data/tests/test_dml_data.py | 17 ++++++----------- doubleml/datasets/__init__.py | 4 ++-- doubleml/did/datasets/dgp_did_SZ2020.py | 1 - doubleml/did/tests/test_datasets.py | 3 +-- doubleml/double_ml.py | 2 +- doubleml/irm/datasets/__init__.py | 1 - .../irm/datasets/dgp_confounded_irm_data.py | 3 ++- doubleml/plm/datasets/__init__.py | 7 +++---- doubleml/rdd/rdd.py | 7 ++++++- doubleml/tests/test_exceptions.py | 9 ++++++--- doubleml/tests/test_framework.py | 2 +- doubleml/tests/test_model_defaults.py | 2 +- doubleml/tests/test_return_types.py | 5 ++--- 17 files changed, 42 insertions(+), 41 deletions(-) diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py index a37b8fdf..1554e4bc 100644 --- a/doubleml/data/did_data.py +++ b/doubleml/data/did_data.py @@ -81,7 +81,7 @@ def __init__( use_other_treat_as_covariate=True, force_all_x_finite=True, force_all_d_finite=True, - ): # Initialize _t_col to None first to avoid AttributeError during parent init + ): # Initialize _t_col to None first to avoid AttributeError during parent init self._t_col = None # Store whether x_cols was originally None to reset it later @@ -243,7 +243,6 @@ def t_col(self, value): self._check_disjoint_sets() self._set_y_z_t() - @property def t(self): """ diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index 00c8030e..2b0b2e1e 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -234,8 +234,7 @@ def g_col(self): """ return self._d_cols[0] - - @ DoubleMLData.d_cols.setter + @DoubleMLData.d_cols.setter def d_cols(self, value): if isinstance(value, str): value = [value] diff --git a/doubleml/data/ssm_data.py b/doubleml/data/ssm_data.py index 91c50bb0..dc592c96 100644 --- a/doubleml/data/ssm_data.py +++ b/doubleml/data/ssm_data.py @@ -1,7 +1,8 @@ import io + import pandas as pd -from sklearn.utils.validation import check_array from sklearn.utils import assert_all_finite +from sklearn.utils.validation import check_array from doubleml.data.base_data import DoubleMLData from doubleml.utils._estimation import _assure_2d_array @@ -67,7 +68,8 @@ class DoubleMLSSMData(DoubleMLData): >>> df = make_ssm_data(return_type='DataFrame') >>> obj_dml_data_from_df = DoubleMLSSMData(df, 'y', 'd', 's') >>> # initialization from np.ndarray - >>> (x, y, d, s) = make_ssm_data(return_type='array') >>> obj_dml_data_from_array = DoubleMLSSMData.from_arrays(x, y, d, s=s) + >>> (x, y, d, s) = make_ssm_data(return_type='array') + >>> obj_dml_data_from_array = DoubleMLSSMData.from_arrays(x, y, d, s=s) """ def __init__( @@ -225,7 +227,6 @@ def s(self): else: return None - @property def s_col(self): """ diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py index a2cd726f..0432d754 100644 --- a/doubleml/data/tests/test_cluster_data.py +++ b/doubleml/data/tests/test_cluster_data.py @@ -97,7 +97,10 @@ def test_cluster_cols_setter(): with pytest.raises(ValueError, match=msg): dml_data.cluster_cols = "X13" - msg = r"The cluster variable\(s\) cluster_cols must be of str or list type \(or None\)\. " "5 of type was passed." + msg = ( + r"The cluster variable\(s\) cluster_cols must be of str or list type \(or None\)\. " + "5 of type was passed." + ) with pytest.raises(TypeError, match=msg): dml_data.cluster_cols = 5 @@ -215,4 +218,4 @@ def test_cluster_data_str(): ) dml_str_optional = str(dml_data_with_optional) - assert "Time variable: time_var" in dml_str_optional \ No newline at end of file + assert "Time variable: time_var" in dml_str_optional diff --git a/doubleml/data/tests/test_dml_data.py b/doubleml/data/tests/test_dml_data.py index af09e89e..4890ac7a 100644 --- a/doubleml/data/tests/test_dml_data.py +++ b/doubleml/data/tests/test_dml_data.py @@ -3,16 +3,15 @@ import pytest from sklearn.linear_model import Lasso, LogisticRegression -from doubleml import DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM, DoubleMLDIDData, DoubleMLSSMData - +from doubleml import DoubleMLData, DoubleMLDIDCS, DoubleMLDIDData, DoubleMLPLR, DoubleMLSSM, DoubleMLSSMData from doubleml.data.base_data import DoubleMLBaseData +from doubleml.did.datasets import make_did_SZ2020 +from doubleml.irm.datasets import make_ssm_data from doubleml.plm.datasets import ( _make_pliv_data, make_pliv_CHS2015, make_plr_CCDDHNR2018, ) -from doubleml.irm.datasets import make_ssm_data -from doubleml.did.datasets import make_did_SZ2020 class DummyDataClass(DoubleMLBaseData): @@ -462,15 +461,10 @@ def test_disjoint_sets(): _ = DoubleMLDIDData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", t_col="zz") # score or selection variable - msg = ( - r"At least one variable/column is set as outcome variable \(``y_col``\) and selection variable \(``s_col``\)." - ) + msg = r"At least one variable/column is set as outcome variable \(``y_col``\) and selection variable \(``s_col``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="yy") - msg = ( - r"At least one variable/column is set as treatment variable \(``d_cols``\) " - r"and selection variable \(``s_col``\)." - ) + msg = r"At least one variable/column is set as treatment variable \(``d_cols``\) " r"and selection variable \(``s_col``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="dd1") msg = r"At least one variable/column is set as covariate \(``x_cols``\) and selection variable \(``s_col``\)." @@ -483,6 +477,7 @@ def test_disjoint_sets(): with pytest.raises(ValueError, match=msg): _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", s_col="zz") + @pytest.mark.ci def test_duplicates(): np.random.seed(3141) diff --git a/doubleml/datasets/__init__.py b/doubleml/datasets/__init__.py index 6a64a5c8..b09d693d 100644 --- a/doubleml/datasets/__init__.py +++ b/doubleml/datasets/__init__.py @@ -1,12 +1,12 @@ """ -The :mod:`doubleml.datasets` module implements data generating processes for double machine learning simulations and provides access to real datasets. +The :mod:`doubleml.datasets` module implements data generating processes for double machine learning simulations +and provides access to real datasets. """ # Import fetch functions from .fetch_401K import fetch_401K from .fetch_bonus import fetch_bonus - __all__ = [ "fetch_401K", "fetch_bonus", diff --git a/doubleml/did/datasets/dgp_did_SZ2020.py b/doubleml/did/datasets/dgp_did_SZ2020.py index e3ceb962..5e7f283d 100644 --- a/doubleml/did/datasets/dgp_did_SZ2020.py +++ b/doubleml/did/datasets/dgp_did_SZ2020.py @@ -12,7 +12,6 @@ _dml_panel_data_alias = _get_dml_panel_data_alias() - def _generate_features(n_obs, c, dim_x=4): cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=n_obs) diff --git a/doubleml/did/tests/test_datasets.py b/doubleml/did/tests/test_datasets.py index 246d5b87..8e079e9a 100644 --- a/doubleml/did/tests/test_datasets.py +++ b/doubleml/did/tests/test_datasets.py @@ -2,9 +2,8 @@ import pandas as pd import pytest -from doubleml.did.datasets import make_did_CS2021, make_did_cs_CS2021, make_did_SZ2020 from doubleml import DoubleMLDIDData - +from doubleml.did.datasets import make_did_CS2021, make_did_cs_CS2021, make_did_SZ2020 msg_inv_return_type = "Invalid return_type." diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 5d39e1a9..9363c8b9 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -7,7 +7,7 @@ from scipy.stats import norm from sklearn.base import is_classifier, is_regressor -from doubleml.data import DoubleMLPanelData, DoubleMLDIDData, DoubleMLSSMData, DoubleMLRDDData +from doubleml.data import DoubleMLDIDData, DoubleMLPanelData, DoubleMLRDDData, DoubleMLSSMData from doubleml.data.base_data import DoubleMLBaseData from doubleml.double_ml_framework import DoubleMLFramework from doubleml.utils._checks import _check_external_predictions, _check_sample_splitting diff --git a/doubleml/irm/datasets/__init__.py b/doubleml/irm/datasets/__init__.py index 05f95134..c1525eea 100644 --- a/doubleml/irm/datasets/__init__.py +++ b/doubleml/irm/datasets/__init__.py @@ -9,7 +9,6 @@ from .dgp_irm_data_discrete_treatments import make_irm_data_discrete_treatments from .dgp_ssm_data import make_ssm_data - __all__ = [ "make_confounded_irm_data", "make_heterogeneous_data", diff --git a/doubleml/irm/datasets/dgp_confounded_irm_data.py b/doubleml/irm/datasets/dgp_confounded_irm_data.py index 2452e896..392f18a0 100644 --- a/doubleml/irm/datasets/dgp_confounded_irm_data.py +++ b/doubleml/irm/datasets/dgp_confounded_irm_data.py @@ -1,5 +1,6 @@ -import numpy as np import warnings + +import numpy as np from scipy.linalg import toeplitz diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py index f8928902..b2bb7df0 100644 --- a/doubleml/plm/datasets/__init__.py +++ b/doubleml/plm/datasets/__init__.py @@ -2,13 +2,12 @@ The :mod:`doubleml.plm.datasets` module implements data generating processes for partially linear models. """ -from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 -from .dgp_plr_turrell2018 import make_plr_turrell2018 +from ._make_pliv_data import _make_pliv_data from .dgp_confounded_plr_data import make_confounded_plr_data from .dgp_pliv_CHS2015 import make_pliv_CHS2015 from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 -from ._make_pliv_data import _make_pliv_data - +from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 +from .dgp_plr_turrell2018 import make_plr_turrell2018 __all__ = [ "make_plr_CCDDHNR2018", diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 68b52f93..eac0e2f9 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -82,7 +82,12 @@ class RDFlex: >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(123) >>> data_dict = make_simple_rdd_data(fuzzy=True) - >>> obj_dml_data = dml.DoubleMLRDDData.from_arrays(x=data_dict["X"], y=data_dict["Y"], d=data_dict["D"], s=data_dict["score"]) + >>> obj_dml_data = dml.DoubleMLRDDData.from_arrays( + ... x=data_dict["X"], + ... y=data_dict["Y"], + ... d=data_dict["D"], + ... s=data_dict["score"] + ... ) >>> ml_g = RandomForestRegressor() >>> ml_m = RandomForestClassifier() >>> rdflex_obj = dml.rdd.RDFlex(obj_dml_data, ml_g, ml_m, fuzzy=True) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index d679afb5..6d4958c0 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -8,11 +8,11 @@ from doubleml import ( DoubleMLBLP, - DoubleMLDIDData, DoubleMLCVAR, DoubleMLData, DoubleMLDID, DoubleMLDIDCS, + DoubleMLDIDData, DoubleMLIIVM, DoubleMLIRM, DoubleMLLPQ, @@ -21,9 +21,9 @@ DoubleMLPQ, DoubleMLQTE, ) +from doubleml.did.datasets import make_did_SZ2020 from doubleml.irm.datasets import make_iivm_data, make_irm_data from doubleml.plm.datasets import make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 -from doubleml.did.datasets import make_did_SZ2020 from ._utils import DummyDataClass @@ -55,7 +55,10 @@ @pytest.mark.ci def test_doubleml_exception_data(): - msg = "The data must be of DoubleMLData or DoubleMLClusterData or DoubleMLDIDData or DoubleMLSSMData or DoubleMLRDDData type." + msg = ( + "The data must be of DoubleMLData or DoubleMLClusterData or DoubleMLDIDData or " + "DoubleMLSSMData or DoubleMLRDDData type." + ) with pytest.raises(TypeError, match=msg): _ = DoubleMLPLR(pd.DataFrame(), ml_l, ml_m) diff --git a/doubleml/tests/test_framework.py b/doubleml/tests/test_framework.py index 44dabb71..13222664 100644 --- a/doubleml/tests/test_framework.py +++ b/doubleml/tests/test_framework.py @@ -3,8 +3,8 @@ import pytest from sklearn.linear_model import LinearRegression, LogisticRegression -from doubleml.irm.datasets import make_irm_data from doubleml.double_ml_framework import DoubleMLFramework, concat +from doubleml.irm.datasets import make_irm_data from doubleml.irm.irm import DoubleMLIRM from ._utils import generate_dml_dict diff --git a/doubleml/tests/test_model_defaults.py b/doubleml/tests/test_model_defaults.py index 8417468a..b04117eb 100644 --- a/doubleml/tests/test_model_defaults.py +++ b/doubleml/tests/test_model_defaults.py @@ -4,9 +4,9 @@ from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml +from doubleml.did.datasets import make_did_SZ2020 from doubleml.irm.datasets import make_iivm_data, make_irm_data, make_ssm_data from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_CCDDHNR2018 -from doubleml.did.datasets import make_did_SZ2020 np.random.seed(3141) dml_data_plr = make_plr_CCDDHNR2018(n_obs=100) diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py index 4b914f65..fe3d676d 100644 --- a/doubleml/tests/test_return_types.py +++ b/doubleml/tests/test_return_types.py @@ -8,11 +8,10 @@ from doubleml import ( DoubleMLAPO, - DoubleMLData, DoubleMLCVAR, - DoubleMLDIDData, DoubleMLDID, DoubleMLDIDCS, + DoubleMLDIDData, DoubleMLFramework, DoubleMLIIVM, DoubleMLIRM, @@ -23,9 +22,9 @@ DoubleMLPQ, DoubleMLSSM, ) +from doubleml.did.datasets import make_did_SZ2020 from doubleml.irm.datasets import make_iivm_data, make_irm_data, make_ssm_data from doubleml.plm.datasets import make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 -from doubleml.did.datasets import make_did_SZ2020 np.random.seed(3141) n_obs = 200 From 3ff1810db7a966bf67e5bdd8941fb8e74e9fbd5c Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 1 Sep 2025 13:44:59 +0200 Subject: [PATCH 23/40] fix import --- doubleml/irm/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/irm/tests/conftest.py b/doubleml/irm/tests/conftest.py index 1cf1d525..0a3d4db8 100644 --- a/doubleml/irm/tests/conftest.py +++ b/doubleml/irm/tests/conftest.py @@ -4,7 +4,7 @@ from scipy.linalg import toeplitz from sklearn.datasets import make_spd_matrix -from doubleml.datasets import make_iivm_data, make_irm_data +from doubleml.irm.datasets import make_iivm_data, make_irm_data def _g(x): From 10a500a1140f749fa253cc360ec5a755610fae6f Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 1 Sep 2025 13:52:40 +0200 Subject: [PATCH 24/40] correct aliases --- doubleml/did/datasets/dgp_did_SZ2020.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/did/datasets/dgp_did_SZ2020.py b/doubleml/did/datasets/dgp_did_SZ2020.py index 5e7f283d..af46f4ab 100644 --- a/doubleml/did/datasets/dgp_did_SZ2020.py +++ b/doubleml/did/datasets/dgp_did_SZ2020.py @@ -189,7 +189,7 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty return data else: return DoubleMLDIDData(data, y_col="y", d_cols="d", x_cols=z_cols) - elif return_type == "DoubleMLPanelData": + elif return_type in _dml_panel_data_alias: z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] df0 = ( pd.DataFrame( From e735652a57d0a3a0aec53772858d8dec20108ac5 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 1 Sep 2025 14:16:42 +0200 Subject: [PATCH 25/40] add cluster exception test for rdd --- doubleml/data/panel_data.py | 3 --- doubleml/rdd/rdd.py | 3 +++ doubleml/rdd/tests/test_rdd_exceptions.py | 7 +++++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index ba13cd74..22aad0f7 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -309,9 +309,6 @@ def _check_disjoint_sets(self): self._check_disjoint_sets_t_col() def _check_disjoint_sets_id_col(self): - # The call to super()._check_disjoint_sets() is removed from here as it's redundant - # and called in the main _check_disjoint_sets method of this class. - # special checks for the additional id variable (and the time variable) id_col_set = {self.id_col} y_col_set = {self.y_col} diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 1370777c..f9811c9c 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -509,6 +509,9 @@ def _check_data(self, obj_dml_data, cutoff): f"The data must be of DoubleMLRDDData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." ) + if obj_dml_data.is_cluster_data: + raise NotImplementedError("Clustered data is not supported for RDFlex yet.") + # score checks if obj_dml_data.score_col is None: raise ValueError("Incompatible data. " + "Score variable has not been set. ") diff --git a/doubleml/rdd/tests/test_rdd_exceptions.py b/doubleml/rdd/tests/test_rdd_exceptions.py index 9c345080..30153151 100644 --- a/doubleml/rdd/tests/test_rdd_exceptions.py +++ b/doubleml/rdd/tests/test_rdd_exceptions.py @@ -63,6 +63,13 @@ def test_rdd_exception_data(): with pytest.raises(TypeError, match=msg): _ = RDFlex([], ml_g) + # Clusters not implemented + msg = "Clustered data is not supported for RDFlex yet." + with pytest.raises(NotImplementedError, match=msg): + dml_data_clusters = copy.deepcopy(dml_data) + dml_data_clusters._is_cluster_data = True + _ = RDFlex(dml_data_clusters, ml_g, ml_m) + # score column msg = "Incompatible data. Score variable has not been set. " with pytest.raises(ValueError, match=msg): From da8882f678038f49683798e3c73462b48924183e Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Mon, 1 Sep 2025 16:58:53 +0200 Subject: [PATCH 26/40] Infer is_cluster_data from cluster_cols in DoubleMLData Remove is_cluster_data parameter from DoubleMLData.init() and from_arrays() Automatically set _is_cluster_data in cluster_cols setter based on whether cluster_cols is not None Remove is_cluster_data setter to make property read-only and inferred Update dataset generators, tests, and backwards compatibility classes to remove explicit is_cluster_data=True arguments Maintain compatibility by keeping is_cluster_data property for existing code Fixes test collection errors by eliminating unused parameter --- doubleml/data/__init__.py | 12 +++++------- doubleml/data/base_data.py | 16 ++-------------- doubleml/data/tests/test_cluster_data.py | 2 +- .../dgp_pliv_multiway_cluster_CKMS2021.py | 2 +- doubleml/tests/_utils.py | 4 ++-- doubleml/tests/test_exceptions.py | 1 - doubleml/tests/test_multiway_cluster.py | 2 +- doubleml/tests/test_nonlinear_cluster.py | 6 +++--- doubleml/tests/test_sensitivity_cluster.py | 4 ++-- 9 files changed, 17 insertions(+), 32 deletions(-) diff --git a/doubleml/data/__init__.py b/doubleml/data/__init__.py index 0462c763..8343c228 100644 --- a/doubleml/data/__init__.py +++ b/doubleml/data/__init__.py @@ -13,9 +13,9 @@ class DoubleMLClusterData(DoubleMLData): """ - Backwards compatibility wrapper for DoubleMLData with is_cluster_data=True. + Backwards compatibility wrapper for DoubleMLData with cluster_cols. This class is deprecated and will be removed in a future version. - Use DoubleMLData with is_cluster_data=True instead. + Use DoubleMLData with cluster_cols instead. """ def __init__( @@ -33,7 +33,7 @@ def __init__( ): warnings.warn( "DoubleMLClusterData is deprecated and will be removed with version 0.12.0. " - "Use DoubleMLData with is_cluster_data=True instead.", + "Use DoubleMLData with cluster_cols instead.", FutureWarning, stacklevel=2, ) @@ -47,7 +47,6 @@ def __init__( use_other_treat_as_covariate=use_other_treat_as_covariate, force_all_x_finite=force_all_x_finite, force_all_d_finite=True, - is_cluster_data=True, ) @classmethod @@ -57,11 +56,11 @@ def from_arrays( """ Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s. This method is deprecated and will be removed with version 0.12.0, - use DoubleMLData.from_arrays with is_cluster_data=True instead. + use DoubleMLData.from_arrays with cluster_vars instead. """ warnings.warn( "DoubleMLClusterData is deprecated and will be removed with version 0.12.0. " - "Use DoubleMLData.from_arrays with is_cluster_data=True instead.", + "Use DoubleMLData.from_arrays with cluster_vars instead.", FutureWarning, stacklevel=2, ) @@ -74,7 +73,6 @@ def from_arrays( use_other_treat_as_covariate=use_other_treat_as_covariate, force_all_x_finite=force_all_x_finite, force_all_d_finite=True, - is_cluster_data=True, ) diff --git a/doubleml/data/base_data.py b/doubleml/data/base_data.py index b4d59b7b..a7ae30f6 100644 --- a/doubleml/data/base_data.py +++ b/doubleml/data/base_data.py @@ -112,10 +112,6 @@ class DoubleMLData(DoubleMLBaseData): Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. Default is ``True``. - is_cluster_data : bool - Flag indicating whether this data object is being used for cluster data. - Default is ``False``. - force_all_x_finite : bool or str Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are @@ -156,7 +152,6 @@ def __init__( use_other_treat_as_covariate=True, force_all_x_finite=True, force_all_d_finite=True, - is_cluster_data=False, ): DoubleMLBaseData.__init__(self, data) @@ -165,7 +160,6 @@ def __init__( self.z_cols = z_cols self.cluster_cols = cluster_cols self.x_cols = x_cols - self.is_cluster_data = is_cluster_data self._check_disjoint_sets() self.use_other_treat_as_covariate = use_other_treat_as_covariate self.force_all_x_finite = force_all_x_finite @@ -219,7 +213,6 @@ def from_arrays( use_other_treat_as_covariate=True, force_all_x_finite=True, force_all_d_finite=True, - is_cluster_data=False, ): """ Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s. @@ -349,7 +342,6 @@ def from_arrays( use_other_treat_as_covariate, force_all_x_finite, force_all_d_finite, - is_cluster_data, ) @property @@ -415,6 +407,8 @@ def cluster_cols(self, value): else: self._cluster_cols = None + self._is_cluster_data = self._cluster_cols is not None + if reset_value: self._check_disjoint_sets() if self.cluster_cols is not None: @@ -807,9 +801,3 @@ def is_cluster_data(self): Flag indicating whether this data object is being used for cluster data. """ return self._is_cluster_data - - @is_cluster_data.setter - def is_cluster_data(self, value): - if not isinstance(value, bool): - raise TypeError(f"is_cluster_data must be True or False. Got {str(value)}.") - self._is_cluster_data = value diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py index 0432d754..bbb7d97f 100644 --- a/doubleml/data/tests/test_cluster_data.py +++ b/doubleml/data/tests/test_cluster_data.py @@ -164,7 +164,7 @@ def test_duplicates(): msg = r"Invalid cluster variable\(s\) cluster_cols: Contains duplicate values." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(dml_cluster_data.data, y_col="Y", d_cols=["D"], cluster_cols=["X3", "X2", "X3"], is_cluster_data=True) + _ = DoubleMLData(dml_cluster_data.data, y_col="Y", d_cols=["D"], cluster_cols=["X3", "X2", "X3"]) with pytest.raises(ValueError, match=msg): dml_cluster_data.cluster_cols = ["X3", "X2", "X3"] diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py index 0d64c42f..9f08b2ef 100644 --- a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py +++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py @@ -195,7 +195,7 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return return data else: return DoubleMLData( - data, y_col="Y", d_cols="D", cluster_cols=cluster_cols, x_cols=x_cols, z_cols="Z", is_cluster_data=True + data, y_col="Y", d_cols="D", cluster_cols=cluster_cols, x_cols=x_cols, z_cols="Z" ) else: raise ValueError("Invalid return_type.") diff --git a/doubleml/tests/_utils.py b/doubleml/tests/_utils.py index 577ed7ed..907d03d1 100644 --- a/doubleml/tests/_utils.py +++ b/doubleml/tests/_utils.py @@ -9,9 +9,9 @@ class DummyDataClass(DoubleMLBaseData): - def __init__(self, data, is_cluster_data=False): + def __init__(self, data): DoubleMLBaseData.__init__(self, data) - self.is_cluster_data = is_cluster_data + self.is_cluster_data = False @property def n_coefs(self): diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 6d4958c0..56cb61dc 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -1362,7 +1362,6 @@ def test_doubleml_cluster_not_yet_implemented(): x_cols=["X1", "X5"], z_cols="Z", cluster_cols=["cluster_var_i", "cluster_var_j", "cluster_var_k"], - is_cluster_data=True, ) assert dml_cluster_data_multiway.n_cluster_vars == 3 msg = r"Multi-way \(n_ways > 2\) clustering not yet implemented." diff --git a/doubleml/tests/test_multiway_cluster.py b/doubleml/tests/test_multiway_cluster.py index 4537cb4d..f22e913b 100644 --- a/doubleml/tests/test_multiway_cluster.py +++ b/doubleml/tests/test_multiway_cluster.py @@ -288,7 +288,7 @@ def dml_plr_cluster_with_index(generate_data1, learner): dml_plr_obj.fit() df = data.reset_index() - dml_cluster_data = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index", is_cluster_data=True) + dml_cluster_data = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index") np.random.seed(3141) dml_plr_cluster_obj = dml.DoubleMLPLR(dml_cluster_data, ml_l, ml_m, n_folds=n_folds) np.random.seed(3141) diff --git a/doubleml/tests/test_nonlinear_cluster.py b/doubleml/tests/test_nonlinear_cluster.py index 76f595ed..0623b665 100644 --- a/doubleml/tests/test_nonlinear_cluster.py +++ b/doubleml/tests/test_nonlinear_cluster.py @@ -20,7 +20,7 @@ # create data without insturment for plr x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True) +obj_dml_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars) x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021( N, @@ -32,7 +32,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) +obj_dml_oneway_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" @@ -196,7 +196,7 @@ def dml_plr_cluster_nonlinear_with_index(generate_data1, learner): dml_plr_obj.fit() df = data.reset_index() - dml_cluster_data = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index", is_cluster_data=True) + dml_cluster_data = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index") np.random.seed(3141) dml_plr_cluster_obj = DoubleMLPLRWithNonLinearScoreMixin(dml_cluster_data, ml_l, ml_m, n_folds=n_folds) dml_plr_cluster_obj.fit() diff --git a/doubleml/tests/test_sensitivity_cluster.py b/doubleml/tests/test_sensitivity_cluster.py index 19b25482..3be3f409 100644 --- a/doubleml/tests/test_sensitivity_cluster.py +++ b/doubleml/tests/test_sensitivity_cluster.py @@ -17,7 +17,7 @@ (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) +obj_dml_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars) (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021( N, @@ -29,7 +29,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) +obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" From 68017d7757b5753a5d4c30dd1465b75be6a19354 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Mon, 1 Sep 2025 17:02:45 +0200 Subject: [PATCH 27/40] forgot pre-commit... --- doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py index 9f08b2ef..3ccec0f7 100644 --- a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py +++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py @@ -194,8 +194,6 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return if return_type in _data_frame_alias: return data else: - return DoubleMLData( - data, y_col="Y", d_cols="D", cluster_cols=cluster_cols, x_cols=x_cols, z_cols="Z" - ) + return DoubleMLData(data, y_col="Y", d_cols="D", cluster_cols=cluster_cols, x_cols=x_cols, z_cols="Z") else: raise ValueError("Invalid return_type.") From 883f78047636b0c67bad01696f4c80dfb73e6dbf Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 10:09:43 +0200 Subject: [PATCH 28/40] Create `SampleSplittingMixin` for `DoubleML`, `DoubleMLQTE` and `DoubleMLAPOS` classes. --- doubleml/double_ml_sampling_mixins.py | 53 +++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 doubleml/double_ml_sampling_mixins.py diff --git a/doubleml/double_ml_sampling_mixins.py b/doubleml/double_ml_sampling_mixins.py new file mode 100644 index 00000000..0116aa5e --- /dev/null +++ b/doubleml/double_ml_sampling_mixins.py @@ -0,0 +1,53 @@ +from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling + + +class SampleSplittingMixin: + """ + Mixin class implementing sample splitting for DoubleML models. + + Notes + ----- + The mixin class :class:`SampleSplittingMixin` implements the sample splitting procedure for DoubleML models. + The sample splitting is drawn according to the attributes ``n_folds`` and ``n_rep``. + If the data is clustered, the sample splitting is drawn such that clusters are not split across folds. + For details, see the chapter on + `sample splitting `_ in the DoubleML user guide. + """ + + def __init__(self): + self.n_folds = 5 + self.n_rep = 1 + self._smpls = None + self._smpls_cluster = None + self._is_cluster_data = False + self._n_folds_per_cluster = None + self._n_obs_sample_splitting = None + self._strata = None + + def draw_sample_splitting(self): + """ + Draw sample splitting for DoubleML models. + + The samples are drawn according to the attributes + ``n_folds`` and ``n_rep``. + + Returns + ------- + self : object + """ + if self._is_cluster_data: + obj_dml_resampling = DoubleMLClusterResampling( + n_folds=self._n_folds_per_cluster, + n_rep=self.n_rep, + n_obs=self._n_obs_sample_splitting, + n_cluster_vars=self._dml_data.n_cluster_vars, + cluster_vars=self._dml_data.cluster_vars, + ) + self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples() + else: + obj_dml_resampling = DoubleMLResampling( + n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata + ) + self._smpls = obj_dml_resampling.split_samples() + + return self From 1869e323bcf18b3214920763249bb25ebe9b7264 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 10:10:04 +0200 Subject: [PATCH 29/40] Implement SampleSplittingMixin in DoubleML, DoubleMLQTE and DoubleMLAPOS --- doubleml/double_ml.py | 31 ++----------------------------- doubleml/irm/apos.py | 22 ++-------------------- doubleml/irm/qte.py | 25 +++---------------------- 3 files changed, 7 insertions(+), 71 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 3544e718..c2990298 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -10,16 +10,16 @@ from doubleml.data import DoubleMLDIDData, DoubleMLPanelData, DoubleMLRDDData, DoubleMLSSMData from doubleml.data.base_data import DoubleMLBaseData from doubleml.double_ml_framework import DoubleMLFramework +from doubleml.double_ml_sampling_mixins import SampleSplittingMixin from doubleml.utils._checks import _check_external_predictions, _check_sample_splitting from doubleml.utils._estimation import _aggregate_coefs_and_ses, _rmse, _set_external_predictions, _var_est from doubleml.utils._sensitivity import _compute_sensitivity_bias from doubleml.utils.gain_statistics import gain_statistics -from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling _implemented_data_backends = ["DoubleMLData", "DoubleMLClusterData", "DoubleMLDIDData", "DoubleMLSSMData", "DoubleMLRDDData"] -class DoubleML(ABC): +class DoubleML(SampleSplittingMixin, ABC): """Double Machine Learning.""" def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): @@ -1247,33 +1247,6 @@ def evaluate_learners(self, learners=None, metric=_rmse): f"The learners have to be a subset of {str(self.params_names)}. Learners {str(learners)} provided." ) - def draw_sample_splitting(self): - """ - Draw sample splitting for DoubleML models. - - The samples are drawn according to the attributes - ``n_folds`` and ``n_rep``. - - Returns - ------- - self : object - """ - if self._is_cluster_data: - obj_dml_resampling = DoubleMLClusterResampling( - n_folds=self._n_folds_per_cluster, - n_rep=self.n_rep, - n_obs=self._n_obs_sample_splitting, - n_cluster_vars=self._dml_data.n_cluster_vars, - cluster_vars=self._dml_data.cluster_vars, - ) - self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples() - else: - obj_dml_resampling = DoubleMLResampling( - n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata - ) - self._smpls = obj_dml_resampling.split_samples() - - return self def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): """ diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 2ef147f1..0552770d 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -9,15 +9,15 @@ from doubleml.data import DoubleMLData from doubleml.double_ml import DoubleML from doubleml.double_ml_framework import concat +from doubleml.double_ml_sampling_mixins import SampleSplittingMixin from doubleml.irm.apo import DoubleMLAPO from doubleml.utils._checks import _check_sample_splitting, _check_score, _check_trimming, _check_weights from doubleml.utils._descriptive import generate_summary from doubleml.utils._sensitivity import _compute_sensitivity_bias from doubleml.utils.gain_statistics import gain_statistics -from doubleml.utils.resampling import DoubleMLResampling -class DoubleMLAPOS: +class DoubleMLAPOS(SampleSplittingMixin): """Double machine learning for interactive regression models with multiple discrete treatments.""" def __init__( @@ -625,24 +625,6 @@ def sensitivity_benchmark(self, benchmarking_set, fit_args=None): df_benchmark = pd.DataFrame(benchmark_dict, index=self.treatment_levels) return df_benchmark - def draw_sample_splitting(self): - """ - Draw sample splitting for DoubleML models. - - The samples are drawn according to the attributes - ``n_folds`` and ``n_rep``. - - Returns - ------- - self : object - """ - obj_dml_resampling = DoubleMLResampling( - n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._dml_data.n_obs, stratify=self._dml_data.d - ) - self._smpls = obj_dml_resampling.split_samples() - - return self - def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): """ Set the sample splitting for DoubleML models. diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index f104f600..e7f073a7 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -5,16 +5,16 @@ from doubleml.data import DoubleMLData from doubleml.double_ml_framework import concat +from doubleml.double_ml_sampling_mixins import SampleSplittingMixin from doubleml.irm.cvar import DoubleMLCVAR from doubleml.irm.lpq import DoubleMLLPQ from doubleml.irm.pq import DoubleMLPQ from doubleml.utils._checks import _check_sample_splitting, _check_score, _check_trimming, _check_zero_one_treatment from doubleml.utils._descriptive import generate_summary from doubleml.utils._estimation import _default_kde -from doubleml.utils.resampling import DoubleMLResampling -class DoubleMLQTE: +class DoubleMLQTE(SampleSplittingMixin): """Double machine learning for quantile treatment effects Parameters @@ -145,6 +145,7 @@ def __init__( # perform sample splitting self._smpls = None + self._n_obs_sample_splitting = self._dml_data.n_obs if draw_sample_splitting: self.draw_sample_splitting() # initialize all models @@ -440,26 +441,6 @@ def bootstrap(self, method="normal", n_rep_boot=500): return self - def draw_sample_splitting(self): - """ - Draw sample splitting for DoubleML models. - - The samples are drawn according to the attributes - ``n_folds`` and ``n_rep``. - - Returns - ------- - self : object - """ - obj_dml_resampling = DoubleMLResampling( - n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._dml_data.n_obs, stratify=self._dml_data.d - ) - self._smpls = obj_dml_resampling.split_samples() - # initialize all models - self._modellist_0, self._modellist_1 = self._initialize_models() - - return self - def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): """ Set the sample splitting for DoubleML models. From b00b7ae90eb0cca5f7244481032d13365a66a42f Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 10:15:50 +0200 Subject: [PATCH 30/40] fix n_obs in APOs --- doubleml/irm/apos.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 0552770d..c1b9f1ab 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -88,6 +88,7 @@ def __init__( # perform sample splitting self._smpls = None + self._n_obs_sample_splitting = self._dml_data.n_obs if draw_sample_splitting: self.draw_sample_splitting() From 5ef471416fc50a8411d5422a032c46ec6d6bcedf Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 10:32:58 +0200 Subject: [PATCH 31/40] add `_strata` attribute to classes before sample splitting --- doubleml/irm/apos.py | 1 + doubleml/irm/qte.py | 1 + 2 files changed, 2 insertions(+) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index c1b9f1ab..562ef837 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -89,6 +89,7 @@ def __init__( # perform sample splitting self._smpls = None self._n_obs_sample_splitting = self._dml_data.n_obs + self._strata = None if draw_sample_splitting: self.draw_sample_splitting() diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index e7f073a7..fc5d5263 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -146,6 +146,7 @@ def __init__( # perform sample splitting self._smpls = None self._n_obs_sample_splitting = self._dml_data.n_obs + self._strata = self._dml_data.d if draw_sample_splitting: self.draw_sample_splitting() # initialize all models From aa61bd98599c532de44d07f4757e712d7be9280a Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 12:38:42 +0200 Subject: [PATCH 32/40] add `set_sample_splitting` method to mixin --- doubleml/double_ml_sampling_mixins.py | 81 ++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/doubleml/double_ml_sampling_mixins.py b/doubleml/double_ml_sampling_mixins.py index 0116aa5e..a345ec59 100644 --- a/doubleml/double_ml_sampling_mixins.py +++ b/doubleml/double_ml_sampling_mixins.py @@ -1,4 +1,6 @@ +from abc import abstractmethod from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling +from doubleml.utils._checks import _check_sample_splitting class SampleSplittingMixin: @@ -26,7 +28,7 @@ def __init__(self): def draw_sample_splitting(self): """ - Draw sample splitting for DoubleML models. + Draw sample splitting for DoubleML models. The samples are drawn according to the attributes ``n_folds`` and ``n_rep``. @@ -51,3 +53,80 @@ def draw_sample_splitting(self): self._smpls = obj_dml_resampling.split_samples() return self + + def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): + """ + Set the sample splitting for DoubleML models. + + The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition. + + Parameters + ---------- + all_smpls : list or tuple + If nested list of lists of tuples: + The outer list needs to provide an entry per repeated sample splitting (length of list is set as + ``n_rep``). + The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as + ``n_folds``). test_ind must form a partition for each inner list. + If list of tuples: + The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as + ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set. + If tuple: + Must be a tuple with two elements train_ind and test_ind. Only viable option is to set + train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting. + ``n_folds=1`` and ``n_rep=1`` is always set. + + all_smpls_cluster : list or None + Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level + of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and + testing lists. Both training and testing contain an array for each cluster variable, which form a partition of + the clusters. + Default is ``None``. + + Returns + ------- + self : object + + Examples + -------- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.base import clone + >>> np.random.seed(3141) + >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10) + >>> ml_g = learner + >>> ml_m = learner + >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5) + >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) + >>> # simple sample splitting with two folds and without cross-fitting + >>> smpls = ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]) + >>> dml_plr_obj.set_sample_splitting(smpls) + >>> # sample splitting with two folds and cross-fitting + >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), + >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])] + >>> dml_plr_obj.set_sample_splitting(smpls) + >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2 + >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), + >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])], + >>> [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]), + >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] + >>> dml_plr_obj.set_sample_splitting(smpls) + """ + self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( + all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self._n_obs_sample_splitting + ) + + self._initialize_dml_model() + + return self + + @abstractmethod + def _initialize_dml_model(self): + """ + Set sample splitting for DoubleML models. Can update the number of repetitions. + Updates model dimensions to (n_folds, n_rep). + This method needs to be implemented in the child class. + """ + pass From d98efe875a616998a8d02dd1fdb4d35cd1f3572f Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 12:39:02 +0200 Subject: [PATCH 33/40] fix stratification for APOs model --- doubleml/irm/apos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 562ef837..8ff345c0 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -89,7 +89,7 @@ def __init__( # perform sample splitting self._smpls = None self._n_obs_sample_splitting = self._dml_data.n_obs - self._strata = None + self._strata = self._dml_data.d if draw_sample_splitting: self.draw_sample_splitting() From cc7963ec00d68d810a309a079cf13d9cd2b0a60d Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 12:39:26 +0200 Subject: [PATCH 34/40] refactor `DoubleML` to use mixin --- doubleml/double_ml.py | 72 ++----------------------------------------- 1 file changed, 2 insertions(+), 70 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index c2990298..bba78a0f 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -111,9 +111,7 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): if draw_sample_splitting: self.draw_sample_splitting() - self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) - # initialize arrays according to obj_dml_data and the resampling settings - self._initialize_arrays() + self._initialize_dml_model() # initialize instance attributes which are later used for iterating self._i_rep = None @@ -1247,76 +1245,10 @@ def evaluate_learners(self, learners=None, metric=_rmse): f"The learners have to be a subset of {str(self.params_names)}. Learners {str(learners)} provided." ) - - def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): - """ - Set the sample splitting for DoubleML models. - - The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition. - - Parameters - ---------- - all_smpls : list or tuple - If nested list of lists of tuples: - The outer list needs to provide an entry per repeated sample splitting (length of list is set as - ``n_rep``). - The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as - ``n_folds``). test_ind must form a partition for each inner list. - If list of tuples: - The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as - ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set. - If tuple: - Must be a tuple with two elements train_ind and test_ind. Only viable option is to set - train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting. - ``n_folds=1`` and ``n_rep=1`` is always set. - - all_smpls_cluster : list or None - Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level - of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and - testing lists. Both training and testing contain an array for each cluster variable, which form a partition of - the clusters. - Default is ``None``. - - Returns - ------- - self : object - - Examples - -------- - >>> import numpy as np - >>> import doubleml as dml - >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 - >>> from sklearn.ensemble import RandomForestRegressor - >>> from sklearn.base import clone - >>> np.random.seed(3141) - >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10) - >>> ml_g = learner - >>> ml_m = learner - >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5) - >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) - >>> # simple sample splitting with two folds and without cross-fitting - >>> smpls = ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]) - >>> dml_plr_obj.set_sample_splitting(smpls) - >>> # sample splitting with two folds and cross-fitting - >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), - >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])] - >>> dml_plr_obj.set_sample_splitting(smpls) - >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2 - >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), - >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])], - >>> [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]), - >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] - >>> dml_plr_obj.set_sample_splitting(smpls) - """ - self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( - all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self._n_obs_sample_splitting - ) - - # set sample splitting can update the number of repetitions + def _initialize_dml_model(self): self._score_dim = (self._score_dim[0], self._n_rep, self._score_dim[2]) self._initialize_arrays() self._initialize_ml_nuisance_params() - return self def _est_causal_pars(self, psi_elements): From 447b628546527d655ebe62b6cf0b4d59b0ca69de Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 12:52:03 +0200 Subject: [PATCH 35/40] implement new methods of mixin into model classes --- doubleml/double_ml.py | 7 +++-- doubleml/irm/apos.py | 58 ++----------------------------------- doubleml/irm/qte.py | 67 ++----------------------------------------- 3 files changed, 8 insertions(+), 124 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index bba78a0f..62f4b453 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -11,7 +11,7 @@ from doubleml.data.base_data import DoubleMLBaseData from doubleml.double_ml_framework import DoubleMLFramework from doubleml.double_ml_sampling_mixins import SampleSplittingMixin -from doubleml.utils._checks import _check_external_predictions, _check_sample_splitting +from doubleml.utils._checks import _check_external_predictions from doubleml.utils._estimation import _aggregate_coefs_and_ses, _rmse, _set_external_predictions, _var_est from doubleml.utils._sensitivity import _compute_sensitivity_bias from doubleml.utils.gain_statistics import gain_statistics @@ -110,7 +110,7 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): self._n_obs_sample_splitting = self.n_obs if draw_sample_splitting: self.draw_sample_splitting() - + self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) self._initialize_dml_model() # initialize instance attributes which are later used for iterating @@ -1248,7 +1248,8 @@ def evaluate_learners(self, learners=None, metric=_rmse): def _initialize_dml_model(self): self._score_dim = (self._score_dim[0], self._n_rep, self._score_dim[2]) self._initialize_arrays() - self._initialize_ml_nuisance_params() + if self._learner: + self._initialize_ml_nuisance_params() return self def _est_causal_pars(self, psi_elements): diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 8ff345c0..59e4a33b 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -11,7 +11,7 @@ from doubleml.double_ml_framework import concat from doubleml.double_ml_sampling_mixins import SampleSplittingMixin from doubleml.irm.apo import DoubleMLAPO -from doubleml.utils._checks import _check_sample_splitting, _check_score, _check_trimming, _check_weights +from doubleml.utils._checks import _check_score, _check_trimming, _check_weights from doubleml.utils._descriptive import generate_summary from doubleml.utils._sensitivity import _compute_sensitivity_bias from doubleml.utils.gain_statistics import gain_statistics @@ -627,62 +627,8 @@ def sensitivity_benchmark(self, benchmarking_set, fit_args=None): df_benchmark = pd.DataFrame(benchmark_dict, index=self.treatment_levels) return df_benchmark - def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): - """ - Set the sample splitting for DoubleML models. - - The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition. - - Parameters - ---------- - all_smpls : list or tuple - If nested list of lists of tuples: - The outer list needs to provide an entry per repeated sample splitting (length of list is set as - ``n_rep``). - The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as - ``n_folds``). test_ind must form a partition for each inner list. - If list of tuples: - The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as - ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set. - If tuple: - Must be a tuple with two elements train_ind and test_ind. Only viable option is to set - train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting. - ``n_folds=1`` and ``n_rep=1`` is always set. - - Returns - ------- - self : object - - Examples - -------- - >>> import numpy as np - >>> import doubleml as dml - >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 - >>> from sklearn.ensemble import RandomForestRegressor - >>> from sklearn.base import clone - >>> np.random.seed(3141) - >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10) - >>> ml_g = learner - >>> ml_m = learner - >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5) - >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) - >>> # sample splitting with two folds and cross-fitting - >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), - >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])] - >>> dml_plr_obj.set_sample_splitting(smpls) - >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2 - >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), - >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])], - >>> [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]), - >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] - >>> dml_plr_obj.set_sample_splitting(smpls) - """ - self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( - all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data - ) - + def _initialize_dml_model(self): self._modellist = self._initialize_models() - return self def causal_contrast(self, reference_levels): diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index fc5d5263..d2b10bcd 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -9,7 +9,7 @@ from doubleml.irm.cvar import DoubleMLCVAR from doubleml.irm.lpq import DoubleMLLPQ from doubleml.irm.pq import DoubleMLPQ -from doubleml.utils._checks import _check_sample_splitting, _check_score, _check_trimming, _check_zero_one_treatment +from doubleml.utils._checks import _check_score, _check_trimming, _check_zero_one_treatment from doubleml.utils._descriptive import generate_summary from doubleml.utils._estimation import _default_kde @@ -439,74 +439,11 @@ def bootstrap(self, method="normal", n_rep_boot=500): if self._framework is None: raise ValueError("Apply fit() before bootstrap().") self._framework.bootstrap(method=method, n_rep_boot=n_rep_boot) - return self - def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): - """ - Set the sample splitting for DoubleML models. - - The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition. - - Parameters - ---------- - all_smpls : list or tuple - If nested list of lists of tuples: - The outer list needs to provide an entry per repeated sample splitting (length of list is set as - ``n_rep``). - The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as - ``n_folds``). test_ind must form a partition for each inner list. - If list of tuples: - The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as - ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set. - If tuple: - Must be a tuple with two elements train_ind and test_ind. Only viable option is to set - train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting. - ``n_folds=1`` and ``n_rep=1`` is always set. - - all_smpls_cluster : list or None - Nested list or ``None``. The first level of nesting corresponds to the number of repetitions. The second level - of nesting corresponds to the number of folds. The third level of nesting contains a tuple of training and - testing lists. Both training and testing contain an array for each cluster variable, which form a partition of - the clusters. - Default is ``None``. - - Returns - ------- - self : object - - Examples - -------- - >>> import numpy as np - >>> import doubleml as dml - >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 - >>> from sklearn.ensemble import RandomForestRegressor - >>> from sklearn.base import clone - >>> np.random.seed(3141) - >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10) - >>> ml_g = learner - >>> ml_m = learner - >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5) - >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) - >>> dml_plr_obj.set_sample_splitting(smpls) - >>> # sample splitting with two folds and cross-fitting - >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), - >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])] - >>> dml_plr_obj.set_sample_splitting(smpls) - >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2 - >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), - >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])], - >>> [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]), - >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] - >>> dml_plr_obj.set_sample_splitting(smpls) - """ - self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( - all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data - ) - + def _initialize_dml_model(self): # initialize all models self._modellist_0, self._modellist_1 = self._initialize_models() - return self def confint(self, joint=False, level=0.95): From 99bf7119a8f2efabf2badf6b6cee569eab2e3686 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 13:11:32 +0200 Subject: [PATCH 36/40] add new `_initialize_dml_model` method to `__init__` methods --- doubleml/double_ml.py | 2 +- doubleml/irm/apos.py | 2 +- doubleml/irm/qte.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 62f4b453..faf4c362 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1248,7 +1248,7 @@ def evaluate_learners(self, learners=None, metric=_rmse): def _initialize_dml_model(self): self._score_dim = (self._score_dim[0], self._n_rep, self._score_dim[2]) self._initialize_arrays() - if self._learner: + if self._learner: # for calling in __init__ of subclasses, we need to check if _learner is already set self._initialize_ml_nuisance_params() return self diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 59e4a33b..5a6d41fc 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -94,7 +94,7 @@ def __init__( self.draw_sample_splitting() # initialize all models if splits are known - self._modellist = self._initialize_models() + self._initialize_dml_model() def __str__(self): class_name = self.__class__.__name__ diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index d2b10bcd..f896b078 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -150,7 +150,7 @@ def __init__( if draw_sample_splitting: self.draw_sample_splitting() # initialize all models - self._modellist_0, self._modellist_1 = self._initialize_models() + self._initialize_dml_model() def __str__(self): class_name = self.__class__.__name__ From 61ef95dd230cc56981d25cf062642374242fc059 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 13:18:10 +0200 Subject: [PATCH 37/40] run pre-commit --- doubleml/double_ml.py | 2 +- doubleml/double_ml_sampling_mixins.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index faf4c362..4fbf0bd3 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1248,7 +1248,7 @@ def evaluate_learners(self, learners=None, metric=_rmse): def _initialize_dml_model(self): self._score_dim = (self._score_dim[0], self._n_rep, self._score_dim[2]) self._initialize_arrays() - if self._learner: # for calling in __init__ of subclasses, we need to check if _learner is already set + if self._learner: # for calling in __init__ of subclasses, we need to check if _learner is already set self._initialize_ml_nuisance_params() return self diff --git a/doubleml/double_ml_sampling_mixins.py b/doubleml/double_ml_sampling_mixins.py index a345ec59..67b6f245 100644 --- a/doubleml/double_ml_sampling_mixins.py +++ b/doubleml/double_ml_sampling_mixins.py @@ -1,6 +1,7 @@ from abc import abstractmethod -from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling + from doubleml.utils._checks import _check_sample_splitting +from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling class SampleSplittingMixin: From 4a4bd174514e86ef1b9a41730e6bc796cb667fb9 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 13:20:06 +0200 Subject: [PATCH 38/40] adjust mixin class --- doubleml/double_ml_sampling_mixins.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/doubleml/double_ml_sampling_mixins.py b/doubleml/double_ml_sampling_mixins.py index 67b6f245..e9cc0253 100644 --- a/doubleml/double_ml_sampling_mixins.py +++ b/doubleml/double_ml_sampling_mixins.py @@ -17,16 +17,6 @@ class SampleSplittingMixin: `sample splitting `_ in the DoubleML user guide. """ - def __init__(self): - self.n_folds = 5 - self.n_rep = 1 - self._smpls = None - self._smpls_cluster = None - self._is_cluster_data = False - self._n_folds_per_cluster = None - self._n_obs_sample_splitting = None - self._strata = None - def draw_sample_splitting(self): """ Draw sample splitting for DoubleML models. From 4ed1f51d6c47a451f08a3b585d36bda5f3981f20 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 2 Oct 2025 14:20:29 +0200 Subject: [PATCH 39/40] remove `pass` statement in abstract method. --- doubleml/double_ml_sampling_mixins.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doubleml/double_ml_sampling_mixins.py b/doubleml/double_ml_sampling_mixins.py index e9cc0253..bd9d0c13 100644 --- a/doubleml/double_ml_sampling_mixins.py +++ b/doubleml/double_ml_sampling_mixins.py @@ -120,4 +120,3 @@ def _initialize_dml_model(self): Updates model dimensions to (n_folds, n_rep). This method needs to be implemented in the child class. """ - pass From d80459b11d1b7d9358d6ed9bab53d88dc733069c Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 4 Oct 2025 10:08:44 +0200 Subject: [PATCH 40/40] correct selection variable array type to remove warning --- doubleml/data/ssm_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/data/ssm_data.py b/doubleml/data/ssm_data.py index dc592c96..2785821a 100644 --- a/doubleml/data/ssm_data.py +++ b/doubleml/data/ssm_data.py @@ -295,7 +295,7 @@ def _check_disjoint_sets_s_col(self): def _set_selection_var(self): """Set the selection variable array.""" if hasattr(self, "_data") and self.s_col in self.data.columns: - self._s = self.data.loc[:, [self.s_col]] + self._s = self.data.loc[:, [self.s_col]].squeeze() def _set_y_z_s(self): def _set_attr(col):