Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/victor5as/dowhy
Browse files Browse the repository at this point in the history
  • Loading branch information
victor5as committed May 16, 2024
2 parents 13032d7 + cfb25ec commit b4ad456
Showing 1 changed file with 99 additions and 2 deletions.
101 changes: 99 additions & 2 deletions dowhy/gcm/distribution_change_robust.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(
C = the change vector (a K+1 list of 0s and 1s).
h_fn = the functional of interest. By default, the mean of y.
warn_th = the threshold that generates warning about re-weighting
warn_th = the threshold that generates warning about re-weighting
"""

if any(x not in (0, 1) for x in C):
Expand All @@ -48,6 +49,7 @@ def __init__(
self.calib_dict = {} # A dictionary to store the trained calibrators
self.alpha_dict = {} # A dictionary to store the fitted weights alpha_k (Theorem 2.4)
self.warn_th = warn_th # The threshold that generates warning about re-weighting
self.warn_th = warn_th # The threshold that generates warning about re-weighting

def _simplify_C(self, all_indep: bool = False) -> None:
"""
Expand Down Expand Up @@ -84,7 +86,12 @@ def _train_reg(
y_train: np.ndarray,
T_train: np.ndarray,
w_train: Optional[np.ndarray] = None,
X_train: np.ndarray,
y_train: np.ndarray,
T_train: np.ndarray,
w_train: Optional[np.ndarray] = None,
regressor: PredictionModel = create_linear_regressor,
) -> None:
) -> None:
"""
This function trains the nested regression estimators, that will be stored in self.reg_dict.
Expand All @@ -94,6 +101,10 @@ def _train_reg(
y_train = (n_train,) np.ndarray with the Y data (outcome) for the training set.
T_train = (n_train,) np.ndarray with the T data (sample indicator) for the training set.
w_train = optional (n_train,) np.ndarray with sample weights for the train data.
X_train = (n_train, K) np.ndarray with the X data (explanatory variables) for the training set.
y_train = (n_train,) np.ndarray with the Y data (outcome) for the training set.
T_train = (n_train,) np.ndarray with the T data (sample indicator) for the training set.
w_train = optional (n_train,) np.ndarray with sample weights for the train data.
regressor = the regression estimator: a class supporting .fit and .predict methods.
"""
Expand Down Expand Up @@ -130,11 +141,19 @@ def _train_cla(
T_train: np.ndarray,
X_eval: np.ndarray,
w_train: Optional[np.ndarray] = None,
X_train: np.ndarray,
T_train: np.ndarray,
X_eval: np.ndarray,
w_train: Optional[np.ndarray] = None,
classifier: ClassificationModel = create_logistic_regression_classifier,
calibrator: Optional[PredictionModel] = None,
X_calib: Optional[np.ndarray] = None,
T_calib: Optional[np.ndarray] = None,
w_calib: Optional[np.ndarray] = None,
) -> None:
X_calib: Optional[np.ndarray] = None,
T_calib: Optional[np.ndarray] = None,
w_calib: Optional[np.ndarray] = None,
) -> None:
"""
This function trains the classification estimators for the weights, that will be stored in self.cla_dict.
Expand All @@ -143,9 +162,13 @@ def _train_cla(
Inputs:
X_train = (n_train, K) np.ndarray with the X data (explanatory variables) for the training set.
T_train = (n_train,) np.ndarray with the T data (sample indicator) for the training set.
X_eval = (n_eval, K) np.ndarray with the X data (explanatory variables) for the evaluation set.
X_train = (n_train, K) np.ndarray with the X data (explanatory variables) for the training set.
T_train = (n_train,) np.ndarray with the T data (sample indicator) for the training set.
X_eval = (n_eval, K) np.ndarray with the X data (explanatory variables) for the evaluation set.
Used only to give a warning about low overlap.
w_train = optional (n_train,) np.ndarray with sample weights for the training set.
w_train = optional (n_train,) np.ndarray with sample weights for the training set.
classifier = the classification estimator: a class supporting .fit and .predict_probabilities methods.
Expand All @@ -154,6 +177,9 @@ def _train_cla(
a classifier (e.g. sklearn.LogisticRegression).
No need to do this if classifier is a sklearn.calibration.CalibratedClassifierCV learner.
X_calib = (n_calib, K) np.ndarray with the X data (explanatory variables) for the calibration set.
T_calib = (n_calib,) np.ndarray with the T data (sample indicator) for the calibration set.
w_calib = optional (n_calib,) np.ndarray with sample weights for the calibration set.
X_calib = (n_calib, K) np.ndarray with the X data (explanatory variables) for the calibration set.
T_calib = (n_calib,) np.ndarray with the T data (sample indicator) for the calibration set.
w_calib = optional (n_calib,) np.ndarray with sample weights for the calibration set.
Expand Down Expand Up @@ -205,6 +231,9 @@ def _get_alphas(
X_eval = (n_eval, K) np.ndarray with the X data (explanatory variables) for the evaluation set.
T_eval = (n_eval,) np.ndarray with the T data (sample indicator) for the evaluation set.
ratio = n1/n0 (unless the classifier has been trained with class weight, not supported yet).
X_eval = (n_eval, K) np.ndarray with the X data (explanatory variables) for the evaluation set.
T_eval = (n_eval,) np.ndarray with the T data (sample indicator) for the evaluation set.
ratio = n1/n0 (unless the classifier has been trained with class weight, not supported yet).
calibrator = Optional, a method for probability calibration on a calibration set.
This could be a regressor (e.g. sklearn.isotonic.IsotonicRegression) or
a classifier (e.g. sklearn.LogisticRegression).
Expand All @@ -215,6 +244,7 @@ def _get_alphas(
Returns:
alpha_k = (n0,) or (n1,) np.ndarray of alpha_k weights for sample C_{k+1} \in {0,1}
alpha_k = (n0,) or (n1,) np.ndarray of alpha_k weights for sample C_{k+1} \in {0,1}
"""

for k in range(self.K_simpl):
Expand Down Expand Up @@ -285,6 +315,15 @@ def est_scores(
w_eval: Optional[np.ndarray] = None,
w_train: Optional[np.ndarray] = None,
method: Literal["regression", "re-weighting", "MR"] = "MR",
X_eval: np.ndarray,
y_eval: np.ndarray,
T_eval: np.ndarray,
X_train: np.ndarray,
y_train: np.ndarray,
T_train: np.ndarray,
w_eval: Optional[np.ndarray] = None,
w_train: Optional[np.ndarray] = None,
method: Literal["regression", "re-weighting", "MR"] = "MR",
regressor: PredictionModel = create_linear_regressor,
classifier: ClassificationModel = create_logistic_regression_classifier,
calibrator: Optional[PredictionModel] = None,
Expand All @@ -293,6 +332,12 @@ def est_scores(
w_calib: Optional[np.ndarray] = None,
all_indep: bool = False,
crop: float = 1e-3,
) -> np.ndarray:
X_calib: Optional[np.ndarray] = None,
T_calib: Optional[np.ndarray] = None,
w_calib: Optional[np.ndarray] = None,
all_indep: bool = False,
crop: float = 1e-3,
) -> np.ndarray:
"""
This function computes the scores that are averaged to get each theta_hat.
Expand All @@ -309,6 +354,14 @@ def est_scores(
T_train = (n_train,) np.ndarray with the T data (sample indicator) for the training set.
w_eval = optional (n_eval,) np.ndarray with sample weights for the evaluation set.
w_train = optional (n_train,) np.ndarray with sample weights for the training set.
X_eval = (n_eval, K) np.ndarray with the X data (explanatory variables) for the evaluation set.
y_eval = (n_eval,) np.ndarray with the Y data (outcome) for the evaluation set.
T_eval = (n_eval,) np.ndarray with the T data (sample indicator) for the evaluation set.
X_train = (n_train, K) np.ndarray with the X data (explanatory variables) for the training set.
y_train = (n_train,) np.ndarray with the Y data (outcome) for the training set.
T_train = (n_train,) np.ndarray with the T data (sample indicator) for the training set.
w_eval = optional (n_eval,) np.ndarray with sample weights for the evaluation set.
w_train = optional (n_train,) np.ndarray with sample weights for the training set.
method = One of 'regression', 're-weighting', 'MR'. By default, 'MR'.
Expand All @@ -324,13 +377,17 @@ def est_scores(
X_calib = (n_calib, K) np.ndarray with the X data (explanatory variables) for the calibration set.
T_calib = (n_calib,) np.ndarray with the T data (sample indicator) for the calibration set.
w_calib = optional (n_calib,) np.ndarray with sample weights for the calibration set.
X_calib = (n_calib, K) np.ndarray with the X data (explanatory variables) for the calibration set.
T_calib = (n_calib,) np.ndarray with the T data (sample indicator) for the calibration set.
w_calib = optional (n_calib,) np.ndarray with sample weights for the calibration set.
all_indep = boolean, True if all explanatory variables are independent (used for self._simplify_C).
crop = float, all predicted probabilities from the classifier will be cropped below at this lower bound,
and above at 1-crop.
Returns:
theta_scores = (n_eval,) np.ndarray of scores, such that theta_hat = np.mean(theta_scores).
theta_scores = (n_eval,) np.ndarray of scores, such that theta_hat = np.mean(theta_scores).
"""

if w_eval is None:
Expand Down Expand Up @@ -493,6 +550,15 @@ def est_theta(
w_eval: Optional[np.ndarray] = None,
w_train: Optional[np.ndarray] = None,
method: Literal["regression", "re-weighting", "MR"] = "MR",
X_eval: np.ndarray,
y_eval: np.ndarray,
T_eval: np.ndarray,
X_train: np.ndarray,
y_train: np.ndarray,
T_train: np.ndarray,
w_eval: Optional[np.ndarray] = None,
w_train: Optional[np.ndarray] = None,
method: Literal["regression", "re-weighting", "MR"] = "MR",
regressor: PredictionModel = create_linear_regressor,
classifier: ClassificationModel = create_logistic_regression_classifier,
calibrator: Optional[PredictionModel] = None,
Expand All @@ -501,6 +567,12 @@ def est_theta(
w_calib: Optional[np.ndarray] = None,
all_indep: bool = False,
crop: float = 1e-3,
) -> Tuple[float, float]:
X_calib: Optional[np.ndarray] = None,
T_calib: Optional[np.ndarray] = None,
w_calib: Optional[np.ndarray] = None,
all_indep: bool = False,
crop: float = 1e-3,
) -> Tuple[float, float]:
"""
This function computes the scores that are averaged to get each theta_hat,
Expand All @@ -515,6 +587,14 @@ def est_theta(
T_train = (n_train,) np.ndarray with the T data (sample indicator) for the training set.
w_eval = optional (n_eval,) np.ndarray with sample weights for the evaluation set.
w_train = optional (n_train,) np.ndarray with sample weights for the training set.
X_eval = (n_eval, K) np.ndarray with the X data (explanatory variables) for the evaluation set.
y_eval = (n_eval,) np.ndarray with the Y data (outcome) for the evaluation set.
T_eval = (n_eval,) np.ndarray with the T data (sample indicator) for the evaluation set.
X_train = (n_train, K) np.ndarray with the X data (explanatory variables) for the training set.
y_train = (n_train,) np.ndarray with the Y data (outcome) for the training set.
T_train = (n_train,) np.ndarray with the T data (sample indicator) for the training set.
w_eval = optional (n_eval,) np.ndarray with sample weights for the evaluation set.
w_train = optional (n_train,) np.ndarray with sample weights for the training set.
method = One of 'regression', 're-weighting', 'MR'. By default, 'MR'.
Expand All @@ -530,6 +610,9 @@ def est_theta(
X_calib = (n_calib, K) np.ndarray with the X data (explanatory variables) for the calibration set.
T_calib = (n_calib,) np.ndarray with the T data (sample indicator) for the calibration set.
w_calib = optional (n_calib,) np.ndarray with sample weights for the calibration set.
X_calib = (n_calib, K) np.ndarray with the X data (explanatory variables) for the calibration set.
T_calib = (n_calib,) np.ndarray with the T data (sample indicator) for the calibration set.
w_calib = optional (n_calib,) np.ndarray with sample weights for the calibration set.
all_indep = boolean, True if all explanatory variables are independent (used for self._simplify_C).
crop = float, all predicted probabilities from the classifier will be cropped below at this lower bound,
Expand All @@ -550,6 +633,7 @@ def est_theta(
w_eval=w_eval,
w_train=w_train,
method=method,
method=method,
regressor=regressor,
classifier=classifier,
calibrator=calibrator,
Expand Down Expand Up @@ -582,12 +666,23 @@ def distribution_change_robust(
calib_size: float = 0.2,
split_random_state: int = 0,
method: Literal["regression", "re-weighting", "MR"] = "MR",
target_functional: str = "mean",
sample_weight: Optional[Any] = None,
xfit: bool = True,
xfit_folds: int = 5,
train_size: float = 0.5,
calib_size: float = 0.2,
split_random_state: int = 0,
method: Literal["regression", "re-weighting", "MR"] = "MR",
regressor: PredictionModel = create_linear_regressor,
classifier: ClassificationModel = create_logistic_regression_classifier,
calibrator: Optional[PredictionModel] = None,
all_indep: bool = False,
crop: float = 1e-3,
all_indep: bool = False,
crop: float = 1e-3,
shapley_config: Optional[ShapleyConfig] = None,
) -> Dict[Any, float]:
) -> Dict[Any, float]:
"""
This function computes the Shapley values for attribution of change in the mean or variance of target_node
Expand All @@ -602,6 +697,9 @@ def distribution_change_robust(
:param target_functional: Target functional of interest, of which the change is attributed. For now, supported
functionals are "mean" and "variance".
:param sample_weight: Sample weight variable, if using (optional).
:param target_functional: Target functional of interest, of which the change is attributed. For now, supported
functionals are "mean" and "variance".
:param sample_weight: Sample weight variable, if using (optional).
:param xfit: Whether to use cross-fitting (True) or sample splitting (False) to estimate the nuisance parameters.
:param xfit_folds: Number of folds for cross-fitting if xfit = True.
Expand Down Expand Up @@ -762,8 +860,7 @@ def set_func(C: np.ndarray) -> float:
w_calib=w_calib,
all_indep=all_indep,
crop=crop,
)[0]
** 2
)[0] ** 2
)

else:
Expand Down

0 comments on commit b4ad456

Please sign in to comment.