Skip to content

Commit 81e2aa0

Browse files
Add refute once function
Signed-off-by: rahulbshrestha <rahulshrestha0101@gmail.com>
1 parent 333c9a8 commit 81e2aa0

File tree

1 file changed

+175
-141
lines changed

1 file changed

+175
-141
lines changed

dowhy/causal_refuters/dummy_outcome_refuter.py

Lines changed: 175 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import numpy as np
77
import pandas as pd
8+
from joblib import Parallel, delayed
89
from sklearn.ensemble import RandomForestRegressor
910
from sklearn.linear_model import LinearRegression
1011
from sklearn.neighbors import KNeighborsRegressor
@@ -241,6 +242,151 @@ def refute_estimate(self, show_progress_bar: bool = False):
241242
return refutes
242243

243244

245+
def _estimate_dummy_effect(estimate, identified_estimand, new_data):
    """Fit a fresh estimator on ``new_data`` and return the resulting effect value.

    A new estimator object is obtained from ``estimate.estimator`` for
    ``identified_estimand``, fitted with the original estimator's effect
    modifiers (plus the new estimator's ``_fit_params`` when it has that
    attribute), and then asked for the effect using the control value,
    treatment value and target units of the original estimate.
    """
    new_estimator = estimate.estimator.get_new_estimator_object(identified_estimand)
    extra_fit_params = new_estimator._fit_params if hasattr(new_estimator, "_fit_params") else {}
    new_estimator.fit(
        new_data,
        effect_modifier_names=estimate.estimator._effect_modifier_names,
        **extra_fit_params,
    )
    new_effect = new_estimator.estimate_effect(
        new_data,
        control_value=estimate.control_value,
        treatment_value=estimate.treatment_value,
        target_units=estimate.estimator._target_units,
    )
    return new_effect.value


def _refute_once(
    data: pd.DataFrame,
    estimate: CausalEstimate,
    treatment_name: str,
    outcome_name: str,
    estimator_present: bool,
    unobserved_confounder_values,
    causal_effect_map,
    identified_estimand,
    test_fraction,
    chosen_variables: Optional[List] = None,
    transformation_list: List = DEFAULT_TRANSFORMATION,
    true_causal_effect: Callable = DEFAULT_TRUE_CAUSAL_EFFECT,
    min_data_point_threshold: float = MIN_DATA_POINT_THRESHOLD,
    bucket_size_scale_factor: float = DEFAULT_BUCKET_SCALE_FACTOR,
):
    """Run a single simulation of the dummy outcome refuter.

    The outcome is replaced by a dummy outcome built as ``f(W) + h(t)``:
    ``process_data`` applies ``transformation_list`` to produce ``f(W)`` and
    ``true_causal_effect`` applied to the treatment column gives ``h(t)``.
    A new estimator is then fitted on the data carrying the ``dummy_outcome``
    column and its effect value is recorded.

    :param data: Data on which the refutation is run. It is not modified by
        this function itself.
    :param estimate: Original estimate; supplies the estimator, control and
        treatment values.
    :param treatment_name: Treatment name(s). Only element ``[0]`` is used, so
        this is presumably a list of names despite the annotation -- verify
        against the caller.
    :param outcome_name: Name of the outcome column in ``data``.
    :param estimator_present: Whether the transformation list contains an
        estimator. When falsy the whole of ``data`` is used for validation and
        no train/validation split takes place.
    :param unobserved_confounder_values: Optional values added as a
        ``"simulated"`` column to mimic an unobserved confounder.
    :param causal_effect_map: Cache of ``h(t)`` keyed by base treatment
        category (``None`` when there is no base category). Missing entries
        are written into this mapping.
    :param identified_estimand: Estimand handed to the new estimator object.
    :param test_fraction: Sequence of objects exposing ``.base`` and ``.other``
        sampling fractions. A length-1 sequence is repeated for every
        treatment group.
    :param chosen_variables: Names of the columns used as features.
    :param transformation_list: Transformations applied by ``process_data``.
    :param true_causal_effect: Callable computing ``h(t)`` from the treatment
        column.
    :param min_data_point_threshold: If a training split has this many rows or
        fewer, ``DEFAULT_TRANSFORMATION`` is used instead of
        ``transformation_list``.
    :param bucket_size_scale_factor: Forwarded to
        ``preprocess_data_by_treatment``.

    :returns: List of estimated effect values: a single element when no
        estimator is present, otherwise one element per treatment group.
    """
    if not estimator_present:
        # Warn the user that the specified parameter is not applicable when no estimator is present in the transformation
        if test_fraction != DEFAULT_TEST_FRACTION:
            logger.warning("'test_fraction' is not applicable as there is no base treatment value.")

        validation_df = data
        feature_names = chosen_variables

        # Adding an unobserved confounder if provided by the user.
        # Work on fresh objects instead of mutating the caller's DataFrame and
        # list: this function is invoked once per simulation, and in-place
        # appends would add "simulated" again on every in-process call while
        # being invisible when the call runs in a separate worker process.
        if unobserved_confounder_values is not None:
            validation_df = data.assign(simulated=unobserved_confounder_values)
            existing_names = [] if chosen_variables is None else list(chosen_variables)
            feature_names = existing_names + ["simulated"]

        X_validation = validation_df[feature_names].values
        outcome_validation = validation_df[outcome_name].values

        # Get the final outcome, after running through all the values in the transformation list.
        # There is no training split here, so X_train and outcome_train are None.
        outcome_validation = process_data(
            outcome_name, None, None, X_validation, outcome_validation, transformation_list
        )

        # Check if the value of true effect has been already stored
        # We use None as the key as we have no base category for this refutation
        if None not in causal_effect_map:
            # As we currently support only one treatment
            causal_effect_map[None] = true_causal_effect(validation_df[treatment_name[0]])

        outcome_validation += causal_effect_map[None]

        new_data = validation_df.assign(dummy_outcome=outcome_validation)
        return [_estimate_dummy_effect(estimate, identified_estimand, new_data)]

    groups = preprocess_data_by_treatment(
        data, treatment_name, unobserved_confounder_values, bucket_size_scale_factor, chosen_variables
    )

    # A single test fraction is shared by every treatment group
    if len(test_fraction) == 1:
        test_fraction = len(groups) * test_fraction

    estimates = []
    for group_count, (key_train, _) in enumerate(groups):
        fraction = test_fraction[group_count]
        base_group = groups.get_group(key_train)

        # Split the base category: sampled rows train the outcome model, the
        # remaining (distinct) rows go to validation.
        base_train = base_group.sample(frac=fraction.base)
        train_set = {tuple(line) for line in base_train.values}
        total_set = {tuple(line) for line in base_group.values}
        base_validation = pd.DataFrame(list(total_set.difference(train_set)), columns=base_train.columns)

        X_train = base_train[chosen_variables].values
        outcome_train = base_train[outcome_name].values

        # Validation data = held-out base rows + a sample of every other category
        validation_parts = [base_validation]
        for key_validation, _ in groups:
            if key_validation != key_train:
                validation_parts.append(groups.get_group(key_validation).sample(frac=fraction.other))
        validation_df = pd.concat(validation_parts)

        X_validation = validation_df[chosen_variables].values
        outcome_validation = validation_df[outcome_name].values

        # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
        transformation_list_temp = transformation_list
        if X_train.shape[0] <= min_data_point_threshold:
            transformation_list_temp = DEFAULT_TRANSFORMATION
            logger.warning(
                "The number of data points in X_train:{} for category:{} is less than threshold:{}".format(
                    X_train.shape[0], key_train, min_data_point_threshold
                )
            )
            logger.warning(
                "Therefore, defaulting to the minimal set of transformations:{}".format(transformation_list_temp)
            )

        outcome_validation = process_data(
            outcome_name, X_train, outcome_train, X_validation, outcome_validation, transformation_list_temp
        )

        # Check if the value of true effect has been already stored
        # This ensures that we calculate the causal effect only once.
        # We use key_train as we map data with respect to the base category of the data
        if key_train not in causal_effect_map:
            # As we currently support only one treatment
            causal_effect_map[key_train] = true_causal_effect(validation_df[treatment_name[0]])

        # Add h(t) to f(W) to get the dummy outcome
        outcome_validation += causal_effect_map[key_train]

        new_data = validation_df.assign(dummy_outcome=outcome_validation)
        estimates.append(_estimate_dummy_effect(estimate, identified_estimand, new_data))

    return estimates
388+
389+
244390
def refute_dummy_outcome(
245391
data: pd.DataFrame,
246392
target_estimand: IdentifiedEstimand,
@@ -256,6 +402,8 @@ def refute_dummy_outcome(
256402
unobserved_confounder_values: Optional[List] = DEFAULT_NEW_DATA_WITH_UNOBSERVED_CONFOUNDING,
257403
true_causal_effect: Callable = DEFAULT_TRUE_CAUSAL_EFFECT,
258404
show_progress_bar=False,
405+
n_jobs: int = 1,
406+
verbose: int = 0,
259407
**_,
260408
) -> List[CausalRefutation]:
261409
"""Refute an estimate by replacing the outcome with a simulated variable
@@ -447,159 +595,45 @@ def refute_dummy_outcome(
447595
# Train and the Validation Datasets. Thus, we run the simulation loop followed by the training and the validation
448596
# loops. Thus, we can get different values every time we get the estimator.
449597

450-
# for _ in range( self._num_simulations ):
451-
for _ in tqdm(
452-
range(num_simulations),
453-
colour=CausalRefuter.PROGRESS_BAR_COLOR,
454-
disable=not show_progress_bar,
455-
desc="Refuting Estimates: ",
456-
):
457-
estimates = []
458-
459-
if estimator_present == False:
460-
461-
# Warn the user that the specified parameter is not applicable when no estimator is present in the transformation
462-
if test_fraction != DEFAULT_TEST_FRACTION:
463-
logger.warning("'test_fraction' is not applicable as there is no base treatment value.")
464-
465-
# Adding an unobserved confounder if provided by the user
466-
if unobserved_confounder_values is not None:
467-
data["simulated"] = unobserved_confounder_values
468-
chosen_variables.append("simulated")
469-
# We set X_train = 0 and outcome_train to be 0
470-
validation_df = data
471-
X_train = None
472-
outcome_train = None
473-
X_validation_df = validation_df[chosen_variables]
474-
475-
X_validation = X_validation_df.values
476-
outcome_validation = validation_df[outcome_name].values
477-
478-
# Get the final outcome, after running through all the values in the transformation list
479-
outcome_validation = process_data(
480-
outcome_name, X_train, outcome_train, X_validation, outcome_validation, transformation_list
481-
)
482-
483-
# Check if the value of true effect has been already stored
484-
# We use None as the key as we have no base category for this refutation
485-
if None not in causal_effect_map:
486-
# As we currently support only one treatment
487-
causal_effect_map[None] = true_causal_effect(validation_df[treatment_name[0]])
488-
489-
outcome_validation += causal_effect_map[None]
490-
491-
new_data = validation_df.assign(dummy_outcome=outcome_validation)
492-
493-
new_estimator = estimate.estimator.get_new_estimator_object(identified_estimand)
494-
new_estimator.fit(
495-
new_data,
496-
effect_modifier_names=estimate.estimator._effect_modifier_names,
497-
**new_estimator._fit_params if hasattr(new_estimator, "_fit_params") else {},
498-
)
499-
new_effect = new_estimator.estimate_effect(
500-
new_data,
501-
control_value=estimate.control_value,
502-
treatment_value=estimate.treatment_value,
503-
target_units=estimate.estimator._target_units,
504-
)
505-
estimates.append(new_effect.value)
506-
507-
else:
508-
509-
groups = preprocess_data_by_treatment(
510-
data, treatment_name, unobserved_confounder_values, bucket_size_scale_factor, chosen_variables
511-
)
512-
group_count = 0
513-
514-
if len(test_fraction) == 1:
515-
test_fraction = len(groups) * test_fraction
516-
517-
for key_train, _ in groups:
518-
base_train = groups.get_group(key_train).sample(frac=test_fraction[group_count].base)
519-
train_set = set([tuple(line) for line in base_train.values])
520-
total_set = set([tuple(line) for line in groups.get_group(key_train).values])
521-
base_validation = pd.DataFrame(list(total_set.difference(train_set)), columns=base_train.columns)
522-
X_train_df = base_train[chosen_variables]
523-
524-
X_train = X_train_df.values
525-
outcome_train = base_train[outcome_name].values
526-
527-
validation_df = []
528-
transformation_list_temp = transformation_list
529-
validation_df.append(base_validation)
530-
531-
for key_validation, _ in groups:
532-
if key_validation != key_train:
533-
validation_df.append(
534-
groups.get_group(key_validation).sample(frac=test_fraction[group_count].other)
535-
)
536-
537-
validation_df = pd.concat(validation_df)
538-
X_validation_df = validation_df[chosen_variables]
539-
540-
X_validation = X_validation_df.values
541-
outcome_validation = validation_df[outcome_name].values
542-
543-
# If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
544-
if X_train.shape[0] <= min_data_point_threshold:
545-
transformation_list_temp = DEFAULT_TRANSFORMATION
546-
logger.warning(
547-
"The number of data points in X_train:{} for category:{} is less than threshold:{}".format(
548-
X_train.shape[0], key_train, min_data_point_threshold
549-
)
550-
)
551-
logger.warning(
552-
"Therefore, defaulting to the minimal set of transformations:{}".format(
553-
transformation_list_temp
554-
)
555-
)
556-
557-
outcome_validation = process_data(
558-
outcome_name, X_train, outcome_train, X_validation, outcome_validation, transformation_list_temp
559-
)
560-
561-
# Check if the value of true effect has been already stored
562-
# This ensures that we calculate the causal effect only once.
563-
# We use key_train as we map data with respect to the base category of the data
564-
565-
if key_train not in causal_effect_map:
566-
# As we currently support only one treatment
567-
causal_effect_map[key_train] = true_causal_effect(validation_df[treatment_name[0]])
568-
569-
# Add h(t) to f(W) to get the dummy outcome
570-
outcome_validation += causal_effect_map[key_train]
571-
572-
new_data = validation_df.assign(dummy_outcome=outcome_validation)
573-
new_estimator = estimate.estimator.get_new_estimator_object(identified_estimand)
574-
new_estimator.fit(
575-
new_data,
576-
effect_modifier_names=estimate.estimator._effect_modifier_names,
577-
**new_estimator._fit_params if hasattr(new_estimator, "_fit_params") else {},
578-
)
579-
new_effect = new_estimator.estimate_effect(
580-
new_data,
581-
control_value=estimate.control_value,
582-
treatment_value=estimate.treatment_value,
583-
target_units=estimate.estimator._target_units,
584-
)
585-
586-
estimates.append(new_effect.value)
587-
group_count += 1
598+
sample_estimates = Parallel(n_jobs=n_jobs, verbose=verbose)(
599+
delayed(_refute_once)(
600+
data=data,
601+
estimate=estimate,
602+
treatment_name=treatment_name,
603+
outcome_name=outcome_name,
604+
estimator_present=estimator_present,
605+
unobserved_confounder_values=unobserved_confounder_values,
606+
causal_effect_map=causal_effect_map,
607+
identified_estimand=identified_estimand,
608+
chosen_variables=chosen_variables,
609+
transformation_list=transformation_list,
610+
true_causal_effect=true_causal_effect,
611+
min_data_point_threshold=min_data_point_threshold,
612+
bucket_size_scale_factor=bucket_size_scale_factor,
613+
test_fraction=test_fraction,
614+
)
615+
for _ in tqdm(
616+
range(num_simulations),
617+
colour=CausalRefuter.PROGRESS_BAR_COLOR,
618+
disable=not show_progress_bar,
619+
desc="Refuting Estimates: ",
620+
)
621+
)
588622

589-
simulation_results.append(estimates)
623+
# simulation_results.append(estimates)
590624

591625
# We convert to ndarray for ease in indexing
592626
# The data is of the form
593627
# sim1: cat1 cat2 ... catn
594628
# sim2: cat1 cat2 ... catn
595-
simulation_results = np.array(simulation_results)
629+
simulation_results = np.array(sample_estimates)
596630

631+
# print('SIMULATION RESULTS::::: ', simulation_results)
597632
# Note: We would like the causal_estimator to find the true causal estimate that we have specified through this
598633
# refuter. Let the value of the true causal effect be h(t). In the following section of code, we wish to find out if h(t) falls in the
599634
# distribution of the refuter.
600635

601636
if estimator_present == False:
602-
603637
dummy_estimate = CausalEstimate(
604638
data=None,
605639
treatment_name=estimate._treatment_name,

0 commit comments

Comments
 (0)