5
5
6
6
import numpy as np
7
7
import pandas as pd
8
+ from joblib import Parallel , delayed
8
9
from sklearn .ensemble import RandomForestRegressor
9
10
from sklearn .linear_model import LinearRegression
10
11
from sklearn .neighbors import KNeighborsRegressor
@@ -241,6 +242,151 @@ def refute_estimate(self, show_progress_bar: bool = False):
241
242
return refutes
242
243
243
244
245
def _estimate_dummy_effect(estimate, identified_estimand, new_data):
    """Fit a fresh estimator object on ``new_data`` and return the value of its estimated effect."""
    new_estimator = estimate.estimator.get_new_estimator_object(identified_estimand)
    new_estimator.fit(
        new_data,
        effect_modifier_names=estimate.estimator._effect_modifier_names,
        # Only forward fit parameters when the new estimator actually carries them.
        **getattr(new_estimator, "_fit_params", {}),
    )
    new_effect = new_estimator.estimate_effect(
        new_data,
        control_value=estimate.control_value,
        treatment_value=estimate.treatment_value,
        target_units=estimate.estimator._target_units,
    )
    return new_effect.value


def _refute_once(
    data: pd.DataFrame,
    estimate: CausalEstimate,
    treatment_name: str,
    outcome_name: str,
    estimator_present: bool,
    unobserved_confounder_values,
    causal_effect_map,
    identified_estimand,
    test_fraction,
    chosen_variables: Optional[List] = None,
    transformation_list: List = DEFAULT_TRANSFORMATION,
    true_causal_effect: Callable = DEFAULT_TRUE_CAUSAL_EFFECT,
    min_data_point_threshold: float = MIN_DATA_POINT_THRESHOLD,
    bucket_size_scale_factor: float = DEFAULT_BUCKET_SCALE_FACTOR,
):
    """Run a single dummy-outcome simulation and return the effects estimated on it.

    The outcome is replaced by ``dummy_outcome`` (the output of ``process_data`` plus
    ``true_causal_effect`` applied to the treatment column) and the original estimator
    is re-fitted on the modified data.

    :param data: Data to simulate on. It is not modified in the no-estimator branch.
        NOTE(review): ``preprocess_data_by_treatment`` also receives it and may modify it - confirm.
    :param estimate: Original estimate; supplies the estimator, control/treatment values and target units.
    :param treatment_name: Treatment name(s); only element ``[0]`` is used, as a single treatment is supported.
    :param outcome_name: Name of the outcome column in ``data``.
    :param estimator_present: Whether the transformation list contains an estimator. When it does not,
        the whole data set is used for validation and no training split is made.
    :param unobserved_confounder_values: Optional values stored in a ``"simulated"`` column.
    :param causal_effect_map: Cache of true-effect values keyed by base treatment group (``None`` when
        there is no base group). It is updated in place.
        NOTE(review): with process-based joblib backends each worker presumably gets its own copy - confirm.
    :param identified_estimand: Estimand passed to ``get_new_estimator_object``.
    :param test_fraction: Sequence of objects exposing ``.base`` and ``.other`` sampling fractions; a
        sequence of length one is repeated for every treatment group.
    :param chosen_variables: Column names used as features. The body indexes data frames with it, so a
        list is expected despite the ``None`` default. The caller's list is never mutated.
    :param transformation_list: Transformations forwarded to ``process_data``.
    :param true_causal_effect: Callable applied to the treatment column to obtain h(t).
    :param min_data_point_threshold: Training sets with at most this many rows use ``DEFAULT_TRANSFORMATION``.
    :param bucket_size_scale_factor: Forwarded to ``preprocess_data_by_treatment``.
    :returns: List of estimated effect values: one entry when no estimator is present, otherwise one
        entry per treatment group.
    """
    estimates = []

    # Work on a private copy: this function runs once per simulation on the same list, and
    # appending "simulated" to the shared list would add one duplicate column per simulation.
    if chosen_variables is not None:
        chosen_variables = list(chosen_variables)

    if not estimator_present:
        # Warn the user that the specified parameter is not applicable when no estimator is present in the transformation
        if test_fraction != DEFAULT_TEST_FRACTION:
            logger.warning("'test_fraction' is not applicable as there is no base treatment value.")

        validation_df = data
        # Adding an unobserved confounder if provided by the user
        if unobserved_confounder_values is not None:
            # Copy so that the caller's frame is not modified (it may be shared between workers).
            validation_df = data.copy()
            validation_df["simulated"] = unobserved_confounder_values
            if "simulated" not in chosen_variables:
                chosen_variables.append("simulated")

        # There is no training split in this branch, so both training inputs are None
        X_train = None
        outcome_train = None
        X_validation = validation_df[chosen_variables].values
        outcome_validation = validation_df[outcome_name].values

        # Get the final outcome, after running through all the values in the transformation list
        outcome_validation = process_data(
            outcome_name, X_train, outcome_train, X_validation, outcome_validation, transformation_list
        )

        # Check if the value of true effect has been already stored
        # We use None as the key as we have no base category for this refutation
        if None not in causal_effect_map:
            # As we currently support only one treatment
            causal_effect_map[None] = true_causal_effect(validation_df[treatment_name[0]])

        outcome_validation += causal_effect_map[None]

        new_data = validation_df.assign(dummy_outcome=outcome_validation)
        estimates.append(_estimate_dummy_effect(estimate, identified_estimand, new_data))
        return estimates

    groups = preprocess_data_by_treatment(
        data, treatment_name, unobserved_confounder_values, bucket_size_scale_factor, chosen_variables
    )

    # A single test fraction is reused for every treatment group
    if len(test_fraction) == 1:
        test_fraction = len(groups) * test_fraction

    for group_count, (key_train, _) in enumerate(groups):
        fractions = test_fraction[group_count]
        train_group = groups.get_group(key_train)

        # Split the base group: the sampled rows train the outcome model, the rest validate it
        base_train = train_group.sample(frac=fractions.base)
        train_set = {tuple(line) for line in base_train.values}
        total_set = {tuple(line) for line in train_group.values}
        base_validation = pd.DataFrame(list(total_set.difference(train_set)), columns=base_train.columns)

        X_train = base_train[chosen_variables].values
        outcome_train = base_train[outcome_name].values

        # Validation data: held-out base rows plus a sample from every other treatment group
        validation_frames = [base_validation]
        for key_validation, _ in groups:
            if key_validation != key_train:
                validation_frames.append(groups.get_group(key_validation).sample(frac=fractions.other))

        validation_df = pd.concat(validation_frames)
        X_validation = validation_df[chosen_variables].values
        outcome_validation = validation_df[outcome_name].values

        # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
        transformation_list_temp = transformation_list
        if X_train.shape[0] <= min_data_point_threshold:
            transformation_list_temp = DEFAULT_TRANSFORMATION
            logger.warning(
                "The number of data points in X_train:{} for category:{} is less than threshold:{}".format(
                    X_train.shape[0], key_train, min_data_point_threshold
                )
            )
            logger.warning(
                "Therefore, defaulting to the minimal set of transformations:{}".format(transformation_list_temp)
            )

        outcome_validation = process_data(
            outcome_name, X_train, outcome_train, X_validation, outcome_validation, transformation_list_temp
        )

        # Check if the value of true effect has been already stored
        # This ensures that we calculate the causal effect only once.
        # We use key_train as we map data with respect to the base category of the data
        if key_train not in causal_effect_map:
            # As we currently support only one treatment
            causal_effect_map[key_train] = true_causal_effect(validation_df[treatment_name[0]])

        # Add h(t) to f(W) to get the dummy outcome
        outcome_validation += causal_effect_map[key_train]

        new_data = validation_df.assign(dummy_outcome=outcome_validation)
        estimates.append(_estimate_dummy_effect(estimate, identified_estimand, new_data))

    return estimates
388
+
389
+
244
390
def refute_dummy_outcome (
245
391
data : pd .DataFrame ,
246
392
target_estimand : IdentifiedEstimand ,
@@ -256,6 +402,8 @@ def refute_dummy_outcome(
256
402
unobserved_confounder_values : Optional [List ] = DEFAULT_NEW_DATA_WITH_UNOBSERVED_CONFOUNDING ,
257
403
true_causal_effect : Callable = DEFAULT_TRUE_CAUSAL_EFFECT ,
258
404
show_progress_bar = False ,
405
+ n_jobs : int = 1 ,
406
+ verbose : int = 0 ,
259
407
** _ ,
260
408
) -> List [CausalRefutation ]:
261
409
"""Refute an estimate by replacing the outcome with a simulated variable
@@ -447,159 +595,45 @@ def refute_dummy_outcome(
447
595
# Train and the Validation Datasets. Thus, we run the simulation loop followed by the training and the validation
448
596
# loops. Thus, we can get different values every time we get the estimator.
449
597
450
- # for _ in range( self._num_simulations ):
451
- for _ in tqdm (
452
- range (num_simulations ),
453
- colour = CausalRefuter .PROGRESS_BAR_COLOR ,
454
- disable = not show_progress_bar ,
455
- desc = "Refuting Estimates: " ,
456
- ):
457
- estimates = []
458
-
459
- if estimator_present == False :
460
-
461
- # Warn the user that the specified parameter is not applicable when no estimator is present in the transformation
462
- if test_fraction != DEFAULT_TEST_FRACTION :
463
- logger .warning ("'test_fraction' is not applicable as there is no base treatment value." )
464
-
465
- # Adding an unobserved confounder if provided by the user
466
- if unobserved_confounder_values is not None :
467
- data ["simulated" ] = unobserved_confounder_values
468
- chosen_variables .append ("simulated" )
469
- # We set X_train = 0 and outcome_train to be 0
470
- validation_df = data
471
- X_train = None
472
- outcome_train = None
473
- X_validation_df = validation_df [chosen_variables ]
474
-
475
- X_validation = X_validation_df .values
476
- outcome_validation = validation_df [outcome_name ].values
477
-
478
- # Get the final outcome, after running through all the values in the transformation list
479
- outcome_validation = process_data (
480
- outcome_name , X_train , outcome_train , X_validation , outcome_validation , transformation_list
481
- )
482
-
483
- # Check if the value of true effect has been already stored
484
- # We use None as the key as we have no base category for this refutation
485
- if None not in causal_effect_map :
486
- # As we currently support only one treatment
487
- causal_effect_map [None ] = true_causal_effect (validation_df [treatment_name [0 ]])
488
-
489
- outcome_validation += causal_effect_map [None ]
490
-
491
- new_data = validation_df .assign (dummy_outcome = outcome_validation )
492
-
493
- new_estimator = estimate .estimator .get_new_estimator_object (identified_estimand )
494
- new_estimator .fit (
495
- new_data ,
496
- effect_modifier_names = estimate .estimator ._effect_modifier_names ,
497
- ** new_estimator ._fit_params if hasattr (new_estimator , "_fit_params" ) else {},
498
- )
499
- new_effect = new_estimator .estimate_effect (
500
- new_data ,
501
- control_value = estimate .control_value ,
502
- treatment_value = estimate .treatment_value ,
503
- target_units = estimate .estimator ._target_units ,
504
- )
505
- estimates .append (new_effect .value )
506
-
507
- else :
508
-
509
- groups = preprocess_data_by_treatment (
510
- data , treatment_name , unobserved_confounder_values , bucket_size_scale_factor , chosen_variables
511
- )
512
- group_count = 0
513
-
514
- if len (test_fraction ) == 1 :
515
- test_fraction = len (groups ) * test_fraction
516
-
517
- for key_train , _ in groups :
518
- base_train = groups .get_group (key_train ).sample (frac = test_fraction [group_count ].base )
519
- train_set = set ([tuple (line ) for line in base_train .values ])
520
- total_set = set ([tuple (line ) for line in groups .get_group (key_train ).values ])
521
- base_validation = pd .DataFrame (list (total_set .difference (train_set )), columns = base_train .columns )
522
- X_train_df = base_train [chosen_variables ]
523
-
524
- X_train = X_train_df .values
525
- outcome_train = base_train [outcome_name ].values
526
-
527
- validation_df = []
528
- transformation_list_temp = transformation_list
529
- validation_df .append (base_validation )
530
-
531
- for key_validation , _ in groups :
532
- if key_validation != key_train :
533
- validation_df .append (
534
- groups .get_group (key_validation ).sample (frac = test_fraction [group_count ].other )
535
- )
536
-
537
- validation_df = pd .concat (validation_df )
538
- X_validation_df = validation_df [chosen_variables ]
539
-
540
- X_validation = X_validation_df .values
541
- outcome_validation = validation_df [outcome_name ].values
542
-
543
- # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
544
- if X_train .shape [0 ] <= min_data_point_threshold :
545
- transformation_list_temp = DEFAULT_TRANSFORMATION
546
- logger .warning (
547
- "The number of data points in X_train:{} for category:{} is less than threshold:{}" .format (
548
- X_train .shape [0 ], key_train , min_data_point_threshold
549
- )
550
- )
551
- logger .warning (
552
- "Therefore, defaulting to the minimal set of transformations:{}" .format (
553
- transformation_list_temp
554
- )
555
- )
556
-
557
- outcome_validation = process_data (
558
- outcome_name , X_train , outcome_train , X_validation , outcome_validation , transformation_list_temp
559
- )
560
-
561
- # Check if the value of true effect has been already stored
562
- # This ensures that we calculate the causal effect only once.
563
- # We use key_train as we map data with respect to the base category of the data
564
-
565
- if key_train not in causal_effect_map :
566
- # As we currently support only one treatment
567
- causal_effect_map [key_train ] = true_causal_effect (validation_df [treatment_name [0 ]])
568
-
569
- # Add h(t) to f(W) to get the dummy outcome
570
- outcome_validation += causal_effect_map [key_train ]
571
-
572
- new_data = validation_df .assign (dummy_outcome = outcome_validation )
573
- new_estimator = estimate .estimator .get_new_estimator_object (identified_estimand )
574
- new_estimator .fit (
575
- new_data ,
576
- effect_modifier_names = estimate .estimator ._effect_modifier_names ,
577
- ** new_estimator ._fit_params if hasattr (new_estimator , "_fit_params" ) else {},
578
- )
579
- new_effect = new_estimator .estimate_effect (
580
- new_data ,
581
- control_value = estimate .control_value ,
582
- treatment_value = estimate .treatment_value ,
583
- target_units = estimate .estimator ._target_units ,
584
- )
585
-
586
- estimates .append (new_effect .value )
587
- group_count += 1
598
+ sample_estimates = Parallel (n_jobs = n_jobs , verbose = verbose )(
599
+ delayed (_refute_once )(
600
+ data = data ,
601
+ estimate = estimate ,
602
+ treatment_name = treatment_name ,
603
+ outcome_name = outcome_name ,
604
+ estimator_present = estimator_present ,
605
+ unobserved_confounder_values = unobserved_confounder_values ,
606
+ causal_effect_map = causal_effect_map ,
607
+ identified_estimand = identified_estimand ,
608
+ chosen_variables = chosen_variables ,
609
+ transformation_list = transformation_list ,
610
+ true_causal_effect = true_causal_effect ,
611
+ min_data_point_threshold = min_data_point_threshold ,
612
+ bucket_size_scale_factor = bucket_size_scale_factor ,
613
+ test_fraction = test_fraction ,
614
+ )
615
+ for _ in tqdm (
616
+ range (num_simulations ),
617
+ colour = CausalRefuter .PROGRESS_BAR_COLOR ,
618
+ disable = not show_progress_bar ,
619
+ desc = "Refuting Estimates: " ,
620
+ )
621
+ )
588
622
589
- simulation_results .append (estimates )
623
+ # simulation_results.append(estimates)
590
624
591
625
# We convert to ndarray for ease in indexing
592
626
# The data is of the form
593
627
# sim1: cat1 cat2 ... catn
594
628
# sim2: cat1 cat2 ... catn
595
- simulation_results = np .array (simulation_results )
629
+ simulation_results = np .array (sample_estimates )
596
630
631
+ # print('SIMULATION RESULTS::::: ', simulation_results)
597
632
# Note: We would like the causal_estimator to find the true causal estimate that we have specified through this
598
633
# refuter. Let the value of the true causal effect be h(t). In the following section of code, we wish to find out if h(t) falls in the
599
634
# distribution of the refuter.
600
635
601
636
if estimator_present == False :
602
-
603
637
dummy_estimate = CausalEstimate (
604
638
data = None ,
605
639
treatment_name = estimate ._treatment_name ,
0 commit comments