14
14
from sklearn .preprocessing import MultiLabelBinarizer
15
15
16
16
from dowhy .gcm import config
17
- from dowhy .gcm .causal_mechanisms import AdditiveNoiseModel , ClassifierFCM
17
+ from dowhy .gcm .causal_mechanisms import AdditiveNoiseModel , ClassifierFCM , DiscreteAdditiveNoiseModel
18
18
from dowhy .gcm .causal_models import CAUSAL_MECHANISM , ProbabilisticCausalModel , validate_causal_model_assignment
19
19
from dowhy .gcm .ml import (
20
20
ClassificationModel ,
48
48
auto_apply_encoders ,
49
49
auto_fit_encoders ,
50
50
is_categorical ,
51
+ is_discrete ,
51
52
set_random_seed ,
52
53
shape_into_2d ,
53
54
)
@@ -108,7 +109,43 @@ def add_model_performance(self, node, model: str, performance: str, metric_name:
108
109
def __str__ (self ):
109
110
summary_strings = []
110
111
111
- summary_strings .append ("Analyzed %d nodes." % len (list (self ._nodes )))
112
+ summary_strings .append (
113
+ "When using this auto assignment function, the given data is used to automatically assign a causal "
114
+ "mechanism to each node. Note that causal mechanisms can also be customized and assigned manually.\n "
115
+ "The following types of causal mechanisms are considered for the automatic selection:"
116
+ )
117
+ summary_strings .append ("\n If root node:" )
118
+ summary_strings .append (
119
+ "An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided "
120
+ "data. This provides a flexible and non-parametric way to model the marginal distribution and is valid for "
121
+ "all types of data modalities."
122
+ )
123
+ summary_strings .append ("\n If non-root node and the data is continuous:" )
124
+ summary_strings .append (
125
+ "Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the "
126
+ "parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i."
127
+ "To select the best model for f, different regression models are evaluated and the model "
128
+ "with the smallest mean squared error is selected."
129
+ "Note that minimizing the mean squared error here is equivalent to selecting the best "
130
+ "choice of an ANM."
131
+ )
132
+ summary_strings .append ("\n If non-root node and the data is discrete:" )
133
+ summary_strings .append (
134
+ "Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an "
135
+ "additional constraint for f to only return discrete values.\n "
136
+ "Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider "
137
+ "representing them as strings to ensure proper model selection."
138
+ )
139
+ summary_strings .append ("\n If non-root node and the data is categorical:" )
140
+ summary_strings .append (
141
+ "A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).\n "
142
+ "Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a "
143
+ "class (category) using the conditional probability distribution produced by a "
144
+ "classification model."
145
+ "Here, different model classes are evaluated using the (negative) F1 score and the best"
146
+ " performing model class is selected."
147
+ )
148
+ summary_strings .append ("\n In total, %d nodes were analyzed:" % len (list (self ._nodes )))
112
149
113
150
for node in self ._nodes :
114
151
summary_strings .append ("\n --- Node: %s" % node )
@@ -123,11 +160,13 @@ def __str__(self):
123
160
for (model , performance , metric_name ) in self ._nodes [node ]["model_performances" ]:
124
161
summary_strings .append ("%s: %s" % (str (model ()).replace ("()" , "" ), str (performance )))
125
162
126
- summary_strings .append (
127
- "Based on the type of causal mechanism, the model with the lowest metric value "
128
- "represents the best choice."
129
- )
130
-
163
+ summary_strings .append (
164
+ "\n ===Note===\n Note, based on the selected auto assignment quality, the set of " "evaluated models changes."
165
+ )
166
+ summary_strings .append (
167
+ "For more insights toward the quality of the fitted graphical causal model, consider "
168
+ "using the evaluate_causal_model function after fitting the causal mechanisms."
169
+ )
131
170
return "\n " .join (summary_strings )
132
171
133
172
@@ -137,26 +176,86 @@ def assign_causal_mechanisms(
137
176
quality : AssignmentQuality = AssignmentQuality .GOOD ,
138
177
override_models : bool = False ,
139
178
) -> AutoAssignmentSummary :
140
- """Automatically assigns appropriate causal models. If causal models are already assigned to nodes and
141
- override_models is set to False, this function only validates the assignments with respect to the graph structure.
142
- Here, the validation checks whether root nodes have StochasticModels and non-root ConditionalStochasticModels
143
- assigned.
179
+ """Automatically assigns appropriate causal mechanisms to nodes. If causal mechanisms are already assigned to nodes
180
+ and override_models is set to False, this function only validates the assignments with respect to the graph
181
+ structure. That is, the validation checks whether root nodes have StochasticModels and non-root
182
+ ConditionalStochasticModels assigned.
183
+
184
+ The following types of causal mechanisms are considered for the automatic selection:
185
+
186
+ If root node:
187
+ An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data.
188
+ This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of
189
+ data modalities.
190
+
191
+ If non-root node and the data is continuous:
192
+ Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved
193
+ noise N_i is assumed to be independent of PA_i. To select the best model for f, different regression models are
194
+ evaluated and the model with the smallest mean squared error is selected. Note that minimizing the mean squared
195
+ error here is equivalent to selecting the best choice of an ANM.
196
+
197
+ If non-root node and the data is discrete:
198
+ Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional
199
+ constraint for f to only return discrete values. Note that 'discrete' here refers to numerical values with an order. If the
200
+ data is categorical, consider representing them as strings to ensure proper model selection.
201
+
202
+ If non-root node and the data is categorical:
203
+ A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
204
+ Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the
205
+ conditional probability distribution produced by a classification model. Here, different model classes are evaluated
206
+ using the (negative) F1 score and the best performing model class is selected.
207
+
208
+ The current model zoo is:
209
+
210
+ With "GOOD" quality:
211
+ Numerical:
212
+ - Linear Regressor
213
+ - Linear Regressor with polynomial features
214
+ - Histogram Gradient Boost Regressor
215
+
216
+ Categorical:
217
+ - Logistic Regressor
218
+ - Logistic Regressor with polynomial features
219
+ - Histogram Gradient Boost Classifier
220
+
221
+ With "BETTER" quality:
222
+ Numerical:
223
+ - Linear Regressor
224
+ - Linear Regressor with polynomial features
225
+ - Gradient Boost Regressor
226
+ - Ridge Regressor
227
+ - Lasso Regressor
228
+ - Random Forest Regressor
229
+ - Support Vector Regressor
230
+ - Extra Trees Regressor
231
+ - KNN Regressor
232
+ - Ada Boost Regressor
233
+
234
+ Categorical:
235
+ - Logistic Regressor
236
+ - Logistic Regressor with polynomial features
237
+ - Histogram Gradient Boost Classifier
238
+ - Random Forest Classifier
239
+ - Extra Trees Classifier
240
+ - Support Vector Classifier
241
+ - KNN Classifier
242
+ - Gaussian Naive Bayes Classifier
243
+ - Ada Boost Classifier
244
+
245
+ With "BEST" quality:
246
+ An auto ML model based on AutoGluon (optional dependency, needs to be installed).
144
247
145
248
:param causal_model: The causal model to whose nodes to assign causal models.
146
249
:param based_on: Jointly sampled data corresponding to the nodes of the given graph.
147
250
:param quality: AssignmentQuality for the automatic model selection and model accuracy. This changes the type of
148
- prediction model and time spent on the selection. Options are:
149
- - AssignmentQuality.GOOD: Compares a linear, polynomial and gradient boost model on small test-training split
150
- of the data. The best performing model is then selected .
251
+ prediction model and time spent on the selection. See the model zoo listed above for the potential models.
252
+ The options for the quality are:
253
+ - AssignmentQuality.GOOD: Only a small set of models are evaluated.
151
254
Model selection speed: Fast
152
255
Model training speed: Fast
153
256
Model inference speed: Fast
154
257
Model accuracy: Medium
155
- - AssignmentQuality.BETTER: Compares multiple model types and uses the one with the best performance
156
- averaged over multiple splits of the training data. By default, the model with the smallest root mean
157
- squared error is selected for regression problems and the model with the highest F1 score is selected for
158
- classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS_BETTER and
159
- _LIST_OF_POTENTIAL_CLASSIFIERS_BETTER, respectively.
258
+ - AssignmentQuality.BETTER: A larger set of models are evaluated.
160
259
Model selection speed: Medium
161
260
Model training speed: Fast
162
261
Model inference speed: Fast
@@ -168,8 +267,8 @@ def assign_causal_mechanisms(
168
267
Model training speed: Slow
169
268
Model inference speed: Slow-Medium
170
269
Model accuracy: Best
171
- :param override_models: If set to True, existing model assignments are replaced with automatically selected
172
- ones. If set to False, the assigned models are only validated with respect to the graph
270
+ :param override_models: If set to True, existing mechanism assignments are replaced with automatically selected
271
+ ones. If set to False, the assigned mechanisms are only validated with respect to the graph
173
272
structure.
174
273
:return: A summary object containing details about the model selection process.
175
274
"""
@@ -179,7 +278,8 @@ def assign_causal_mechanisms(
179
278
if not override_models and CAUSAL_MECHANISM in causal_model .graph .nodes [node ]:
180
279
auto_assignment_summary .add_node_log_message (
181
280
node ,
182
- "Node %s already has a model assigned and the override parameter is False. Skipping this node." % node ,
281
+ "Node %s already has a causal mechanism assigned and the override parameter is False. Skipping this "
282
+ "node." % node ,
183
283
)
184
284
validate_causal_model_assignment (causal_model .graph , node )
185
285
continue
@@ -189,16 +289,36 @@ def assign_causal_mechanisms(
189
289
if is_root_node (causal_model .graph , node ):
190
290
auto_assignment_summary .add_node_log_message (
191
291
node ,
192
- "Node %s is a root node. Assigning '%s' to the node representing the marginal distribution."
292
+ "Node %s is a root node. Therefore, assigning '%s' to the node representing the marginal distribution."
193
293
% (node , causal_model .causal_mechanism (node )),
194
294
)
195
295
else :
296
+ data_type = "continuous"
297
+ if isinstance (causal_model .causal_mechanism (node ), ClassifierFCM ):
298
+ data_type = "categorical"
299
+ elif isinstance (causal_model .causal_mechanism (node ), DiscreteAdditiveNoiseModel ):
300
+ data_type = "discrete"
301
+
196
302
auto_assignment_summary .add_node_log_message (
197
303
node ,
198
- "Node %s is a non-root node. Assigning '%s' to the node." % (node , causal_model .causal_mechanism (node )),
304
+ "Node %s is a non-root node with %s data. Assigning '%s' to the node."
305
+ % (
306
+ node ,
307
+ data_type ,
308
+ causal_model .causal_mechanism (node ),
309
+ ),
199
310
)
200
311
201
- if isinstance (causal_model .causal_mechanism (node ), AdditiveNoiseModel ):
312
+ if isinstance (causal_model .causal_mechanism (node ), DiscreteAdditiveNoiseModel ):
313
+ auto_assignment_summary .add_node_log_message (
314
+ node ,
315
+ "This represents the discrete causal relationship as "
316
+ + str (node )
317
+ + " := f("
318
+ + "," .join ([str (parent ) for parent in get_ordered_predecessors (causal_model .graph , node )])
319
+ + ") + N." ,
320
+ )
321
+ elif isinstance (causal_model .causal_mechanism (node ), AdditiveNoiseModel ):
202
322
auto_assignment_summary .add_node_log_message (
203
323
node ,
204
324
"This represents the causal relationship as "
@@ -230,16 +350,21 @@ def assign_causal_mechanism_node(
230
350
causal_model .set_causal_mechanism (node , EmpiricalDistribution ())
231
351
model_performances = []
232
352
else :
353
+ node_data = based_on [node ].to_numpy ()
354
+
233
355
best_model , model_performances = select_model (
234
356
based_on [get_ordered_predecessors (causal_model .graph , node )].to_numpy (),
235
- based_on [ node ]. to_numpy () ,
357
+ node_data ,
236
358
quality ,
237
359
)
238
360
239
361
if isinstance (best_model , ClassificationModel ):
240
362
causal_model .set_causal_mechanism (node , ClassifierFCM (best_model ))
241
363
else :
242
- causal_model .set_causal_mechanism (node , AdditiveNoiseModel (best_model ))
364
+ if is_discrete (node_data ):
365
+ causal_model .set_causal_mechanism (node , DiscreteAdditiveNoiseModel (best_model ))
366
+ else :
367
+ causal_model .set_causal_mechanism (node , AdditiveNoiseModel (best_model ))
243
368
244
369
return model_performances
245
370
@@ -263,7 +388,7 @@ def select_model(
263
388
elif model_selection_quality == AssignmentQuality .GOOD :
264
389
list_of_regressor = list (_LIST_OF_POTENTIAL_REGRESSORS_GOOD )
265
390
list_of_classifier = list (_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD )
266
- model_selection_splits = 2
391
+ model_selection_splits = 5
267
392
elif model_selection_quality == AssignmentQuality .BETTER :
268
393
list_of_regressor = list (_LIST_OF_POTENTIAL_REGRESSORS_BETTER )
269
394
list_of_classifier = list (_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER )
0 commit comments