From b5f3d5bb18a80906851274049d7ea33b4443be9d Mon Sep 17 00:00:00 2001 From: Patrick Bloebaum Date: Wed, 22 Nov 2023 10:46:30 -0800 Subject: [PATCH] Add explicit support for discrete ANMs - Add new Discrete Additive Noise Model class that enforces the outputs to be discrete. This should help in generating more consistent data. - As part of this, revised the auto assignment function and revised its docstring. - Revise the auto assignment summary. - Revise the evaluation summary. Signed-off-by: Patrick Bloebaum --- .../modeling_gcm/model_evaluation.rst | 143 +++++++++----- dowhy/gcm/__init__.py | 2 +- dowhy/gcm/auto.py | 181 +++++++++++++++--- dowhy/gcm/causal_mechanisms.py | 48 ++++- dowhy/gcm/fitting_sampling.py | 41 +++- dowhy/gcm/model_evaluation.py | 21 +- dowhy/gcm/util/general.py | 9 + tests/gcm/test_auto.py | 125 ++++++++---- tests/gcm/test_fcms.py | 36 ++++ tests/gcm/test_model_evaluation.py | 26 ++- tests/gcm/test_whatif.py | 37 +++- tests/gcm/util/test_general.py | 12 ++ 12 files changed, 538 insertions(+), 143 deletions(-) diff --git a/docs/source/user_guide/modeling_gcm/model_evaluation.rst b/docs/source/user_guide/modeling_gcm/model_evaluation.rst index c7259abbe8..58c742c1d1 100644 --- a/docs/source/user_guide/modeling_gcm/model_evaluation.rst +++ b/docs/source/user_guide/modeling_gcm/model_evaluation.rst @@ -12,11 +12,17 @@ to misspecifications. Particularly, the impact of misspecification tends to be l Additionally, the severity of misspecifications can vary; for instance, defining an incorrect causal direction is generally more problematic than including too many (upstream) nodes as potential parents of a node. +Since a causal graph structure defines assumptions about the (conditional) independencies between variables, one can +falsify a given graph structure. For instance, in a chain X→Y→Z, we know that X and Z have to be independent +given Y. If this is not the case, we have some evidence that the given graph is wrong. However, on the other hand, +without stronger assumptions, we cannot confirm the correctness of a graph. Following the chain example, if we flip +both edges, the conditional independence statement (X independent of Z given Y) would still hold. + **Causal Mechanism Assumption:** To model the causal data generation process, we represent each node with a causal mechanism of the form :math:`X_i = f_i(PA_i, N_i)`, where :math:`N_i` denotes unobserved noise, and :math:`PA_i` represents the causal parents of :math:`X_i`. In this context, we require an additional assumption regarding the form of the function :math:`f_i`. For continuous variables, for instance, it is common to model :math:`f_i` using an -additive noise model of the form :math:`X_i = f_i(PA_i) + N_i`. However, this representation may not be accurate if the +`additive noise model `_ of the form :math:`X_i = f_i(PA_i) + N_i`. However, this representation may not be accurate if the true relationship is different (e.g., multiplicative). Thus, the type of causal mechanism is another factor that can influence the results. Generally, however, the additive noise model assumption in the continuous case tends to be relatively robust to violations in practice. @@ -34,8 +40,16 @@ performances in the selection process. Summary of auto assignment -------------------------- -If we use the auto assignment function, we obtain additional insights into the model selection process. 
To illustrate -this, consider the chain structure example X→Y→Z: + +If prior knowledge about causal relationships is available, it is always recommended to use that knowledge to model +the causal mechanisms accordingly. However, if one does not have enough insights, the auto assignment function for GCMs +can help automatically select an appropriate causal mechanism for each node based on the given data. The auto assignment +does two things: 1) Select an appropriate causal mechanism and 2) select the best performing model from a small model +zoo. + +When using the auto assignment function, we can obtain additional insights into the model selection process, such as +the types of causal mechanisms considered and the evaluated models with their performances. To illustrate this, consider +the chain structure example X→Y→Z again: >>> import numpy as np, pandas as pd >>> import networkx as nx @@ -50,34 +64,54 @@ this, consider the chain structure example X→Y→Z: >>> summary_auto_assignment = gcm.auto.assign_causal_mechanisms(causal_model, data) >>> print(summary_auto_assignment) -.. code-block:: +.. code-block:: text + + When using this auto assignment function, the given data is used to automatically assign a causal mechanism to each node. Note that causal mechanisms can also be customized and assigned manually. + The following types of causal mechanisms are considered for the automatic selection: + + If root node: + An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data. This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of data modalities. + + If non-root node and the data is continuous: + Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i.To select the best model for f, different regression models are evaluated and the model with the smallest mean squared error is selected.Note that minimizing the mean squared error here is equivalent to selecting the best choice of an ANM. + + If non-root node and the data is discrete: + Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional constraint for f to only return discrete values. + Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider representing them as strings to ensure proper model selection. + + If non-root node and the data is categorical: + A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i). + Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected. + + In total, 3 nodes were analyzed: - Analyzed 3 nodes. --- Node: X - Node X is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. + Node X is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: Y - Node Y is a non-root node. Assigning 'AdditiveNoiseModel using LinearRegression' to the node. + Node Y is a non-root node with continuous data. Assigning 'AdditiveNoiseModel using LinearRegression' to the node. This represents the causal relationship as Y := f(X) + N. 
For the model selection, the following models were evaluated on the mean squared error (MSE) metric: - LinearRegression: 1.0023387259040388 + LinearRegression: 0.9978767184153945 Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(include_bias=False)), - ('linearregression', LinearRegression)]): 1.0099017476403862 - HistGradientBoostingRegressor: 1.1091403766880177 - Based on the type of causal mechanism, the model with the lowest metric value represents the best choice. + ('linearregression', LinearRegression)]): 1.00448207264867 + HistGradientBoostingRegressor: 1.1386270868995179 --- Node: Z - Node Z is a non-root node. Assigning 'AdditiveNoiseModel using LinearRegression' to the node. + Node Z is a non-root node with continuous data. Assigning 'AdditiveNoiseModel using LinearRegression' to the node. This represents the causal relationship as Z := f(Y) + N. For the model selection, the following models were evaluated on the mean squared error (MSE) metric: - LinearRegression: 0.9451918596711175 + LinearRegression: 1.0240822102491627 Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(include_bias=False)), - ('linearregression', LinearRegression)]): 0.9488259577453813 - HistGradientBoostingRegressor: 1.682146254853607 - Based on the type of causal mechanism, the model with the lowest metric value represents the best choice. + ('linearregression', LinearRegression)]): 1.02567150836141 + HistGradientBoostingRegressor: 1.358002751994007 -In this scenario, an empirical distribution is assigned to the root node X, while additive noise models are applied -to nodes Y and Z. In both of these cases, a linear regression model demonstrated the best performance in terms + ===Note=== + Note, based on the selected auto assignment quality, the set of evaluated models changes. + For more insights toward the quality of the fitted graphical causal model, consider using the evaluate_causal_model function after fitting the causal mechanisms. + +In this scenario, an empirical distribution is assigned to the root node X, while additive noise models are used for +nodes Y and Z. In both of these cases, a linear regression model demonstrated the best performance in terms of minimizing the mean squared error. A list of evaluated models and their performance is also available. Since we used the default parameter for the auto assignment, only a small model zoo is evaluated. However, we can also adjust the assigment quality to extend it to more models. @@ -90,7 +124,7 @@ Evaluating a fitted GCM ----------------------- The causal model has been fitted and can be used for different causal questions. However, we might be interested in -obtaining some insights into the model performance first, i.e., we might wonder: +obtaining some insights into the causal model performance first, i.e., we might wonder: - How well do my causal mechanisms perform? - Is the additive noise model assumption even valid for my data? @@ -103,31 +137,41 @@ performance and whether our assumptions hold: >>> summary_evaluation = gcm.evaluate_causal_model(causal_model, data, compare_mechanism_baselines=True) >>> print(summary_evaluation) -.. code-block:: +.. image:: graph_evaluation.png + :alt: Causal Graph Falsification - Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and graph structure. The results are as follows: +.. 
code-block:: text + + Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and the graph structure. The results are as follows: ==== Evaluation of Causal Mechanisms ==== - Root nodes are evaluated based on the KL divergence between the generated and the observed distribution. - Non-root nodes are mainly evaluated based on the (normalized) Continuous Ranked Probability Score (CRPS), which is a generalizes the Mean Absolute Percentage Error to probabilistic predictions. Since the causal mechanisms produce conditional distributions, this should give some insights into their performance and calibration. In addition, the mean squared error (MSE), the normalized MSE (NMSE), the R2 coefficient and the F1 score (for categorical nodes) is reported. + The used evaluation metrics are: + - KL divergence (only for root-nodes): Evaluates the divergence between the generated and the observed distribution. + - Mean Squared Error (MSE): Evaluates the average squared differences between the observed values and the conditional expectation of the causal mechanisms. + - Normalized MSE (NMSE): The MSE normalized by the standard deviation for better comparison. + - R2 coefficient: Indicates how much variance is explained by the conditional expectations of the mechanisms. Note, however, that this can be misleading for nonlinear relationships. + - F1 score (only for categorical non-root nodes): The harmonic mean of the precision and recall indicating the goodness of the underlying classifier model. + - (normalized) Continuous Ranked Probability Score (CRPS): The CRPS generalizes the Mean Absolute Percentage Error to probabilistic predictions. This gives insights into the accuracy and calibration of the causal mechanisms. + NOTE: Every metric focuses on different aspects and they might not consistently indicate a good or bad performance. + We will mostly utilize the CRPS for comparing and interpreting the performance of the mechanisms, since this captures the most important properties for the causal model. --- Node X - - The KL divergence between generated and observed distribution is 0.009626590006593095. + - The KL divergence between generated and observed distribution is 0.04082997872632467. The estimated KL divergence indicates an overall very good representation of the data distribution. --- Node Y - - The MSE is 0.9757997114620423. - - The NMSE is 0.43990166981441525. - - The R2 coefficient is 0.8061235344428738. - - The normalized CRPS is 0.25017606839653783. + - The MSE is 0.9295878353315775. + - The NMSE is 0.44191515264388137. + - The R2 coefficient is 0.8038281270395207. + - The normalized CRPS is 0.25235753447337383. The estimated CRPS indicates a good model performance. The mechanism is better or equally good than all 7 baseline mechanisms. --- Node Z - - The MSE is 1.0203244742317465. - - The NMSE is 0.14823906495213202. - - The R2 coefficient is 0.9779316094447573. - - The normalized CRPS is 0.08426403180533645. + - The MSE is 0.9485970223031653. + - The NMSE is 0.14749131486369138. + - The R2 coefficient is 0.9781306148527433. + - The normalized CRPS is 0.08386782069483441. The estimated CRPS indicates a very good model performance. The mechanism is better or equally good than all 7 baseline mechanisms. 
@@ -142,7 +186,7 @@ performance and whether our assumptions hold: Note that these results are based on statistical independence tests, and the fact that the assumption was not rejected does not necessarily imply that it is correct. There is just no evidence against it. ==== Evaluation of Generated Distribution ==== - The overall average KL divergence between the generated and observed distribution is 0.0017936403551594468 + The overall average KL divergence between the generated and observed distribution is 0.015438550831663324 The estimated KL divergence indicates an overall very good representation of the data distribution. ==== Evaluation of the Causal Graph Structure ==== @@ -152,7 +196,7 @@ performance and whether our assumptions hold: | The given DAG is not informative because 2 / 6 of the permutations lie in the Markov | | equivalence class of the given DAG (p-value: 0.33). | | The given DAG violates 0/1 LMCs and is better than 66.7% of the permuted DAGs (p-value: 0.33). | - | Based on the provided significance level (0.05) and because the DAG is not informative, | + | Based on the provided significance level (0.2) and because the DAG is not informative, | | we do not reject the DAG. | +-------------------------------------------------------------------------------------------------------+ @@ -160,37 +204,36 @@ performance and whether our assumptions hold: Always double check the made model assumptions with respect to the graph structure and choice of causal mechanisms. All these evaluations give some insight into the goodness of the causal model, but should not be overinterpreted, since some causal relationships can be intrinsically hard to model. Furthermore, many algorithms are fairly robust against misspecifications or poor performances of causal mechanisms. -.. image:: graph_evaluation.png - :alt: Causal Graph Falsification - - As we see, we get a detailed overview of different evaluations: **Evaluation of Causal Mechanisms:** Evaluation of the causal mechanisms with respect to their model performance. -For non-root nodes, the most important measure is the Continuous Ranked Probability Score (CRPS), which provides +For non-root nodes, the most important measure is the (normalized) Continuous Ranked Probability Score (CRPS), which provides insights into the mechanism's accuracy and its calibration as a probabilistic model. It further lists other metrics such as the mean squared error (MSE), the MSE normalized by the variance (denoted as NMSE), the R2 coefficient and, in the case of categorical variables, the F1 score. If the node is a root node, the KL divergence between the generated and observed data distributions is measured. -Optionally, we can set the `compare_mechanism_baselines` parameter to `True` in order +Optionally, we can set the ``compare_mechanism_baselines`` parameter to ``True`` in order to compare the mechanisms with some baseline models. This gives us better insights into how the mechanisms perform in -comparison with other models. Note, however, that this can take significant time for larger graphs. +comparison to other models. Note, however, that this can take significant time for larger graphs. **Evaluation of Invertible Functional Causal Model Assumption:** If the causal mechanism is an invertible functional causal model, we can validate if the assumption holds true. 
Note that an invertible function here means with respect to -the noise, i.e., an additive noise model :math:`X_i = f_i(PA_i) + N_i` and, more generally, post non-linear models -:math:`X_i = g_i(f_i(PA_i) + N_i)` are examples for such types of mechanisms. In this case, the estimated noise based on -the observation should be independent of the inputs. +the noise, i.e., an additive noise model :math:`X_i = f_i(PA_i) + N_i` and, more generally, `post non-linear models `_ +:math:`X_i = g_i(f_i(PA_i) + N_i)` where :math:`g_i` is invertible are examples for such types of mechanisms. +In this case, the estimated noise based on the observation should be independent of the inputs. **Evaluation of Generated Distribution:** Since the GCM is able to generate new samples from the learned distributions, we can evaluate whether the generated (joint) distribution coincides with the observed one. Here, the difference should -be as small as possible. - -**Evaluation of the Causal Graph Structure:** The graph structure should represent the (conditional) independencies -in the observed data (assuming faithfulness). This can be exploited to obtain some insights on whether the given -graph violates the (in)dependence structures based on the data. For this, an algorithm is used that checks whether the -graph can be rejected. +be as small as possible. To make the KL divergence estimation practical for potentially large graphs, this is +approximated by taking the mean over the KL divergence between the generated and observed marginal distributions for +each node. + +**Evaluation of the Causal Graph Structure:** As discussed above, the graph structure should represent the (conditional) +independencies in the observed data (assuming faithfulness). This can be exploited to obtain some insights on whether +the given graph violates the (in)dependence structures based on the data by running different independence tests. For +this, an algorithm is used that checks whether the graph can be rejected and whether one is even able to obtain an +informative insight from such independence tests. Note that all these evaluation methods only provide some insights into the provided GCM, but cannot fully confirm the correctness of a learned model. 
More details about the metrics and evaluation methods, please see the corresponding diff --git a/dowhy/gcm/__init__.py b/dowhy/gcm/__init__.py index 735d5d40e8..4ab7758ced 100644 --- a/dowhy/gcm/__init__.py +++ b/dowhy/gcm/__init__.py @@ -10,7 +10,7 @@ MedianDeviationScorer, RescaledMedianCDFQuantileScorer, ) -from .causal_mechanisms import AdditiveNoiseModel, ClassifierFCM, PostNonlinearModel +from .causal_mechanisms import AdditiveNoiseModel, ClassifierFCM, DiscreteAdditiveNoiseModel, PostNonlinearModel from .causal_models import InvertibleStructuralCausalModel, ProbabilisticCausalModel, StructuralCausalModel from .confidence_intervals import confidence_intervals from .confidence_intervals_cms import bootstrap_sampling, fit_and_compute diff --git a/dowhy/gcm/auto.py b/dowhy/gcm/auto.py index 0d8021eceb..bce4639da1 100644 --- a/dowhy/gcm/auto.py +++ b/dowhy/gcm/auto.py @@ -14,7 +14,7 @@ from sklearn.preprocessing import MultiLabelBinarizer from dowhy.gcm import config -from dowhy.gcm.causal_mechanisms import AdditiveNoiseModel, ClassifierFCM +from dowhy.gcm.causal_mechanisms import AdditiveNoiseModel, ClassifierFCM, DiscreteAdditiveNoiseModel from dowhy.gcm.causal_models import CAUSAL_MECHANISM, ProbabilisticCausalModel, validate_causal_model_assignment from dowhy.gcm.ml import ( ClassificationModel, @@ -48,6 +48,7 @@ auto_apply_encoders, auto_fit_encoders, is_categorical, + is_discrete, set_random_seed, shape_into_2d, ) @@ -108,7 +109,43 @@ def add_model_performance(self, node, model: str, performance: str, metric_name: def __str__(self): summary_strings = [] - summary_strings.append("Analyzed %d nodes." % len(list(self._nodes))) + summary_strings.append( + "When using this auto assignment function, the given data is used to automatically assign a causal " + "mechanism to each node. Note that causal mechanisms can also be customized and assigned manually.\n" + "The following types of causal mechanisms are considered for the automatic selection:" + ) + summary_strings.append("\nIf root node:") + summary_strings.append( + "An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided " + "data. This provides a flexible and non-parametric way to model the marginal distribution and is valid for " + "all types of data modalities." + ) + summary_strings.append("\nIf non-root node and the data is continuous:") + summary_strings.append( + "Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the " + "parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i." + "To select the best model for f, different regression models are evaluated and the model " + "with the smallest mean squared error is selected." + "Note that minimizing the mean squared error here is equivalent to selecting the best " + "choice of an ANM." + ) + summary_strings.append("\nIf non-root node and the data is discrete:") + summary_strings.append( + "Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an " + "additional constraint for f to only return discrete values.\n" + "Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider " + "representing them as strings to ensure proper model selection." 
+ ) + summary_strings.append("\nIf non-root node and the data is categorical:") + summary_strings.append( + "A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).\n" + "Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a " + "class (category) using the conditional probability distribution produced by a " + "classification model." + "Here, different model classes are evaluated using the (negative) F1 score and the best" + " performing model class is selected." + ) + summary_strings.append("\nIn total, %d nodes were analyzed:" % len(list(self._nodes))) for node in self._nodes: summary_strings.append("\n--- Node: %s" % node) @@ -123,11 +160,13 @@ def __str__(self): for (model, performance, metric_name) in self._nodes[node]["model_performances"]: summary_strings.append("%s: %s" % (str(model()).replace("()", ""), str(performance))) - summary_strings.append( - "Based on the type of causal mechanism, the model with the lowest metric value " - "represents the best choice." - ) - + summary_strings.append( + "\n===Note===\nNote, based on the selected auto assignment quality, the set of " "evaluated models changes." + ) + summary_strings.append( + "For more insights toward the quality of the fitted graphical causal model, consider " + "using the evaluate_causal_model function after fitting the causal mechanisms." + ) return "\n".join(summary_strings) @@ -137,26 +176,86 @@ def assign_causal_mechanisms( quality: AssignmentQuality = AssignmentQuality.GOOD, override_models: bool = False, ) -> AutoAssignmentSummary: - """Automatically assigns appropriate causal models. If causal models are already assigned to nodes and - override_models is set to False, this function only validates the assignments with respect to the graph structure. - Here, the validation checks whether root nodes have StochasticModels and non-root ConditionalStochasticModels - assigned. + """Automatically assigns appropriate causal mechanisms to nodes. If causal mechanisms are already assigned to nodes + and override_models is set to False, this function only validates the assignments with respect to the graph + structure. This is, the validation checks whether root nodes have StochasticModels and non-root + ConditionalStochasticModels assigned. + + The following types of causal mechanisms are considered for the automatic selection: + + If root node: + An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data. + This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of + data modalities. + + If non-root node and the data is continuous: + Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved + noise N_i is assumed to be independent of PA_i. To select the best model for f, different regression models are + evaluated and the model with the smallest mean squared error is selected. Note that minimizing the mean squared + error here is equivalent to selecting the best choice of an ANM. + + If non-root node and the data is discrete: + Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional + constraint to return discrete values. Note that 'discrete' here refers to numerical values with an order. If the + data is categorical, consider representing them as strings to ensure proper model selection. 
+ + If non-root node and the data is categorical: + A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i). + Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the + conditional probability distribution produced by a classification model. Here, different model classes are evaluated + using the (negative) F1 score and the best performing model class is selected. + + The current model zoo is: + + With "GOOD" quality: + Numerical: + - Linear Regressor + - Linear Regressor with polynomial features + - Histogram Gradient Boost Regressor + + Categorical: + - Logistic Regressor + - Logistic Regressor with polynomial features + - Histogram Gradient Boost Classifier + + With "BETTER" quality: + Numerical: + - Linear Regressor + - Linear Regressor with polynomial features + - Gradient Boost Regressor + - Ridge Regressor + - Lasso Regressor + - Random Forest Regressor + - Support Vector Regressor + - Extra Trees Regressor + - KNN Regressor + - Ada Boost Regressor + + Categorical: + - Logistic Regressor + - Logistic Regressor with polynomial features + - Histogram Gradient Boost Classifier + - Random Forest Classifier + - Extra Trees Classifier + - Support Vector Classifier + - KNN Classifier + - Gaussian Naive Bayes Classifier + - Ada Boost Classifier + + With "BEST" quality: + An auto ML model based on AutoGluon (optional dependency, needs to be installed). :param causal_model: The causal model to whose nodes to assign causal models. :param based_on: Jointly sampled data corresponding to the nodes of the given graph. :param quality: AssignmentQuality for the automatic model selection and model accuracy. This changes the type of - prediction model and time spent on the selection. Options are: - - AssignmentQuality.GOOD: Compares a linear, polynomial and gradient boost model on small test-training split - of the data. The best performing model is then selected. + prediction model and time spent on the selection. See the docstring for a list of potential models. + The options for the quality are: + - AssignmentQuality.GOOD: Only a small set of models are evaluated. Model selection speed: Fast Model training speed: Fast Model inference speed: Fast Model accuracy: Medium - - AssignmentQuality.BETTER: Compares multiple model types and uses the one with the best performance - averaged over multiple splits of the training data. By default, the model with the smallest root mean - squared error is selected for regression problems and the model with the highest F1 score is selected for - classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS_BETTER and - _LIST_OF_POTENTIAL_CLASSIFIERS_BETTER, respectively. + - AssignmentQuality.BETTER: A larger set of models are evaluated. Model selection speed: Medium Model training speed: Fast Model inference speed: Fast @@ -168,8 +267,8 @@ def assign_causal_mechanisms( Model training speed: Slow Model inference speed: Slow-Medium Model accuracy: Best - :param override_models: If set to True, existing model assignments are replaced with automatically selected - ones. If set to False, the assigned models are only validated with respect to the graph + :param override_models: If set to True, existing mechanism assignments are replaced with automatically selected + ones. If set to False, the assigned mechanisms are only validated with respect to the graph structure. :return: A summary object containing details about the model selection process. 
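+
+    Example (illustrative): assuming data is a pandas DataFrame whose columns correspond to the nodes of
+    causal_model, a typical call looks like
+
+    >>> summary = assign_causal_mechanisms(causal_model, data, quality=AssignmentQuality.GOOD)
+    >>> print(summary)
+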
""" @@ -179,7 +278,8 @@ def assign_causal_mechanisms( if not override_models and CAUSAL_MECHANISM in causal_model.graph.nodes[node]: auto_assignment_summary.add_node_log_message( node, - "Node %s already has a model assigned and the override parameter is False. Skipping this node." % node, + "Node %s already has a causal mechanism assigned and the override parameter is False. Skipping this " + "node." % node, ) validate_causal_model_assignment(causal_model.graph, node) continue @@ -189,16 +289,36 @@ def assign_causal_mechanisms( if is_root_node(causal_model.graph, node): auto_assignment_summary.add_node_log_message( node, - "Node %s is a root node. Assigning '%s' to the node representing the marginal distribution." + "Node %s is a root node. Therefore, assigning '%s' to the node representing the marginal distribution." % (node, causal_model.causal_mechanism(node)), ) else: + data_type = "continuous" + if isinstance(causal_model.causal_mechanism(node), ClassifierFCM): + data_type = "categorical" + elif isinstance(causal_model.causal_mechanism(node), DiscreteAdditiveNoiseModel): + data_type = "discrete" + auto_assignment_summary.add_node_log_message( node, - "Node %s is a non-root node. Assigning '%s' to the node." % (node, causal_model.causal_mechanism(node)), + "Node %s is a non-root node with %s data. Assigning '%s' to the node." + % ( + node, + data_type, + causal_model.causal_mechanism(node), + ), ) - if isinstance(causal_model.causal_mechanism(node), AdditiveNoiseModel): + if isinstance(causal_model.causal_mechanism(node), DiscreteAdditiveNoiseModel): + auto_assignment_summary.add_node_log_message( + node, + "This represents the discrete causal relationship as " + + str(node) + + " := f(" + + ",".join([str(parent) for parent in get_ordered_predecessors(causal_model.graph, node)]) + + ") + N.", + ) + elif isinstance(causal_model.causal_mechanism(node), AdditiveNoiseModel): auto_assignment_summary.add_node_log_message( node, "This represents the causal relationship as " @@ -230,16 +350,21 @@ def assign_causal_mechanism_node( causal_model.set_causal_mechanism(node, EmpiricalDistribution()) model_performances = [] else: + node_data = based_on[node].to_numpy() + best_model, model_performances = select_model( based_on[get_ordered_predecessors(causal_model.graph, node)].to_numpy(), - based_on[node].to_numpy(), + node_data, quality, ) if isinstance(best_model, ClassificationModel): causal_model.set_causal_mechanism(node, ClassifierFCM(best_model)) else: - causal_model.set_causal_mechanism(node, AdditiveNoiseModel(best_model)) + if is_discrete(node_data): + causal_model.set_causal_mechanism(node, DiscreteAdditiveNoiseModel(best_model)) + else: + causal_model.set_causal_mechanism(node, AdditiveNoiseModel(best_model)) return model_performances @@ -263,7 +388,7 @@ def select_model( elif model_selection_quality == AssignmentQuality.GOOD: list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_GOOD) list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD) - model_selection_splits = 2 + model_selection_splits = 5 elif model_selection_quality == AssignmentQuality.BETTER: list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_BETTER) list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER) diff --git a/dowhy/gcm/causal_mechanisms.py b/dowhy/gcm/causal_mechanisms.py index e033a2753f..e1aba3eda2 100644 --- a/dowhy/gcm/causal_mechanisms.py +++ b/dowhy/gcm/causal_mechanisms.py @@ -8,7 +8,7 @@ from dowhy.gcm.ml import ClassificationModel, PredictionModel from dowhy.gcm.ml.regression import 
InvertibleFunction, SklearnRegressionModel
-from dowhy.gcm.util.general import is_categorical, shape_into_2d
+from dowhy.gcm.util.general import is_categorical, is_discrete, shape_into_2d
 
 
 class StochasticModel(ABC):
@@ -218,6 +218,52 @@ def __str__(self) -> str:
         return "AdditiveNoiseModel using %s" % prediction_model_string
 
 
+class DiscreteAdditiveNoiseModel(AdditiveNoiseModel):
+    """Implements a discrete ANM. That is, it follows a normal ANM of the form Y = f(X) + N, where N is assumed to be
+    independent of X and f is forced to output discrete values. To allow for flexible models, f can be any regression
+    model and the output will be rounded to a discrete value accordingly. Note that this remains a valid additive noise
+    model, but assumes that Y can take any integer value."""
+
+    def fit(self, X: np.ndarray, Y: np.ndarray) -> None:
+        if not is_discrete(Y):
+            raise ValueError("Cannot fit a discrete ANM to non-discrete target values!")
+
+        X, Y = shape_into_2d(X, Y)
+        Y = Y.astype(np.int32)
+
+        self._prediction_model.fit(X=X, Y=Y)
+        self._noise_model.fit(Y - self._rounded_prediction(X))
+
+    def evaluate(self, parent_samples: np.ndarray, noise_samples: np.ndarray) -> np.ndarray:
+        if not is_discrete(noise_samples):
+            raise ValueError("Noise values have to be discrete!")
+
+        parent_samples, noise_samples = shape_into_2d(parent_samples, noise_samples)
+        predictions = shape_into_2d(self._rounded_prediction(parent_samples))
+
+        return predictions + noise_samples
+
+    def estimate_noise(self, target_samples: np.ndarray, parent_samples: np.ndarray) -> np.ndarray:
+        if not is_discrete(target_samples):
+            raise ValueError("Target samples have to be discrete!")
+
+        target_samples, parent_samples = shape_into_2d(target_samples, parent_samples)
+
+        return target_samples - self._rounded_prediction(parent_samples)
+
+    def _rounded_prediction(self, X: np.ndarray) -> np.ndarray:
+        return np.round(self._prediction_model.predict(X).astype(float)).astype(np.int32)
+
+    def clone(self):
+        return DiscreteAdditiveNoiseModel(
+            prediction_model=self.prediction_model.clone(),
+            noise_model=self.noise_model.clone(),
+        )
+
+    def __str__(self) -> str:
+        return "Discrete " + super().__str__()
+
+
 class ProbabilityEstimatorModel(ABC):
     @abstractmethod
     def estimate_probabilities(self, parent_samples: np.ndarray) -> np.ndarray:
diff --git a/dowhy/gcm/fitting_sampling.py b/dowhy/gcm/fitting_sampling.py
index d8758a45cb..87e8aebf02 100644
--- a/dowhy/gcm/fitting_sampling.py
+++ b/dowhy/gcm/fitting_sampling.py
@@ -17,11 +17,31 @@
 from dowhy.graph import get_ordered_predecessors, is_root_node
 
 
-def fit(causal_model: ProbabilisticCausalModel, data: pd.DataFrame):
-    """Learns generative causal models of nodes in the causal graph from data.
-
-    :param causal_model: The causal model containing the mechanisms that will be fitted.
+def fit(causal_model: ProbabilisticCausalModel, data: pd.DataFrame, return_evaluation_summary: bool = False):
+    """Fits the causal mechanism of each node to the data. This is done by iterating over the nodes in the graph and
+    fitting their assigned causal mechanisms individually to the data by calling the corresponding fit function. Due to
+    the modularity assumption, we can fit each mechanism in the graph independently of the other mechanisms. For root
+    nodes, the training data is the corresponding column in the provided data. For non-root nodes, the data is based on
+    a node's parents and the node itself. 
Before a node is fitted, this function first validates whether the assigned + mechanism is valid, i.e., whether a root node follows a StochasticModel and whether a non-root node follows a + ConditionalStochasticModel. + + The details of fitting a causal mechanism depend on their implementation. For example, if a node follows an additive + noise model X_i = f_i(PA_i) + N_i, where N_i is unobserved noise, the fitting involves fitting the function f_i + (which could be any scikit-learn regressor) to the data and modeling the distribution N_i based on the residuals + X_i - f_i(PA_i). For more details on how each individual mechanism is fitted, refer to the corresponding + documentation, since these are individual implementation details. + + This function optionally, returns a summary of different metrics of the causal mechanisms evaluated via + cross-validation. Note, this will use the evaluate_causal_model method. For more detailed and extensive evaluations, + consider using the evaluate_causal_model method directly. + + :param causal_model: The causal model containing the mechanisms of the node that will be fitted. :param data: Observations of nodes in the causal model. + :param return_evaluation_summary: If True, returns a summary of the performances of the fitted mechanisms using the + evaluate_causal_model method. If False, nothing is returned. + :return: Optionally, a CausalModelEvaluationResult summarizing the performances of the causal mechanisms via + cross-validation. """ progress_bar = tqdm( causal_model.graph.nodes, @@ -41,6 +61,19 @@ def fit(causal_model: ProbabilisticCausalModel, data: pd.DataFrame): fit_causal_model_of_target(causal_model, node, data) + if return_evaluation_summary: + from dowhy.gcm import evaluate_causal_model + + return evaluate_causal_model( + causal_model, + data, + evaluate_causal_mechanisms=True, + compare_mechanism_baselines=False, + evaluate_invertibility_assumptions=False, + evaluate_overall_kl_divergence=False, + evaluate_causal_structure=False, + ) + def fit_causal_model_of_target( causal_model: ProbabilisticCausalModel, target_node: Any, training_data: pd.DataFrame diff --git a/dowhy/gcm/model_evaluation.py b/dowhy/gcm/model_evaluation.py index 604372f53d..1a1d62612f 100644 --- a/dowhy/gcm/model_evaluation.py +++ b/dowhy/gcm/model_evaluation.py @@ -183,7 +183,7 @@ def __str__(self): if self.overall_kl_divergence is not None: summary_string += " and the overall average KL divergence between generated and observed distribution" if self.graph_falsification is not None: - summary_string += " and graph structure" + summary_string += " and the graph structure" summary_string += ". The results are as follows:" summary_strings = [summary_string] @@ -191,16 +191,15 @@ def __str__(self): if self.mechanism_performances is not None: summary_strings.append("\n==== Evaluation of Causal Mechanisms ====") summary_strings.append( - "Root nodes are evaluated based on the KL divergence between the generated " - "and the observed distribution." - ) - summary_strings.append( - "Non-root nodes are mainly evaluated based on the (normalized) Continuous Ranked Probability Score " - "(CRPS), which is a generalizes the Mean Absolute Percentage Error to probabilistic " - "predictions. Since the causal mechanisms produce conditional distributions, this " - "should give some insights into their performance and calibration. In addition, the mean squared error " - "(MSE), the normalized MSE (NMSE), the R2 coefficient and the F1 score (for categorical nodes) is " - "reported." 
+ "The used evaluation metrics are:\n" + "- KL divergence (only for root-nodes): Evaluates the divergence between the generated and the observed distribution.\n" + "- Mean Squared Error (MSE): Evaluates the average squared differences between the observed values and the conditional expectation of the causal mechanisms.\n" + "- Normalized MSE (NMSE): The MSE normalized by the standard deviation for better comparison.\n" + "- R2 coefficient: Indicates how much variance is explained by the conditional expectations of the mechanisms. Note, however, that this can be misleading for nonlinear relationships.\n" + "- F1 score (only for categorical non-root nodes): The harmonic mean of the precision and recall indicating the goodness of the underlying classifier model.\n" + "- (normalized) Continuous Ranked Probability Score (CRPS): The CRPS generalizes the Mean Absolute Percentage Error to probabilistic predictions. This gives insights into the accuracy and calibration of the causal mechanisms.\n" + "NOTE: Every metric focuses on different aspects and they might not consistently indicate a good or bad performance.\n" + "We will mostly utilize the CRPS for comparing and interpreting the performance of the mechanisms, since this captures the most important properties for the causal model." ) for mechanism_performance in self.mechanism_performances.values(): diff --git a/dowhy/gcm/util/general.py b/dowhy/gcm/util/general.py index 97d43c7bf8..15c6618186 100644 --- a/dowhy/gcm/util/general.py +++ b/dowhy/gcm/util/general.py @@ -196,6 +196,15 @@ def has_categorical(X: np.ndarray) -> bool: return False +def is_discrete(X: np.ndarray) -> bool: + """Checks if all values in the given array are discrete. + + :param X: Input array to check. + :return: True if all values in the input are discrete, False otherwise. + """ + return np.all(X == np.floor(X)) + + def setdiff2d(ar1: np.ndarray, ar2: np.ndarray, assume_unique: bool = False) -> np.ndarray: """This method generalizes numpy's setdiff1d to 2d, i.e., it compares vectors for arbitrary length. 
See https://numpy.org/doc/stable/reference/generated/numpy.setdiff1d.html for more details.""" diff --git a/tests/gcm/test_auto.py b/tests/gcm/test_auto.py index db7c18e6bf..930a56bbd4 100644 --- a/tests/gcm/test_auto.py +++ b/tests/gcm/test_auto.py @@ -10,7 +10,15 @@ from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import Pipeline -from dowhy.gcm import ProbabilisticCausalModel, StructuralCausalModel, draw_samples, fit +from dowhy.gcm import ( + AdditiveNoiseModel, + DiscreteAdditiveNoiseModel, + EmpiricalDistribution, + ProbabilisticCausalModel, + StructuralCausalModel, + draw_samples, + fit, +) from dowhy.gcm.auto import AssignmentQuality, assign_causal_mechanisms, has_linear_relationship @@ -206,6 +214,20 @@ def test_given_polynomial_classification_data_with_categorical_input_when_auto_a assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD, override_models=True) +def test_given_continuous_and_discrete_data_when_auto_assign_then_correct_assigns_discrete_anm(): + causal_model = ProbabilisticCausalModel(nx.DiGraph([("X", "Y"), ("Y", "Z")])) + data = { + "X": np.random.normal(0, 1, 100), + "Y": np.random.choice(2, 100, replace=True), + "Z": np.random.normal(0, 1, 100), + } + + assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD) + assert isinstance(causal_model.causal_mechanism("X"), EmpiricalDistribution) + assert isinstance(causal_model.causal_mechanism("Y"), DiscreteAdditiveNoiseModel) + assert isinstance(causal_model.causal_mechanism("Z"), AdditiveNoiseModel) + + def test_when_auto_called_from_main_namespace_returns_no_attribute_error(): from dowhy import gcm @@ -343,40 +365,51 @@ def test_given_continuous_data_when_print_auto_summary_then_returns_expected_for assert len(summary_result._nodes["X4"]["model_performances"]) == 0 assert len(summary_result._nodes["Y"]["model_performances"]) > 0 - expected_summary = """Analyzed 6 nodes. + assert ( + """When using this auto assignment function, the given data is used to automatically assign a causal mechanism to each node. Note that causal mechanisms can also be customized and assigned manually. +The following types of causal mechanisms are considered for the automatic selection: + +If root node: +An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data. This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of data modalities. + +If non-root node and the data is continuous: +Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i.To select the best model for f, different regression models are evaluated and the model with the smallest mean squared error is selected.Note that minimizing the mean squared error here is equivalent to selecting the best choice of an ANM. + +If non-root node and the data is discrete: +Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional constraint for f to only return discrete values. +Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider representing them as strings to ensure proper model selection. + +If non-root node and the data is categorical: +A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i). 
+Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected. + +In total, 6 nodes were analyzed: --- Node: X0 -Node X0 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X0 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X1 -Node X1 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X1 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X2 -Node X2 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X2 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X3 -Node X3 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X3 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X4 -Node X4 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X4 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: Y -Node Y is a non-root node. Assigning 'AdditiveNoiseModel using HistGradientBoostingRegressor' to the node. -This represents the causal relationship as Y := f(X0,X1,X2,X3,X4) + N. -For the model selection, the following models were evaluated on the mean squared error (MSE) metric: -* -Based on the type of causal mechanism, the model with the lowest metric value represents the best choice.""" - - assert ( - summary_string.split( - "For the model selection, the following models were evaluated on the mean squared error (MSE) metric:" - )[0] - == expected_summary.split( - "For the model selection, the following models were evaluated on the mean squared error (MSE) metric:" - )[0] +Node Y is a non-root node with continuous data. Assigning 'AdditiveNoiseModel using """ + in summary_string ) + assert "This represents the causal relationship as Y := f(X0,X1,X2,X3,X4) + N." in summary_string + assert "For the model selection, the following models were evaluated on the mean squared error (MSE) metric:" assert ( - "Based on the type of causal mechanism, the model with the lowest metric value represents the best choice." + """===Note=== +Note, based on the selected auto assignment quality, the set of evaluated models changes. +For more insights toward the quality of the fitted graphical causal model, consider using the evaluate_causal_model function after fitting the causal mechanisms.""" in summary_string ) @@ -408,40 +441,50 @@ def test_given_categorical_data_when_print_auto_summary_then_returns_expected_fo assert len(summary_result._nodes["X4"]["model_performances"]) == 0 assert len(summary_result._nodes["Y"]["model_performances"]) > 0 - expected_summary = """Analyzed 6 nodes. + assert ( + """The following types of causal mechanisms are considered for the automatic selection: + +If root node: +An empirical distribution, i.e., the distribution is represented by randomly sampling from the provided data. 
This provides a flexible and non-parametric way to model the marginal distribution and is valid for all types of data modalities. + +If non-root node and the data is continuous: +Additive Noise Models (ANM) of the form X_i = f(PA_i) + N_i, where PA_i are the parents of X_i and the unobserved noise N_i is assumed to be independent of PA_i.To select the best model for f, different regression models are evaluated and the model with the smallest mean squared error is selected.Note that minimizing the mean squared error here is equivalent to selecting the best choice of an ANM. + +If non-root node and the data is discrete: +Discrete Additive Noise Models have almost the same definition as non-discrete ANMs, but come with an additional constraint for f to only return discrete values. +Note that 'discrete' here refers to numerical values with an order. If the data is categorical, consider representing them as strings to ensure proper model selection. + +If non-root node and the data is categorical: +A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i). +Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected. + +In total, 6 nodes were analyzed: --- Node: X0 -Node X0 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X0 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X1 -Node X1 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X1 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X2 -Node X2 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X2 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X3 -Node X3 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X3 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: X4 -Node X4 is a root node. Assigning 'Empirical Distribution' to the node representing the marginal distribution. +Node X4 is a root node. Therefore, assigning 'Empirical Distribution' to the node representing the marginal distribution. --- Node: Y -Node Y is a non-root node. Assigning 'Classifier FCM based on LogisticRegression(max_iter=10000)' to the node. -This represents the causal relationship as Y := f(X0,X1,X2,X3,X4,N). -For the model selection, the following models were evaluated on the (negative) F1 metric: -* -Based on the type of causal mechanism, the model with the lowest metric value represents the best choice.""" - - assert ( - summary_string.split( - "For the model selection, the following models were evaluated on the (negative) F1 metric:" - )[0] - == expected_summary.split( - "For the model selection, the following models were evaluated on the (negative) F1 metric:" - )[0] +Node Y is a non-root node with categorical data. Assigning 'Classifier FCM based on """ + in summary_string ) + assert "This represents the causal relationship as Y := f(X0,X1,X2,X3,X4,N)." 
in summary_string + assert "For the model selection, the following models were evaluated on the (negative) F1 metric:" in summary_string assert ( - "Based on the type of causal mechanism, the model with the lowest metric value represents the best choice." + """===Note=== +Note, based on the selected auto assignment quality, the set of evaluated models changes. +For more insights toward the quality of the fitted graphical causal model, consider using the evaluate_causal_model function after fitting the causal mechanisms.""" in summary_string ) diff --git a/tests/gcm/test_fcms.py b/tests/gcm/test_fcms.py index 64ab7f844f..b72d159941 100644 --- a/tests/gcm/test_fcms.py +++ b/tests/gcm/test_fcms.py @@ -12,6 +12,7 @@ from dowhy.gcm import ( AdditiveNoiseModel, ClassifierFCM, + DiscreteAdditiveNoiseModel, EmpiricalDistribution, PostNonlinearModel, ProbabilisticCausalModel, @@ -33,6 +34,7 @@ InvertibleIdentityFunction, InvertibleLogarithmicFunction, ) +from dowhy.gcm.util.general import is_discrete def test_given_linear_data_when_fit_causal_graph_with_linear_anm_then_learns_correct_coefficients(): @@ -241,6 +243,40 @@ def test_given_logarithmic_data_when_fit_post_non_linear_sem_with_invertible_log assert sem_fitted.prediction_model.sklearn_model.coef_ == approx(np.array([2]), abs=0.05) +@flaky(max_runs=3) +def test_given_discrete_target_data_when_fit_discrete_additive_noise_model_then_behaves_as_expected(): + X = np.random.normal(0, 1, (1000, 2)) + X[X > 3] = 3 + X[X < -3] = -3 + Y = np.round(np.sum(X, axis=1)) + + danm = DiscreteAdditiveNoiseModel(create_linear_regressor()) + danm.fit(X, Y) + + test_X = np.random.normal(0, 1, (1000, 2)) + test_X[test_X > 3] = 3 + test_X[test_X < -3] = -3 + test_Y = np.round(np.sum(test_X, axis=1)).reshape(-1) + + assert danm.evaluate(test_X, np.zeros(1000)).reshape(-1) == approx(test_Y, abs=3) + assert danm.evaluate(test_X, np.zeros(1000)).reshape(-1) == approx(test_Y, abs=3) + assert is_discrete(danm.draw_samples(test_X)) + + assert danm.estimate_noise(np.array([0, 1, 2]), np.array([[-1, 1], [0, 0], [0, 1]])).reshape(-1) == approx( + np.array([0, 1, 1]) + ) + + X = np.array([0.1, 10.5, 20, 30.7, 40.3]) + Y = np.floor(X) # Y has only 0, 10, 20, 30, 40 + + danm = DiscreteAdditiveNoiseModel(create_linear_regressor()) + danm.fit(X, Y) + + assert danm.evaluate(np.array([-100, -32.4, 0.4, 4, 9, 11, 30.1, 101.4, 0.9]), np.zeros(9)) == approx( + [-100, -32, 0, 4, 9, 11, 30, 101, 1] + ) + + def _generate_data_with_categorical_input(): X0 = np.random.normal(0, 1, 1000) X1 = np.random.choice(3, 1000).astype(str) diff --git a/tests/gcm/test_model_evaluation.py b/tests/gcm/test_model_evaluation.py index 6ce9d00a47..225e00c444 100644 --- a/tests/gcm/test_model_evaluation.py +++ b/tests/gcm/test_model_evaluation.py @@ -234,11 +234,18 @@ def test_given_continuous_data_only_when_evaluate_model_returns_expected_informa summary_string = str(summary) assert ( - """Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and graph structure. The results are as follows: + """Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and the graph structure. 
The results are as follows: ==== Evaluation of Causal Mechanisms ==== -Root nodes are evaluated based on the KL divergence between the generated and the observed distribution. -Non-root nodes are mainly evaluated based on the (normalized) Continuous Ranked Probability Score (CRPS), which is a generalizes the Mean Absolute Percentage Error to probabilistic predictions. Since the causal mechanisms produce conditional distributions, this should give some insights into their performance and calibration. In addition, the mean squared error (MSE), the normalized MSE (NMSE), the R2 coefficient and the F1 score (for categorical nodes) is reported.""" +The used evaluation metrics are: +- KL divergence (only for root-nodes): Evaluates the divergence between the generated and the observed distribution. +- Mean Squared Error (MSE): Evaluates the average squared differences between the observed values and the conditional expectation of the causal mechanisms. +- Normalized MSE (NMSE): The MSE normalized by the standard deviation for better comparison. +- R2 coefficient: Indicates how much variance is explained by the conditional expectations of the mechanisms. Note, however, that this can be misleading for nonlinear relationships. +- F1 score (only for categorical non-root nodes): The harmonic mean of the precision and recall indicating the goodness of the underlying classifier model. +- (normalized) Continuous Ranked Probability Score (CRPS): The CRPS generalizes the Mean Absolute Percentage Error to probabilistic predictions. This gives insights into the accuracy and calibration of the causal mechanisms. +NOTE: Every metric focuses on different aspects and they might not consistently indicate a good or bad performance. +We will mostly utilize the CRPS for comparing and interpreting the performance of the mechanisms, since this captures the most important properties for the causal model.""" in summary_string ) assert "--- Node X0\n" "- The KL divergence between generated and observed distribution is " in summary_string @@ -328,11 +335,18 @@ def test_given_categorical_data_only_when_evaluate_model_returns_expected_inform summary_string = str(summary) assert ( - """Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and graph structure. The results are as follows: + """Evaluated the performance of the causal mechanisms and the invertibility assumption of the causal mechanisms and the overall average KL divergence between generated and observed distribution and the graph structure. The results are as follows: ==== Evaluation of Causal Mechanisms ==== -Root nodes are evaluated based on the KL divergence between the generated and the observed distribution. -Non-root nodes are mainly evaluated based on the (normalized) Continuous Ranked Probability Score (CRPS), which is a generalizes the Mean Absolute Percentage Error to probabilistic predictions. Since the causal mechanisms produce conditional distributions, this should give some insights into their performance and calibration. In addition, the mean squared error (MSE), the normalized MSE (NMSE), the R2 coefficient and the F1 score (for categorical nodes) is reported.""" +The used evaluation metrics are: +- KL divergence (only for root-nodes): Evaluates the divergence between the generated and the observed distribution. 
+- Mean Squared Error (MSE): Evaluates the average squared differences between the observed values and the conditional expectation of the causal mechanisms. +- Normalized MSE (NMSE): The MSE normalized by the standard deviation for better comparison. +- R2 coefficient: Indicates how much variance is explained by the conditional expectations of the mechanisms. Note, however, that this can be misleading for nonlinear relationships. +- F1 score (only for categorical non-root nodes): The harmonic mean of the precision and recall indicating the goodness of the underlying classifier model. +- (normalized) Continuous Ranked Probability Score (CRPS): The CRPS generalizes the Mean Absolute Percentage Error to probabilistic predictions. This gives insights into the accuracy and calibration of the causal mechanisms. +NOTE: Every metric focuses on different aspects and they might not consistently indicate a good or bad performance. +We will mostly utilize the CRPS for comparing and interpreting the performance of the mechanisms, since this captures the most important properties for the causal model.""" in summary_string ) assert "--- Node X0\n" "- The KL divergence between generated and observed distribution is " in summary_string diff --git a/tests/gcm/test_whatif.py b/tests/gcm/test_whatif.py index 5dbdf90f5a..345780bef9 100644 --- a/tests/gcm/test_whatif.py +++ b/tests/gcm/test_whatif.py @@ -8,6 +8,7 @@ from dowhy.gcm import ( AdditiveNoiseModel, ClassifierFCM, + DiscreteAdditiveNoiseModel, EmpiricalDistribution, InvertibleStructuralCausalModel, ProbabilisticCausalModel, @@ -17,7 +18,11 @@ fit, interventional_samples, ) -from dowhy.gcm.ml import create_linear_regressor, create_logistic_regression_classifier +from dowhy.gcm.ml import ( + create_hist_gradient_boost_regressor, + create_linear_regressor, + create_logistic_regression_classifier, +) def _create_and_fit_simple_probabilistic_causal_model(): @@ -243,3 +248,33 @@ def test_given_binary_target_when_estimate_average_causal_effect_then_return_exp interventions_reference={"T": lambda x: 0}, num_samples_to_draw=1000, ) == approx(0.5, abs=0.1) + + +@flaky(max_runs=3) +def test_given_discrete_data_when_performing_interventions_then_returns_correct_samples(): + X = np.random.normal(0, 1, 1000) + Y = [] + for x in X: + if x < -1.5: + Y.append(-1) + elif -1.5 <= x <= 1.5: + Y.append(0) + else: + Y.append(1) + Y = np.array(Y) + Z = 2 * Y + np.random.normal(0, 0.1, 1000) + + causal_model = ProbabilisticCausalModel(nx.DiGraph([("X", "Y"), ("Y", "Z")])) + causal_model.set_causal_mechanism("X", EmpiricalDistribution()) + causal_model.set_causal_mechanism( + "Y", DiscreteAdditiveNoiseModel(prediction_model=create_hist_gradient_boost_regressor()) + ) + causal_model.set_causal_mechanism("Z", AdditiveNoiseModel(prediction_model=create_linear_regressor())) + data = pd.DataFrame({"X": X, "Y": Y, "Z": Z}) + + fit(causal_model, data) + + samples = interventional_samples(causal_model, {"X": lambda x: -2}, num_samples_to_draw=1000) + assert np.all(samples["X"].to_numpy() == -2) + assert np.median(samples["Y"].to_numpy()) == -1 + assert np.mean(samples["Z"].to_numpy()) == approx(-2, abs=0.05) diff --git a/tests/gcm/util/test_general.py b/tests/gcm/util/test_general.py index 74904ecaf1..f29df56585 100644 --- a/tests/gcm/util/test_general.py +++ b/tests/gcm/util/test_general.py @@ -16,6 +16,7 @@ fit_one_hot_encoders, has_categorical, is_categorical, + is_discrete, set_random_seed, setdiff2d, shape_into_2d, @@ -207,3 +208,14 @@ def 
test_given_categorical_data_when_using_auto_fit_and_apply_encoder_then_retur ] ).T ) + + +def test_given_discrete_data_when_calling_is_discrete_then_returns_true(): + assert is_discrete(np.array([0, -4, 5, 10])) + assert is_discrete(np.array([0, -4, 5, 10]).reshape(-1, 1)) + + +def test_given_non_discrete_data_when_calling_is_discrete_then_returns_false(): + assert not is_discrete(np.array([0, -4, 5, 10, 1.0000000001, 0.000000001, 10**-15, 99.9, 40.5])) + assert not is_discrete(np.array([10**-15])) + assert not is_discrete(np.array([0, -4, 5, 10, 1.0000000001, 0.000000001, 10**-15, 99.9, 40.5]).reshape(-1, 1))
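
For reference, a minimal end-to-end sketch of the new discrete ANM support, pieced together from the tests added in
tests/gcm/test_auto.py and tests/gcm/test_whatif.py above. The graph, column names, and intervention value are
illustrative only; the point is that an integer-valued (but non-categorical) column is expected to be assigned a
DiscreteAdditiveNoiseModel automatically and to keep producing integer values when sampling:

>>> import networkx as nx, numpy as np, pandas as pd
>>> from dowhy import gcm
>>> X = np.random.normal(0, 1, 1000)
>>> Y = np.round(X)                                # integer-valued, i.e., "discrete" in the sense used above
>>> Z = 2 * Y + np.random.normal(0, 0.1, 1000)     # continuous child
>>> data = pd.DataFrame({"X": X, "Y": Y, "Z": Z})
>>> causal_model = gcm.ProbabilisticCausalModel(nx.DiGraph([("X", "Y"), ("Y", "Z")]))
>>> gcm.auto.assign_causal_mechanisms(causal_model, data)  # Y is expected to get a DiscreteAdditiveNoiseModel
>>> gcm.fit(causal_model, data)
>>> samples = gcm.interventional_samples(causal_model, {"X": lambda x: 2}, num_samples_to_draw=1000)
>>> sorted(samples["Y"].unique())                  # generated values for Y should remain integer-valued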