Debugging models chapter

SEATBELTS page has been moved here. Also corrected an issue in recent CMake changes where files containing special chars in their name were no longer supported. In testing, I was unable to find a way to breakpoint into RTC agent functions on Windows.
FLAMEGPU · Aug 31, 2022 · 3856877 · 3856877
1 parent 4590b7a
commit 3856877
Show file tree

Hide file tree

Showing 21 changed files with 531 additions and 8 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -49,6 +49,23 @@ set(DOCS_SRC_FILES
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/agent-functions/modifying-agent-variables.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/agent-functions/random-numbers.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/creating-a-model/index.rst"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/index.rst"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/logging.rst"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/NB_cycle_stages_6(x512)_a.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/NB_cycle_stages_6(x512)_b.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/NB_living_count(x512)_a.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/NB_living_count(x512)_b.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/NB_MYCN(x512)_a.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/NB_MYCN(x512)_b.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/NB_telo_count(x512)_a.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/NB_telo_count(x512)_b.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/printf.rst"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/seatbelts.rst"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/using-a-debugger.rst"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/visual_studio_attach_to_process_dialog.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/visual_studio_build_config.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/visual_studio_startup_project.png"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/debugging-models/visual_studio_start_debugger.png"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/defining-agents/index.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/defining-execution-order/dependency-graph.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/defining-execution-order/exit-conditions.rst"
@@ -74,7 +91,6 @@ set(DOCS_SRC_FILES
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/running-a-simulation/index.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/running-a-simulation/initial-state.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/running-multiple-simulations/index.rst"
-    "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/seatbelts/index.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/visualisation/adding-details.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/visualisation/building-with-vis.rst"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/guide/visualisation/index.rst"
@@ -105,8 +121,8 @@ foreach(SRC_FILE IN LISTS DOCS_SRC_FILES)
     add_custom_command(
         OUTPUT  ${BUILD_SRC_FILE}
         DEPENDS ${SRC_FILE} ${CMAKE_CURRENT_BINARY_DIR}/src/index.rst
-        COMMAND ${CMAKE_COMMAND} -E copy ${SRC_FILE} ${BUILD_SRC_FILE}
-        COMMENT "Copying ${SRC_FILE} to ${BUILD_SRC_FILE}"
+        COMMAND ${CMAKE_COMMAND} -E copy "\"${SRC_FILE}\"" "\"${BUILD_SRC_FILE}\""
+        COMMENT "Copying '${SRC_FILE}' to '${BUILD_SRC_FILE}'"
     )
     list(APPEND BUILD_SRC_DEPENDS "${BUILD_SRC_FILE}")
     unset(BUILD_SRC_FILE)

diff --git a/src/guide/agent-functions/defining-agent-functions.rst b/src/guide/agent-functions/defining-agent-functions.rst
@@ -37,6 +37,8 @@ For compile time (i.e. non-RTC functions), when using the C++ API, the :c:macro:
         // ...
     }
 
+.. _Runtime Compiled Agent Functions:
+
 C++ and Python Runtime Compiled Agent Functions
 -----------------------------------------------
 

diff --git a/src/guide/debugging-models/NB_MYCN(x512)_a.png b/src/guide/debugging-models/NB_MYCN(x512)_a.png
diff --git a/src/guide/debugging-models/NB_MYCN(x512)_b.png b/src/guide/debugging-models/NB_MYCN(x512)_b.png
diff --git a/src/guide/debugging-models/NB_cycle_stages_6(x512)_a.png b/src/guide/debugging-models/NB_cycle_stages_6(x512)_a.png
diff --git a/src/guide/debugging-models/NB_cycle_stages_6(x512)_b.png b/src/guide/debugging-models/NB_cycle_stages_6(x512)_b.png
diff --git a/src/guide/debugging-models/NB_living_count(x512)_a.png b/src/guide/debugging-models/NB_living_count(x512)_a.png
diff --git a/src/guide/debugging-models/NB_living_count(x512)_b.png b/src/guide/debugging-models/NB_living_count(x512)_b.png
diff --git a/src/guide/debugging-models/NB_telo_count(x512)_a.png b/src/guide/debugging-models/NB_telo_count(x512)_a.png
diff --git a/src/guide/debugging-models/NB_telo_count(x512)_b.png b/src/guide/debugging-models/NB_telo_count(x512)_b.png
diff --git a/src/guide/debugging-models/index.rst b/src/guide/debugging-models/index.rst
@@ -0,0 +1,17 @@
+.. _DebuggingModels:
+
+Debugging Models
+================
+
+Implementing large models can be challenging, therefore it's likely you will need to debug your model during development to hunt down the cause of unwanted behaviours. Furthermore, once you have completed your model it's likely that you will need to validate and calibrate your model, which may follow a similar process.
+
+This chapter has been broken up into several sections, each detailing a different approach to debugging FLAME GPU 2 models:
+
+
+.. toctree::
+   :maxdepth: 1 
+
+   seatbelts.rst
+   printf.rst
+   logging.rst
+   using-a-debugger.rst
diff --git a/src/guide/debugging-models/logging.rst b/src/guide/debugging-models/logging.rst
@@ -0,0 +1,245 @@
+.. _DebuggingModelsLogging:
+
+Logging Timeseries Data
+=======================
+
+:ref:`Logging<Configuring Data to be Logged>` can be used to collect timeseries data, such as how the size of agent populations change throughout a model's execution. Furthermore, with :ref:`ensembles<ensembles>` this data can be collected across a batch of runs, especially useful when dealing with stochastic models.
+
+Use of logging in this manner is particularly useful when porting a pre-existing model to FLAME GPU 2, and is an approach we have used widely to validate that the behaviour of our FLAME GPU 2 models matches that of the original.
+
+1. Decide a model configuration which can be executed in both versions of the model
+2. Execute both models ~100+ times, with a variety of different random seeds.
+3. From these executions, collect timeseries data for a range of important model values. These might be population sizes, environment properties or the mean of agent variables.
+4. Calculate the mean and standard deviation of each data point for both data sets and graph them.
+5. The graph should contain two lines, if the models are operating the same they should match closely (this will also depend on the model's stochasticity and number of runs collected).
+6. Presence of differences in some graphs before others can help narrow down the source of the differences, assisting in resolving them.
+
+
+Below are some example before and after graphs, demonstrating how visible small bugs can be.
+
+Both the Python and FLAME GPU models were executed with 512 different random seeds, for a low number of steps (60-80).
+
+The number of agents is a major signal for the two models being equivalent.
+
+**The average +- the standard deviation of the number of living NB cells at each time step.**
+
+.. image:: NB_living_count(x512)_a.png
+  :width: 400
+  :alt: The graph shows the initial (bugged) case, whereby the two plots diverge early.
+
+.. image:: NB_living_count(x512)_b.png
+  :width: 400
+  :alt: The graph shows the final (fixed) case, whereby the two plots match closely.
+
+When the number of agents is divergent, there will normally be multiple other agent variables which influence agent birth/death which diverge too.
+
+**The average +- the standard deviation of the average of NB agent variable MYCN at each time step.**
+
+.. image:: NB_MYCN(x512)_a.png
+  :width: 400
+  :alt: The graph shows the initial (bugged) case, whereby the two plots diverge early.
+
+.. image:: NB_MYCN(x512)_b.png
+  :width: 400
+  :alt: The graph shows the final (fixed) case, whereby the two plots match closely.
+
+**The average +- the standard deviation of the average of NB agent variable telo_count at each time step.**
+
+.. image:: NB_telo_count(x512)_a.png
+  :width: 400
+  :alt: The graph shows the initial (bugged) case, whereby the two plots diverge early.
+
+.. image:: NB_telo_count(x512)_b.png
+  :width: 400
+  :alt: The graph shows the final (fixed) case, whereby the two plots match closely.
+
+Oftentimes, awareness of the order in which such agent variables diverge will narrow the source of the problem sufficiently. However, in some cases even knowing the agent function at fault is not precise enough to spot a subtle mistake. At this point we suggest adding counters, :ref:`using environment macro properties<EnvironmentMacroPropertyCounters>`, to track how frequently different code-paths are followed. This can lead to identifying the specific condition at fault.
+
+**The average +- the standard deviation of the proportion of NB agents which passed the 6th branch within the cell cycle agent function.**
+
+.. image:: NB_cycle_stages_6(x512)_a.png
+  :width: 400
+  :alt: The graph shows the initial (bugged) case, whereby the two plots diverge early.
+
+.. image:: NB_cycle_stages_6(x512)_b.png
+  :width: 400
+  :alt: The graph shows the final (fixed) case, whereby the two plots match closely.
+
+From the above graphs, it should be clear how logging can enable the tracing of differences between two models. Longer runs can be useful, as the impact of differences grows with time, however the key to identifying them tends to lie in spotting where/when they begin, or the period in which they reoccur. In this example, the Python model is not able to perform at the same scale as FLAME GPU, so runs used a small agent population of around 100 agents, for under 100 steps. The cells have a periodisation of 24 steps, so this is still sufficient, despite in practice this model executing with 100,000 or more agents for 3000+ steps.
+
+.. _EnvironmentMacroPropertyCounters:
+
+Environment Macro Property Counters
+-----------------------------------
+
+As mentioned in the previous example, it may be necessary to add counters to trace what proportion of agents are following each code-path.
+
+For example, given the below simplified agent function:
+
+.. tabs::
+
+  .. code-tab:: cuda CUDA C++
+
+    FLAMEGPU_AGENT_FUNCTION(NB_cell_cycle, flamegpu::MessageNone, flamegpu::MessageNone) {
+
+        unsigned int s_cycle = FLAMEGPU->getVariable<unsigned int>("cycle");
+        const int s_neighbours = FLAMEGPU->getVariable<int>("neighbours");
+        const int s_ID2 = FLAMEGPU->getVariable<int>("ID2");
+        const float s_cycdiff = FLAMEGPU->getVariable<float>("cycdiff");
+        const int s_MAPK_RAS = FLAMEGPU->getVariable<int>("MAPK_RAS");
+        const int s_MYCN = FLAMEGPU->getVariable<int>("MYCN");
+        const int s_p21 = FLAMEGPU->getVariable<int>("p21");
+        const int s_p27 = FLAMEGPU->getVariable<int>("p27");
+        const int s_CDC25C = FLAMEGPU->getVariable<int>("CDC25C");
+
+        const float P_cycle_nb = FLAMEGPU->environment.getProperty<float>("P_cycle_nb");
+        const bool dummy_ncycle = FLAMEGPU->random.uniform<float>() < P_cycle_nb ? true : false;
+
+        if (dummy_ncycle && s_neighbours <= 3) {
+            if (s_cycle < 12) {
+                if (s_cycle == 0) {
+                    if (FLAMEGPU->random.uniform<float>() < s_cycdiff) {
+                        if (((s_MAPK_RAS == 1 || s_MYCN == 1) && s_p21 == 0 && s_p27 == 0) || s_ID2 == 1) {
+                            s_cycle += 1;
+                        }
+                    }
+                } else if (((s_MAPK_RAS == 1 || s_MYCN == 1) && s_p21 == 0 && s_p27 == 0) || s_ID2 == 1) {
+                    s_cycle += 1;
+                    if (s_cycle >= 12 && ((s_MAPK_RAS == 1 && s_p21 == 0 && s_p27 == 0) || s_ID2 == 1) == 0) {
+                        s_cycle -= 1;
+                    }
+                }
+            } else if (s_cycle < 18) {
+                s_cycle += 1;
+                if (s_cycle >= 18 && s_CDC25C == 0) {
+                    s_cycle -= 1;
+                }
+            }
+        }
+        FLAMEGPU->setVariable<unsigned int>("cycle", s_cycle);
+    }
+
+It contains 9 ``if`` and ``else if`` statements, where agents may diverge.
+
+If we wish to explore how many agents take each path, first it's necessary to extend the model's definition.
+
+.. tabs::
+
+  .. code-tab:: cpp C++
+
+    flamegpu::ModelDescription model("Counters Logging Example");
+
+    ... // Existing model definition
+
+    // New components for counting
+    model.Environment().newMacroProperty<unsigned int, 9>("nb_cycle_counter");
+    model.Environment().newProperty<float, 9>("nb_cycle_counter");
+
+  .. code-tab:: py Python
+
+    model = pyflamegpu.ModelDescription ("Counters Logging Example")
+
+    ... # Existing model definition
+
+    # New components for counting
+    model.Environment().newMacroPropertyUInt("nb_cycle_counter", 9)
+    model.Environment().newPropertyArrayFloat("nb_cycle_counter", 9)
+
+A step function must also be added, to both copy the macro property to the environment property (to be logged), and to reset the macro property before the next step.
+
+.. tabs::
+
+  .. code-tab:: cpp C++
+
+    FLAMEGPU_STEP_FUNCTION(reset_counters) {
+        // Copy the data from macro environment property to environment property
+        const float NB_COUNT = static_cast<float>(FLAMEGPU->agent("NB").count());  // Cast to avoid integer division
+        auto nb_cycle_counter = FLAMEGPU->environment.getMacroProperty<unsigned int, 9>("nb_cycle_counter");
+        for (unsigned int i = 0; i < 9; ++i) {
+            // Normalise the data by dividing it by the number of agents
+            FLAMEGPU->environment.setProperty<float, 9>("nb_cycle_counter", i, nb_cycle_counter[i] / NB_COUNT);
+        }
+        // Reset the macro environment property
+        nb_cycle_counter.zero();
+    }
+
+    // Attach the step function to the model
+    model.newStepFunction(reset_counters);
+
+  .. code-tab:: py Python
+
+    class reset_counters(pyflamegpu.HostFunctionCallback):
+        def run(self,FLAMEGPU):
+            # Copy the data from macro environment property to environment property
+            NB_COUNT = FLAMEGPU.agent("NB").count()
+            nb_cycle_counter = FLAMEGPU.environment.getMacroPropertyUInt("nb_cycle_counter")
+            for i in range(9):
+                FLAMEGPU.environment.setPropertyArrayFloat("nb_cycle_counter", i, nb_cycle_counter[i] / NB_COUNT)
+            # Reset the macro environment property
+            nb_cycle_counter.zero()
+
+    # Attach the step function to the model
+    model.addStepFunctionCallback(reset_counters().__disown__())
+
+Now the agent function can be updated to increment the counters at each branch.
+
+.. tabs::
+
+  .. code-tab:: cuda CUDA C++
+
+    FLAMEGPU_AGENT_FUNCTION(NB_cell_cycle, flamegpu::MessageNone, flamegpu::MessageNone) {
+        auto nb_cycle_counter = FLAMEGPU->environment.getMacroProperty<unsigned int, 9>("nb_cycle_counter");
+
+        unsigned int s_cycle = FLAMEGPU->getVariable<unsigned int>("cycle");
+        const int s_neighbours = FLAMEGPU->getVariable<int>("neighbours");
+        const int s_ID2 = FLAMEGPU->getVariable<int>("ID2");
+        const float s_cycdiff = FLAMEGPU->getVariable<float>("cycdiff");
+        const int s_MAPK_RAS = FLAMEGPU->getVariable<int>("MAPK_RAS");
+        const int s_MYCN = FLAMEGPU->getVariable<int>("MYCN");
+        const int s_p21 = FLAMEGPU->getVariable<int>("p21");
+        const int s_p27 = FLAMEGPU->getVariable<int>("p27");
+        const int s_CDC25C = FLAMEGPU->getVariable<int>("CDC25C");
+
+        const float P_cycle_nb = FLAMEGPU->environment.getProperty<float>("P_cycle_nb");
+        const bool dummy_ncycle = FLAMEGPU->random.uniform<float>() < P_cycle_nb ? true : false;
+
+        if (dummy_ncycle && s_neighbours <= 3) {
+            ++nb_cycle_counter[0];
+            if (s_cycle < 12) {
+                ++nb_cycle_counter[1];
+                if (s_cycle == 0) {
+                    ++nb_cycle_counter[2];
+                    if (FLAMEGPU->random.uniform<float>() < s_cycdiff) {
+                        ++nb_cycle_counter[3];
+                        if (((s_MAPK_RAS == 1 || s_MYCN == 1) && s_p21 == 0 && s_p27 == 0) || s_ID2 == 1) {
+                            ++nb_cycle_counter[4];
+                            s_cycle += 1;
+                        }
+                    }
+                } else if (((s_MAPK_RAS == 1 || s_MYCN == 1) && s_p21 == 0 && s_p27 == 0) || s_ID2 == 1) {
+                    ++nb_cycle_counter[5];
+                    s_cycle += 1;
+                    if (s_cycle >= 12 && ((s_MAPK_RAS == 1 && s_p21 == 0 && s_p27 == 0) || s_ID2 == 1) == 0) {
+                        ++nb_cycle_counter[6];
+                        s_cycle -= 1;
+                    }
+                }
+            } else if (s_cycle < 18) {
+                ++nb_cycle_counter[7];
+                s_cycle += 1;
+                if (s_cycle >= 18 && s_CDC25C == 0) {
+                    ++nb_cycle_counter[8];
+                    s_cycle -= 1;
+                }
+            }
+        }
+        FLAMEGPU->setVariable<unsigned int>("cycle", s_cycle);
+    }
+
+With all this setup, the environment property ``nb_cycle_counter`` will contain the proportion of agents which took each branch at the end of every step. This value can be logged and graphed to compare with other implementations.
+
+
+Related Links
+-------------
+* User Guide: :ref:`Configuring Data to be Logged<Configuring Data to be Logged>`
+* User Guide: :ref:`Defining Environment Macro Properties<Define Macro Environmental Properties>`
diff --git a/src/guide/debugging-models/printf.rst b/src/guide/debugging-models/printf.rst
@@ -0,0 +1,76 @@
+.. _debugging_with_printf:
+
+printf
+======
+
+.. Workaround for nested markup https://docutils.sourceforge.io/FAQ.html#is-nested-inline-markup-possible
+.. |printf| replace:: ``printf()``
+.. _printf: https://cplusplus.com/reference/cstdio/printf
+.. |print| replace:: ``print()``
+.. _print: https://docs.python.org/3/library/functions.html#print
+
+Using |printf|_ (or |print|_ within python) is the usual first step towards debugging.
+
+C++ and Python share very similar syntax:
+
+.. tabs::
+
+  .. code-tab:: cpp C++
+
+    // Note C++ does not implicitly terminate the string with a line-break
+    printf("%f: %d\n", foo, bar);
+
+.. tabs::
+
+  .. code-tab:: py Python
+
+    # Note Python implicitly terminates the string with a line-break
+    print('%f: %d'%(foo, bar)) 
+
+These statements can be used in C++ and Python host functions respectively to print messages to console.
+
+Printing From Agent Functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Agent functions are written using C++, so require the ``printf()`` syntax. :ref:`Python agent functions<Python Agent Functions>` will not transform the python ``print()`` to the C++ equivalent.
+
+However, as agent functions execute for potentially millions of agents in parallel there are some additional things which should be considered:
+
+**Performance**
+
+Printing from agent functions causes data to be copied from the GPU in order for it to be printed. Large amounts of printing can have a large performance impact.
+
+**Order**
+
+Populations of CUDA threads do not execute in lockstep, or necessarily in order, so messages printed by different agents are likely to occur out of order. It may be useful to include an identifier of each message's source agent (e.g. using the ``%u`` format specifier with the value returned by ``FLAMEGPU->getID()``).
+
+**Volume**
+
+Attempting to ``printf()`` from millions of agents simultaneously can lead to programs crashing. There isn't a hard rule for how much is too much, but printing from agent functions should be limited to a subset of the agent population or performed with small agent populations.
+
+**Environment Macro Properties**
+
+Due to how ``printf()`` supports generic type arguments, the implicit cast, normally performed when reading macro environment properties, is not performed. As such, attempting to print an environment macro property directly will lead to an undefined value being printed. The below code provides examples that should and should not be used.
+
+.. tabs::
+
+  .. code-tab:: cuda CUDA C++
+
+    FLAMEGPU_AGENT_FUNCTION(agent_fn1, flamegpu::MessageNone, flamegpu::MessageNone) {
+        // Retrieve the macro property
+        auto foo = FLAMEGPU->environment.getMacroProperty<int, 4>("foo");
+
+        // These can be used to print a property
+        printf("%d\n", (int)foo[0]);
+        printf("%d\n", static_cast<int>(foo[1]));
+        const int bar = foo[2];
+        printf("%d\n", bar);
+
+
+        // This should not be used, it will compile but produce bad output
+        printf("%d\n", foo[3]);
+    }
+
+Related Links
+-------------
+* User Guide: :ref:`Python Agent Functions<Python Agent Functions>`