Updates following review.

MoseleyS · Sep 27, 2023 · f7db44d · f7db44d
1 parent dd9202a
commit f7db44d
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 27 deletions.
diff --git a/doc/source/extended_documentation/categorical/build_a_decision_tree.rst b/doc/source/extended_documentation/categorical/build_a_decision_tree.rst
@@ -1,7 +1,7 @@
 **Decision trees**
 
 Decision trees use diagnostic fields to diagnose a suitable category to represent
-the weather conditions, for a weather symbol for example. The tree is comprised
+the weather conditions, such as a weather symbol. The tree is comprised
 of a series of interconnected decision nodes, leaf nodes and a stand-alone meta node.
 At each decision node one or multiple forecast diagnostics are compared to
 predefined threshold values. The decision node has an if_true and if_false path on
@@ -28,7 +28,7 @@ forecast, proceed to the if_true node, else move to the if_false node.
 **Encoding a decision tree**
 
 The meta node provides the name to use for the metadata of the resulting cube and
-can be anywhere in the decision tree.
+can be anywhere in the decision tree, but must have "meta" as its key.
 This becomes the cube name and is also used for two attributes that describe the
 categorical data: **<name>** and **<name>_meaning**::
 
@@ -126,6 +126,9 @@ accessed with this key contains the following.
   - **is_unreachable** (bool): True for a leaf which needs to be included in the metadata
     but cannot be reached.
 
+The modal category also relies on the severity of symbols generally increasing with
+the category value, so that in the case of ties, the more severe category is selected.
+
 Every decision tree must have a starting node, and this is taken as the first
 node defined in the dictionary, or second if the first node is the meta node.
 

diff --git a/improver/categorical/decision_tree.py b/improver/categorical/decision_tree.py
@@ -167,6 +167,10 @@ def __repr__(self) -> str:
     @staticmethod
     def _is_decision_node(key: str, query: Dict) -> bool:
         """
+        Determine whether a given node is a decision node.
+        The meta node has a key of "meta" and leaf nodes have a query key of "leaf";
+        everything else is a decision node.
+
         Args:
             key:
                 Decision name ("meta" indicates a non-decision node)
@@ -500,7 +504,7 @@ def remove_optional_missing(self, optional_node_data_missing: List[str]):
             target = self.queries[missing]["if_diagnostic_missing"]
             alternative = self.queries[missing][target]
 
-            for node, query in self.queries.items():
+            for query in self.queries.values():
                 if query.get("if_true", None) == missing:
                     query["if_true"] = alternative
                 if query.get("if_false", None) == missing:
@@ -556,7 +560,12 @@ def find_all_routes(
 
     def create_categorical_cube(self, cubes: Union[List[Cube], CubeList]) -> Cube:
         """
-        Create an empty categorical cube
+        Create an empty categorical cube taking the cube name and categorical attribute names
+        from the meta node, and categorical attribute values from the leaf nodes.
+        The reference time is the latest from the set of input cubes and the optional record
+        run attribute is a combination from all source cubes. Everything else comes from the
+        template cube, which is the first cube with time bounds, or the first cube if none
+        have time bounds.
 
         Args:
             cubes:

diff --git a/improver/categorical/modal_code.py b/improver/categorical/modal_code.py
@@ -28,7 +28,7 @@
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
-"""Module containing a plugin to calculate the modal weather code in a period."""
+"""Module containing a plugin to calculate the modal category in a period."""
 
 from typing import Dict, Optional
 
@@ -52,27 +52,27 @@
 
 
 class ModalCategory(BasePlugin):
-    """Plugin that returns the modal code over the period spanned by the
+    """Plugin that returns the modal category over the period spanned by the
     input data. In cases of a tie in the mode values, scipy returns the smaller
     value. The opposite is desirable in this case as the significance /
-    importance of the weather codes generally increases with the value. To
-    achieve this the codes are subtracted from an arbitrarily larger
+    importance of the categories generally increases with the value. To
+    achieve this the categories are subtracted from an arbitrarily larger
     number prior to calculating the mode, and this operation reversed in the
     final output.
 
-    If there are many different codes for a single point over the time
+    If there are many different categories for a single point over the time
     spanned by the input cubes it may be that the returned mode is not robust.
-    Given the preference to return more significant codes explained above,
-    a 12 hour period with 12 different codes, one of which is thunder, will
-    return a thunder code to describe the whole period. This is likely not a
+    Given the preference to return more significant categories explained above,
+    a 12 hour period with 12 different categories, one of which is severe, will
+    return that severe category to describe the whole period. This is likely not a
     good representation. In these cases grouping is used to try and select
-    a suitable weather code (e.g. a rain shower if the codes include a mix of
+    a suitable category (e.g. a rain shower if the codes include a mix of
     rain showers and dynamic rain) by providing a more robust mode. The lowest
     number (least significant) member of the group is returned as the code.
     Use of the least significant member reflects the lower certainty in the
     forecasts.
 
-    Where there are different weather codes available for night and day, the
+    Where there are different categories available for night and day, the
     modal code returned is always a day code, regardless of the times
     covered by the input files.
     """
@@ -96,7 +96,7 @@ def __init__(
                 a space-separated string.
             record_run_attr:
                 Name of attribute used to record models and cycles used in
-                constructing the weather symbols.
+                constructing the categories.
         """
         self.aggregator_instance = Aggregator("mode", self.mode_aggregator)
         self.decision_tree = decision_tree
@@ -115,12 +115,12 @@ def __init__(
 
     def _unify_day_and_night(self, cube: Cube):
         """Remove distinction between day and night codes so they can each
-        contribute when calculating the modal code. The cube of weather
-        codes is modified in place with all night codes made into their
+        contribute when calculating the modal code. The cube of categorical data
+        is modified in place with all night codes made into their
         daytime equivalents.
 
         Args:
-            A cube of weather codes.
+            A cube of categorical data.
         """
         for day, night in self.day_night_map.items():
             cube.data[cube.data == night] = day
@@ -136,17 +136,17 @@ def _code_groups(self) -> Dict:
 
     def _group_codes(self, modal: Cube, cube: Cube):
         """In instances where the mode returned is not significant, i.e. the
-        weather code chosen occurs infrequently in the period, the codes can be
+        category chosen occurs infrequently in the period, the codes can be
         grouped to yield a more definitive period code. Given the uncertainty,
-        the least significant weather type (lowest number in a group that is
+        the least significant category (lowest number in a group that is
         found in the data) is used to replace the other data values that belong
         to that group prior to recalculating the modal code.
 
         The modal cube is modified in place.
 
         Args:
             modal:
-                The modal weather code cube which contains UNSET_CODE_INDICATOR
+                The modal categorical cube which contains UNSET_CODE_INDICATOR
                 values that need to be replaced with a more definitive period
                 code.
             cube:
@@ -218,18 +218,18 @@ def _set_blended_times(cube: Cube) -> None:
             cube.replace_coord(new_coord)
 
     def process(self, cubes: CubeList) -> Cube:
-        """Calculate the modal weather code, with handling for edge cases.
+        """Calculate the modal categorical code, with handling for edge cases.
 
         Args:
             cubes:
-                A list of weather code cubes at different times. A modal
+                A list of categorical cubes at different times. A modal
                 code will be calculated over the time coordinate to return
-                the most comon code, which is taken to be the best
+                the most common code, which is taken to be the best
                 representation of the whole period.
 
         Returns:
-            A single weather code cube with time bounds that span those of
-            the input weather code cubes.
+            A single categorical cube with time bounds that span those of
+            the input categorical cubes.
         """
         # Store the information for the record_run attribute on the cubes.
         if self.record_run_attr and self.model_id_attr:

diff --git a/improver/cli/categorical.py b/improver/cli/categorical.py
@@ -53,7 +53,8 @@ def process(
             A cubelist containing the diagnostics required for the
             decision tree, these at co-incident times.
         decision_tree (dict):
-            A JSON file containing a decision tree definition.
+            A JSON file containing a decision tree definition. Full information on decision
+            trees can be found in improver.categorical.decision_tree.
         model_id_attr (str):
             Name of attribute recording source models that should be
             inherited by the output cube. The source models are expected as