Add scoring function rule (#33)
* Add score_variables function

* Add documentation

* Update errors
echeng06 authored Jan 29, 2025
1 parent a01240a commit 81199b9
Showing 6 changed files with 326 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

Documentation of release versions of `nacc-form-validator`

## 0.5.0

* Adds new `function` rule `_score_variables` to handle scoring-related rules (e.g. C2F plausibility)

## 0.4.1

* Updates JSON logic's `soft_equals` and util's `compare_values` to compare two floats for equality with a precision tolerance of 0.01
129 changes: 129 additions & 0 deletions docs/data-quality-rule-definition-guidelines.md
@@ -15,6 +15,7 @@
- [check\_adcid](#check_adcid)
- [compute\_gds](#compute_gds)
- [rxnorm](#rxnorm)
- [score\_variables](#score_variables)

## Introduction

@@ -1016,3 +1017,131 @@ This function uses the `check_with` rule from Cerberus. The rule definition should be in the following format:
}
```
> **NOTE**: To validate `rxnorm`, the validator should have a `Datastore` instance that implements the `is_valid_rxcui` function, which checks whether the given rxnormid value is a valid RXCUI code.

### score_variables

Custom rule that counts the number of correct or incorrect variables, based on a mode and a scoring key.

This validation is implemented using the `function` rule with the custom `score_variables` function in the NACCValidator. The rule definition should be in the following format:

```json
{
"<score_variable>": {
"function": {
"name": "score_variables",
"args": {
"mode": "'correct' or 'incorrect'",
"scoring_key": {
"var1": "correct_value_1",
"var2": "correct_value_2",
"...etc"
},
"logic": {
"...same as logic formula"
}
}
}
}
}
```

* `mode`: Either `correct` or `incorrect`; if `correct`, counts the number of _correct_ variables, and if `incorrect`, counts the number of _incorrect_ variables
* `scoring_key`: Dict representing the scoring key; maps each variable involved in the scoring function to its correct value
* `logic`: Logic to perform on the `__total_sum` once calculated; same schema as the [logic](#logic) rule defined earlier

This function looks at all variables defined in the `scoring_key` and counts the number that are correct if the mode is `correct` or incorrect if the mode is `incorrect`. It stores this result in a special variable called `__total_sum` (note the double underscore for uniqueness) that can then be used inside the `logic` formula to compare against.

If any of the fields in `scoring_key` are missing or invalid, then validation is skipped and this rule "passes" by default. Otherwise validation succeeds if `__total_sum` satisfies the given formula, else it fails.
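The counting step can be sketched in isolation (a minimal illustration under the rules described above, not the validator's actual implementation; `count_scored` is a hypothetical helper name):

```python
from typing import Any, Dict, Optional


def count_scored(record: Dict[str, Any], scoring_key: Dict[str, Any],
                 mode: str) -> Optional[int]:
    """Return the number of correct/incorrect variables, or None to skip."""
    total_sum = 0
    for key, correct_value in scoring_key.items():
        if record.get(key) is None:
            return None  # missing or blank variable: skip validation entirely
        matches = record[key] == correct_value
        # count matches in 'correct' mode, mismatches in 'incorrect' mode
        if matches == (mode == 'correct'):
            total_sum += 1
    return total_sum
```

With the key `{'val1': 1, 'val2': 2}`, the record `{'val1': 1, 'val2': 0}` yields 1 in `correct` mode and 1 in `incorrect` mode, while a record missing `val2` yields `None` (validation skipped).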

**Example:**

`total` must match the total number of correct variables defined under `scoring_key`.

<table>
<tr>
<th>YAML Rule Definition</th>
<th>JSON Rule Definition</th>
<th>When Validating</th>
</tr>
<tr>
<td style="vertical-align:top;">
<pre><code>total:
type: integer
required: true
function:
name: score_variables
args:
mode: correct
scoring_key:
val1: 1
val2: 2
val3: 3
logic:
formula:
"==":
- var: total
- var: __total_sum
val1:
type: integer
nullable: true
val2:
type: integer
nullable: true
val3:
type: integer
nullable: true
</code></pre>
</td>
<td style="vertical-align:top;">
<pre><code>{
"total": {
"type": "integer",
"required": true,
"function": {
"name": "score_variables",
"args": {
"mode": "correct",
"scoring_key": {
"val1": 1,
"val2": 2,
"val3": 3
},
"logic": {
"formula": {
"==": [
{
"var": "total"
},
{
"var": "__total_sum"
}
]
}
}
}
}
},
"val1": {
"type": "integer",
"nullable": true
},
"val2": {
"type": "integer",
"nullable": true
},
"val3": {
"type": "integer",
"nullable": true
}
}
</code></pre>
</td>
<td style="vertical-align:top;">
<pre><code>
{'total': 3, 'val1': 1, 'val2': 2, 'val3': 3} # passes
{'total': 5} # passes
{'total': 10, 'val1': 1, 'val2': 2, 'val3': 3} # fails
</code></pre>
</td>
</tr>
</table>
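The outcomes in the table above follow mechanically from the rule: skip when a scored variable is missing, otherwise count matches against the scoring key and apply the `==` formula. A standalone sketch (`would_pass` is a hypothetical helper, not the validator API):

```python
from typing import Any, Dict

# scoring key from the example rule definition above
SCORING_KEY = {'val1': 1, 'val2': 2, 'val3': 3}


def would_pass(record: Dict[str, Any]) -> bool:
    # rule passes by default if any scoring-key variable is missing or blank
    if any(record.get(k) is None for k in SCORING_KEY):
        return True
    # __total_sum: number of variables matching the scoring key
    total_sum = sum(1 for k, v in SCORING_KEY.items() if record[k] == v)
    # logic formula: total == __total_sum
    return record['total'] == total_sum


# {'total': 3, 'val1': 1, 'val2': 2, 'val3': 3}  -> True  (3 correct)
# {'total': 5}                                   -> True  (skipped)
# {'total': 10, 'val1': 1, 'val2': 2, 'val3': 3} -> False
```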
2 changes: 1 addition & 1 deletion nacc_form_validator/BUILD
@@ -7,7 +7,7 @@ python_distribution(
sdist=True,
provides=python_artifact(
name="nacc-form-validator",
version="0.4.1",
version="0.5.0",
description="The NACC form validator package",
author="NACC",
author_email="nacchelp@uw.edu",
3 changes: 3 additions & 0 deletions nacc_form_validator/errors.py
@@ -48,6 +48,7 @@ class ErrorDefs:
TEMPORAL_SWAPPED = ErrorDefinition(0x3004, 'temporalrules')
ADCID_NOT_MATCH = ErrorDefinition(0x3005, "function")
ADCID_NOT_VALID = ErrorDefinition(0x3006, "function")
SCORING_INVALID = ErrorDefinition(0x3007, "function")


class CustomErrorHandler(BasicErrorHandler):
@@ -133,6 +134,8 @@ def __set_custom_error_codes(self):
"Provided ADCID {0} does not match your center's ADCID",
0x3006:
"Provided ADCID {0} is not in the valid list of ADCIDs",
0x3007:
"Provided value {0} does not match the expected score"
}

self.messages.update(custom_errors)
61 changes: 61 additions & 0 deletions nacc_form_validator/nacc_validator.py
@@ -1,6 +1,7 @@
"""Module for defining NACC specific data validation rules (extending cerberus
library)."""

import copy
import logging
from datetime import datetime as dt
from typing import Any, Dict, List, Mapping, Optional, Tuple
@@ -1153,3 +1154,63 @@ def _check_adcid(self, field: str, value: int, own: bool = True):
self._error(
field, ErrorDefs.ADCID_NOT_MATCH
if own else ErrorDefs.ADCID_NOT_VALID, value)

def _score_variables(self, field: str, value: int, mode: str,
scoring_key: Dict[str, Any],
logic: Dict[str, Any]) -> None:
"""Sums all the variables that are correct or incorrect depending on
the mode based on scoring_key. Stores the result a special variable
called '__total_sum' (double underscore to ensure uniqueness) and runs
the defined logic formula against it in a subschema. `logic` field MUST
specify __total_sum.
If any of the keys in the scoring_key are missing/blank/non-integer value,
this validation is skipped.
'function': {
'name': 'score_variables',
'args': {
'mode': 'correct' or 'incorrect',
'scoring_key': {
'key1': correct_val,
'key2': correct_val,
...
},
'logic': {
... same schema as logic ...
}
}
}
Args:
field: Name of the scored field
value: Value of the scored field
mode: Whether to count all correct or all incorrect variables
scoring_key: Scoring key for all variables
logic: Logic formula to run result on
"""
total_sum = 0
for key, correct_value in scoring_key.items():
if self.document.get(key, None) is None:
log.warning(
f"Field {key} not present or blank, skipping validation")
return

correct = self.document[key] == correct_value
if (correct and mode == 'correct') or \
(not correct and mode == 'incorrect'):
total_sum += 1

condition = {field: {'nullable': True, 'logic': logic}}

record = copy.deepcopy(self.document)
record['__total_sum'] = total_sum

valid, errors = self._check_subschema_valid(all_conditions=condition,
operator='AND',
record=record)

# Logic formula failed, report errors
if errors:
for error in errors.items():
self._error(field, ErrorDefs.SCORING_INVALID, value)
128 changes: 128 additions & 0 deletions tests/test_rules_score_variables.py
@@ -0,0 +1,128 @@
"""Tests the custom function _score_variables rule."""
import pytest


@pytest.fixture(scope='function')
def base_schema():
"""Base schema for tests to use."""
return {
'total': {
'type': 'integer',
'required': True,
'function': {
'name': 'score_variables',
'args': {
'mode': 'correct',
'scoring_key': {
'val1': 1,
'val2': 2,
'val3': 3
},
'logic': {
'formula': {
'==': [{
'var': 'total'
}, {
'var': '__total_sum'
}]
}
}
}
}
},
'val1': {
'type': 'integer',
'nullable': True
},
'val2': {
'type': 'integer',
'nullable': True
},
'val3': {
'type': 'integer',
'nullable': True
}
}


def test_score_variables_total_correct(create_nacc_validator, base_schema):
"""Test score variables with simple comparison against the total sum
counting correct variables."""
nv = create_nacc_validator(base_schema)

# valid cases
assert nv.validate({'total': 3, 'val1': 1, 'val2': 2, 'val3': 3})
assert nv.validate({'total': 1, 'val1': 5, 'val2': 2, 'val3': -7})
assert nv.validate({'total': 0, 'val1': 5, 'val2': 0, 'val3': -7})

    # valid; validation is skipped if any variable in the scoring key is null
assert nv.validate({'total': 5})
assert nv.validate({'total': 5, 'val1': 5, 'val3': 4})

# invalid cases
assert not nv.validate({'total': 10, 'val1': 1, 'val2': 2, 'val3': 3})
assert nv.errors == {'total': ["Provided value 10 does not match the expected score"]}
assert not nv.validate({'total': 9, 'val1': 5, 'val2': 2, 'val3': -7})
assert nv.errors == {'total': ["Provided value 9 does not match the expected score"]}
assert not nv.validate({'total': 8, 'val1': 5, 'val2': 0, 'val3': -7})
assert nv.errors == {'total': ["Provided value 8 does not match the expected score"]}


def test_score_variables_total_incorrect(create_nacc_validator, base_schema):
"""Test score variables with simple comparison against the total sum
counting incorrect variables."""
base_schema['total']['function']['args']['mode'] = 'incorrect'
nv = create_nacc_validator(base_schema)

# valid cases
assert nv.validate({'total': 3, 'val1': 3, 'val2': 1, 'val3': 2})
assert nv.validate({'total': 1, 'val1': 3, 'val2': 2, 'val3': 3})
assert nv.validate({'total': 0, 'val1': 1, 'val2': 2, 'val3': 3})

    # valid; validation is skipped if any variable in the scoring key is null
assert nv.validate({'total': 5})
assert nv.validate({'total': 5, 'val1': 5, 'val3': 4})

# invalid cases
assert not nv.validate({'total': 3, 'val1': 1, 'val2': 2, 'val3': 3})
assert nv.errors == {'total': ["Provided value 3 does not match the expected score"]}
assert not nv.validate({'total': 1, 'val1': 5, 'val2': 2, 'val3': -7})
assert nv.errors == {'total': ["Provided value 1 does not match the expected score"]}
assert not nv.validate({'total': 0, 'val1': 5, 'val2': 0, 'val3': -7})
assert nv.errors == {'total': ["Provided value 0 does not match the expected score"]}


def test_score_variables_total_incorrect_subtraction(create_nacc_validator,
base_schema):
"""Test score variables with comparison involving more involved logic
formula counting incorrect values."""
base_schema['total']['function']['args']['mode'] = 'incorrect'
base_schema['total']['function']['args']['logic'] = {
'formula': {
'==': [{
'var': 'total'
}, {
'-': [5, {
'var': '__total_sum'
}]
}]
}
}
nv = create_nacc_validator(base_schema)

# valid cases
assert nv.validate({'total': 2, 'val1': 3, 'val2': 1, 'val3': 2})
assert nv.validate({'total': 4, 'val1': 3, 'val2': 2, 'val3': 3})
assert nv.validate({'total': 5, 'val1': 1, 'val2': 2, 'val3': 3})

    # valid; validation is skipped if any variable in the scoring key is null
assert nv.validate({'total': 5})
assert nv.validate({'total': 5, 'val1': 5, 'val3': 4})

# invalid cases
assert not nv.validate({'total': 3, 'val1': 1, 'val2': 2, 'val3': 3})
assert nv.errors == {'total': ["Provided value 3 does not match the expected score"]}
assert not nv.validate({'total': 1, 'val1': 5, 'val2': 2, 'val3': -7})
assert nv.errors == {'total': ["Provided value 1 does not match the expected score"]}
assert not nv.validate({'total': 0, 'val1': 5, 'val2': 0, 'val3': -7})
assert nv.errors == {'total': ["Provided value 0 does not match the expected score"]}
