From 09842446edad914d19d411b58a83832cec9876ca Mon Sep 17 00:00:00 2001 From: Emily Cheng Date: Wed, 20 Nov 2024 16:42:43 -0800 Subject: [PATCH] Updates temporalrules to be able to swap order of operations (#27) * Try out swapping order * Make temporal error mesage more verbose (explicitly say the second condition) * Docs, lint, reorganize changelog properly * Put refactor after update * Fix error message order * Update error message --- CHANGELOG.md | 20 +++---- ...data-quality-rule-definition-guidelines.md | 56 ++++++++++++------- nacc_form_validator/errors.py | 8 ++- nacc_form_validator/nacc_validator.py | 51 ++++++++++++----- tests/test_nacc_validator_datastore.py | 28 ++++++++-- 5 files changed, 112 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f47f503..d7b54e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,36 +4,36 @@ Documentation of release versions of `nacc-form-validator` ## 0.4.0 -* Updates `_validate_compute_gds` to remove partial GDS score calculation -* Updates `_check_subschema_valid` to accept an optional record parameter, defaults to the document - used for rules that require the previous record -* Updates `compare_with` to support the `abs` operator -* Updates `compatibility` error message to be more verbose -* Refactors the tests to be more modularized so that they're more manageable -* Refactors logic for `compare_values` by moving it to its own utility method and allows comparing to null values * Adds `get_previous_record` method to grab previous record from Datastore, which can grab the previous record or the previous record where a specific field is non-empty - * Updates `_validate_temporalrules` to also support the `ignore_empty` parameter * Adds support for comparing against the previous record in `compare_with` * Adds new rule `compare_age` to handle rules that need to compare ages relative to a date * Adds custom operator `count_exact` to `json_logic.py` +* Updates `_check_subschema_valid` to accept an optional record parameter, defaults to the document - used for rules that require the previous record +* Updates `_validate_compute_gds` to remove partial GDS score calculation +* Updates `_validate_compare_with` to support the `abs` operator +* Updates `_validate_compatibility` error message to be more verbose +* Updates `_validate_temporalrules` to also support `ignore_empty` and `swap_order` parameters +* Refactors the tests to be more modularized so that they're more manageable +* Refactors logic for `compare_values` by moving it to its own utility method and allows comparing to null values * Fixes issue where `datastore` was not being set for the temp validator in `_check_subschema_valid`, causing nested conditions with previous records to not evaluate correctly ## 0.3.0 +* Adds `_check_with_rxnorm` function to check whether a given Drug ID is valid RXCUI code * Updates `_validate_compare_with` to allow adjustments to be another field, and for base values to be hardcoded values * Updates json_logic `less` function to handle None * Updates `_validate_temporalrules` to iterate on multiple fields for `previous` and `current` clauses, remove `orderby` attribute * Updates `_check_with_gds` function to `_validate_compute_gds` and update GDS score validation -* Adds `_check_with_rxnorm` function to check whether a given Drug ID is valid RXCUI code * Fixes a bug in min/max validation wrt current_year ## 0.2.0 +* Adds documentation/user guides and testing +* Adds this CHANGELOG * Updates `_validate_compatibility` to iterate on multiple fields for `then` and `else` clauses, similar to how the `if` clause was handling it * Updates `cast_record` to set any missing fields to `None` * Updates `pants_version` in `pants.toml` to `2.22.0` (latest) to support building on macOS * Renamed `validator` to `nacc_form_validator` for a more unique namespace, and imported `QualityCheck` into `__init__.py` for easier access -* Adds documentation/user guides and testing -* Adds this CHANGELOG ## 0.1.1 and older diff --git a/docs/data-quality-rule-definition-guidelines.md b/docs/data-quality-rule-definition-guidelines.md index ac1dfab..b228241 100644 --- a/docs/data-quality-rule-definition-guidelines.md +++ b/docs/data-quality-rule-definition-guidelines.md @@ -854,12 +854,17 @@ The validator also has custom operators in addition to the ones provided by json ### temporalrules Used to specify the list of checks to be performed against the previous visit for the same participant. -* Each constraint specifies `previous` and `current` attributes. If conditions specified under `previous` subschema satisfied by the previous visit record, then the current visit record must satisfy the conditions specified under `current` subschema. -* Each `previous/current` attribute can have several fields which need to be satisifed, with the optional `*_op` attribute specifying the boolean operation in which to compare the different fields. For example, if `prev_op = or`, then as long as _any_ of the fields satsify their schema, the `current` attribute will be evaluated. The default `*_op` is `and`. -* Each constraint also allows for an optional `ignore_empty` option, which can take a string or list of strings denoting fields that cannot be empty. When grabbing the previous record, the validator will grab the first previous record where all specified fields are non-empty. + +Each constraint specifies `previous` and `current` attributes. If conditions specified under the `previous` subschema are satisfied by the previous visit record, then the current visit record must satisfy the conditions specified under `current` subschema. + +Each constraint also has optional fields that can be set: + +* Each `previous/current` attribute can have several fields which need to be satisifed, so an optional `*_op` attribute can be used to specify the boolean operation in which to compare the different fields. For example, if `prev_op = or`, then as long as _any_ of the fields satsify their schema, the `current` attribute will be evaluated. The default `*_op` is `and`. +* `ignore_empty`: Takes a string or list of strings denoting fields that cannot be empty. When grabbing the previous record, the validator will grab the first previous record where _all_ specified fields are non-empty. * If no record is found that satisfies all fields being non-empty, the validator will _ignore_ the check (e.g. pass through validation without errors) +* `swap_order`: If set to True, it will swap the order of operations, evluating the `current` subschema first, then the `previous` subschema -*To validate `temporalrules`, the validator should have a `Datastore` instance which will be used to retrieve the previous visit record(s) for the participant.* +> **NOTE**: To validate `temporalrules`, the validator should have a `Datastore` instance which will be used to retrieve the previous visit record(s) for the participant. The rule definition for `temporalrules` should follow the following format: @@ -867,25 +872,34 @@ The rule definition for `temporalrules` should follow the following format: { "": { "temporalrules": [ - { - "previous": { - "": "subschema to be satisfied for the previous record" - }, - "current": { - "": "subschema to be satisfied for the current record" - } + { + "previous": { + "": "subschema to be satisfied for the previous record" }, - { - "ignore_empty": ["", ""], - "prev_op": "or", - "previous": { - "": "subschema to be satisfied for the previous record", - "": "subschema to be satisfied for the previous record" - }, - "current": { - "": "subschema to be satisfied for the current record" - } + "current": { + "": "subschema to be satisfied for the current record, will be evaluated if the previous subschema is satisfied" + } + }, { + "ignore_empty": ["", ""], + "prev_op": "or", + "previous": { + "": "subschema to be satisfied for the previous record", + "": "subschema to be satisfied for the previous record" + }, + "current": { + "": "subschema to be satisfied for the current record, will be evaluated if either of the previous subschemas are satisfied" } + }, + { + "swap_order": true, + "previous": { + "": "subschema to be satisfied for the previous record, will be evaluated if the current subschema is satisfied" + }, + "current": { + "": "subschema to be satisfied for the current record" + } + } + ] } } diff --git a/nacc_form_validator/errors.py b/nacc_form_validator/errors.py index 9e8a8e2..8589840 100644 --- a/nacc_form_validator/errors.py +++ b/nacc_form_validator/errors.py @@ -42,6 +42,7 @@ class ErrorDefs: DATE_CONVERSION = ErrorDefinition(0x3001, 'compare_age') COMPARE_AGE = ErrorDefinition(0x3002, 'compare_age') COMPARE_AGE_INVALID_COMPARISON = ErrorDefinition(0x3003, 'compare_age') + TEMPORAL_SWAPPED = ErrorDefinition(0x3004, 'temporalrules') class CustomErrorHandler(BasicErrorHandler): @@ -88,7 +89,8 @@ def __set_custom_error_codes(self): 0x1009: "{1} for if {2} else {3} - compatibility rule no: {0}", 0x2000: - "{1} in current visit for {2} in previous visit - temporal rule no: {0}", + "{1} for if {2} in previous visit then {3} " + + "in current visit - temporal rule no: {0}", 0x2001: "primary key variable {0} not set in current visit data", 0x2002: @@ -119,6 +121,9 @@ def __set_custom_error_codes(self): "input value {0} doesn't satisfy the condition: {1}", 0x3003: "Error in comparing {0} to age at {1} ({2}): {3}", + 0x3004: + "{1} for if {3} in current visit then {2} " + + "in previous visit - temporal rule no: {0}", } self.messages.update(custom_errors) @@ -177,3 +182,4 @@ class SchemaDefs: BIRTH_DAY = 'birth_day' BIRTH_YEAR = 'birth_year' COMPARE_TO = "compare_to" + SWAP_ORDER = "swap_order" diff --git a/nacc_form_validator/nacc_validator.py b/nacc_form_validator/nacc_validator.py index 05d21ef..885ffed 100755 --- a/nacc_form_validator/nacc_validator.py +++ b/nacc_form_validator/nacc_validator.py @@ -693,6 +693,10 @@ def _validate_temporalrules(self, temporalrules: List[Mapping], field: str, ] }, "required": False + }, + 'swap_order': { + 'type': 'boolean', + 'required': False } } } @@ -700,6 +704,7 @@ def _validate_temporalrules(self, temporalrules: List[Mapping], field: str, """ rule_no = -1 for temporalrule in temporalrules: + swap_order = temporalrule.get(SchemaDefs.SWAP_ORDER, False) ignore_empty_fields = temporalrule.get(SchemaDefs.IGNORE_EMPTY, None) prev_ins = self.get_previous_record( @@ -723,25 +728,45 @@ def _validate_temporalrules(self, temporalrules: List[Mapping], field: str, prev_conds = temporalrule[SchemaDefs.PREVIOUS] curr_conds = temporalrule[SchemaDefs.CURRENT] - errors = None - # Check if conditions for the previous visit is satisfied - valid, _ = self._check_subschema_valid(prev_conds, - prev_operator, - record=prev_ins) + # default order of operations is to first check if conditions for + # previous visit is satisfied, then current visit, but + # occasionally we need to swap that order + error, valid = None, False + error_def = ErrorDefs.TEMPORAL - # If not satisfied, continue to next rule - if not valid: - continue + if not swap_order: + # Check if conditions for the previous visit is satisfied + valid, _ = self._check_subschema_valid(prev_conds, + prev_operator, + record=prev_ins) + + # If not satisfied, continue to next rule + if not valid: + continue + + # If satisfied, validate the current visit + valid, errors = self._check_subschema_valid( + curr_conds, curr_operator) + else: + # do the other way; check if condition for current visit is satisfied + error_def = ErrorDefs.TEMPORAL_SWAPPED + valid, _ = self._check_subschema_valid(curr_conds, + curr_operator) + + # if not satisfied, continue to next rule + if not valid: + continue - # If satisfied, validate the current visit - valid, errors = self._check_subschema_valid( - curr_conds, curr_operator) + # if satisfied, validate previous visit + valid, errors = self._check_subschema_valid(prev_conds, + prev_operator, + record=prev_ins) # Cross visit validation failed - report errors if not valid and errors: for error in errors.items(): - self._error(field, ErrorDefs.TEMPORAL, rule_no, str(error), - prev_conds) + self._error(field, error_def, rule_no, str(error), + prev_conds, curr_conds) def _validate_logic(self, logic: Dict[str, Any], field: str, value: object): diff --git a/tests/test_nacc_validator_datastore.py b/tests/test_nacc_validator_datastore.py index cbadbb3..a0a4c99 100644 --- a/tests/test_nacc_validator_datastore.py +++ b/tests/test_nacc_validator_datastore.py @@ -122,12 +122,24 @@ def test_temporal_check(schema): """ Temporal test check - this is basically a more involved version of the example provided in docs/index.md """ nv = create_nacc_validator_with_ds(schema, 'patient_id', 'visit_num') - assert nv.validate({'patient_id': 'PatientID1', - 'visit_num': 4, 'taxes': 1}) - assert not nv.validate( - {'patient_id': 'PatientID1', 'visit_num': 4, 'taxes': 8}) + assert nv.validate({'patient_id': 'PatientID1','visit_num': 4, 'taxes': 1}) + + assert not nv.validate({'patient_id': 'PatientID1', 'visit_num': 4, 'taxes': 8}) + assert nv.errors == {'taxes': [ + "('taxes', ['unallowed value 8']) for if {'taxes': {'allowed': [0]}} in previous visit then {'taxes': {'forbidden': [8]}} in current visit - temporal rule no: 0"]} + +def test_temporal_check_swap_order(schema): + """ Test temporal check when the order of evaluation is swapped """ + schema['taxes']['temporalrules'][0]['swap_order'] = True + nv = create_nacc_validator_with_ds(schema, 'patient_id', 'visit_num') + + assert nv.validate({'patient_id': 'PatientID1','visit_num': 4, 'taxes': 1}) + assert nv.validate({'patient_id': 'PatientID1', 'visit_num': 4, 'taxes': 8}) # since 8 fails the current condition, validation skipped + + nv.reset_record_cache() + assert not nv.validate({'patient_id': 'PatientID1', 'visit_num': 2, 'taxes': 1}) assert nv.errors == {'taxes': [ - "('taxes', ['unallowed value 8']) in current visit for {'taxes': {'allowed': [0]}} in previous visit - temporal rule no: 0"]} + "('taxes', ['unallowed value 8']) for if {'taxes': {'forbidden': [8]}} in current visit then {'taxes': {'allowed': [0]}} in previous visit - temporal rule no: 0"]} def test_temporal_check_no_prev_visit(schema): """ Temporal test check when there are no previous visits (e.g. before visit 0) """ @@ -292,7 +304,11 @@ def test_temporal_check_with_nested_compare_with_previous_record(): assert nv.validate({'patient_id': 'PatientID1', 'visit_num': 4, 'birthyr': 1950}) assert not nv.validate({'patient_id': 'PatientID1', 'visit_num': 4, 'birthyr': 1951}) - assert nv.errors == {'birthyr': ['(\'birthyr\', ["input value doesn\'t satisfy the condition birthyr == birthyr (previous record)"]) in current visit for {\'birthyr\': {\'forbidden\': [-1]}} in previous visit - temporal rule no: 0']} + assert nv.errors == {'birthyr': [ + '(\'birthyr\', ["input value doesn\'t satisfy the condition birthyr == birthyr (previous record)"]) for ' + + 'if {\'birthyr\': {\'forbidden\': [-1]}} in previous visit ' + + 'then {\'birthyr\': {\'compare_with\': {\'comparator\': \'==\', \'base\': \'birthyr\', \'previous_record\': True}}} in current visit ' + + '- temporal rule no: 0']} def test_check_with_rxnorm(): """ Test checking drugID is a valid RXCUI """