Add post-rule business logic
jswelling committed Dec 19, 2024
1 parent e1e5a5d commit f7f151b
Showing 1 changed file with 67 additions and 15 deletions.
82 changes: 67 additions & 15 deletions src/soft_assay_rules/local_rule_tester.py
@@ -61,15 +61,15 @@ def lookup_entity_json(uuid):
     raise ValueError(f"No cached JSON for {uuid}")
 
 
-def wrapped_lookup_json(uuid):
+def wrapped_lookup_entity_json(uuid):
     """Like lookup_entity_json but drop the app_ctx"""
     return lookup_entity_json(uuid)[1]
 
 
 def lookup_metadata_json(uuid):
     """
-    Check the directory of cached entity json files for the given example, looking for
-    filenames of the form "entity_{uuid}_SENNET.json" or ..."_HUBMAP.json". Pick the
+    Check the directory of cached metadata json files for the given example, looking for
+    filenames of the form "metadata_{uuid}_SENNET.json" or ..."_HUBMAP.json". Pick the
     first of those found, and infer the uuid's app_ctx from the corresponding string.
     Return a tuple containing that app_ctx as a string and the JSON dict loaded from
     the file. If no such file is found, ValueError is raised.
@@ -81,17 +81,48 @@ def lookup_metadata_json(uuid):
         if Path(fname).exists():
             with open(fname) as infile:
                 json_dict = json.load(infile)
-            LOGGER.debug(f"HERE: {json_dict.keys()}")
             return app_ctx, json_dict
     raise ValueError(f"No cached metadata JSON for {uuid}")
 
 
 def wrapped_lookup_metadata_json(uuid):
-    """Like lookup_entity_json but drop the app_ctx"""
+    """Like lookup_metadata_json but drop the app_ctx"""
    return lookup_metadata_json(uuid)[1]
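As a concrete illustration of the naming convention the lookup docstrings describe, the cached path presumably composes a directory, a prefix, the uuid, and the app context. A minimal sketch only: the real build_cached_json_fname is defined elsewhere in this module, and the captured_metadata_json directory name is an assumption by analogy with the captured_rulechain_json directory used below.

    # Hypothetical stand-in for build_cached_json_fname; the directory name
    # and defaults are assumptions, not the module's actual implementation.
    from pathlib import Path

    def sketch_cached_json_fname(uuid, app_ctx,
                                 dir="captured_metadata_json",
                                 prefix="metadata"):
        # e.g. captured_metadata_json/metadata_<uuid>_SENNET.json
        return str(Path(dir) / f"{prefix}_{uuid}_{app_ctx}.json")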


-def calculate_assay_info(metadata: dict) -> dict:
+def lookup_rulechain_json(uuid):
+    """
+    Check the directory of cached rulechain json files for the given example, looking for
+    filenames of the form "rulechain_{uuid}_SENNET.json" or ..."_HUBMAP.json". Pick the
+    first of those found, and infer the uuid's app_ctx from the corresponding string.
+    Return a tuple containing that app_ctx as a string and the JSON dict loaded from
+    the file. If no such file is found, ValueError is raised.
+    """
+    for app_ctx in ["SENNET", "HUBMAP"]:
+        fname = build_cached_json_fname(uuid, app_ctx,
+                                        dir="captured_rulechain_json",
+                                        prefix="rulechain")
+        if Path(fname).exists():
+            with open(fname) as infile:
+                json_dict = json.load(infile)
+            return app_ctx, json_dict
+    raise ValueError(f"No cached rulechain JSON for {uuid}")
+
+
+def wrapped_lookup_rulechain_json(uuid):
+    """Like lookup_rulechain_json but drop the app_ctx"""
+    return lookup_rulechain_json(uuid)[1]
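For orientation, a hedged usage sketch of the lookup pair; the uuid below is a placeholder, not a real cached example, and would have to match a file in captured_rulechain_json:

    # The bare lookup returns (app_ctx, json_dict); the wrapped variant drops
    # app_ctx for callers that only want the JSON payload.
    uuid = "some-captured-uuid"  # placeholder
    app_ctx, chain_json = lookup_rulechain_json(uuid)  # e.g. ("HUBMAP", {...})
    assert wrapped_lookup_rulechain_json(uuid) == chain_json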


+def post_rule_transform(rule_output: dict,
+                        source_is_human: bool) -> dict:
+    rslt = rule_output.copy()
+    if "contains-pii" in rslt:
+        rslt["contains-pii"] = rslt["contains-pii"] and source_is_human
+    return rslt
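The post-rule business logic is small enough to show exhaustively: contains-pii is ANDed with source_is_human, and every other key passes through unchanged. A sketch with hypothetical rule output:

    # Hypothetical rule output; only the contains-pii key is touched.
    post_rule_transform({"assaytype": "x", "contains-pii": True}, True)
    # -> {"assaytype": "x", "contains-pii": True}
    post_rule_transform({"assaytype": "x", "contains-pii": True}, False)
    # -> {"assaytype": "x", "contains-pii": False}  (non-human source masks PII)
    post_rule_transform({"assaytype": "x"}, False)
    # -> {"assaytype": "x"}  (no flag present, nothing to mask)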


+def calculate_assay_info(metadata: dict, source_is_human: bool) -> dict:
     # TODO: this function should really get imported from ingest-api
     if not rule_chain:
         initialize_rule_chain()
@@ -100,13 +131,28 @@ def calculate_assay_info(metadata: dict) -> dict:
         if value.isdigit():
             metadata[key] = int(value)
     try:
-        rslt = rule_chain.apply(metadata)
+        rslt = post_rule_transform(rule_chain.apply(metadata),
+                                   source_is_human)
         # TODO: check that rslt has the expected parts
         return rslt
     except NoMatchException:
         return {}
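A usage sketch, assuming the rule chain has already been initialized and using invented metadata keys:

    # Digit-like string values are coerced to int before the rules run;
    # {} comes back when no rule matches (NoMatchException is swallowed).
    metadata = {"assay_type": "some-assay", "barcode_length": "16"}
    rslt = calculate_assay_info(metadata, source_is_human=False)
    # rslt is either the transformed rule output (contains-pii can never be
    # True here, since source_is_human is False) or {}.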


+def smart_equality(val1, val2):
+    """
+    Provide a more robust equality test for json blob terms. Compare lists
+    as sets, etc.
+    """
+    if isinstance(val1, (list, tuple)):
+        # accept either sequence type on the right as well, so that an
+        # equal list/tuple pair does not fall through to False
+        if isinstance(val2, (list, tuple)):
+            return set(val1) == set(val2)
+        else:
+            return False
+    else:
+        return val1 == val2
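Behavior at a glance, assuming hashable list elements (the sequence branch compares through set(), so order and duplicates are ignored):

    assert smart_equality(["a", "b"], ["b", "a"])       # order-insensitive
    assert smart_equality(["a", "a", "b"], ["b", "a"])  # duplicates collapse
    assert smart_equality(3, 3)                         # scalars: plain ==
    assert not smart_equality(["a"], "a")               # sequence vs scalar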


 def main() -> None:
     for argfile in sys.argv[1:]:
         if argfile.endswith('~'):
@@ -124,36 +170,42 @@ def main() -> None:
                 uuid = row["uuid"]
                 app_ctx, json_dict = lookup_entity_json(uuid)
                 LOGGER.info(f"app_ctx for {uuid} is {app_ctx}")
-                is_human = source_is_human([uuid], wrapped_lookup_json)
+                is_human = source_is_human([uuid], wrapped_lookup_entity_json)
                 LOGGER.info(f"source_is_human for [{uuid}] returns {is_human}")
                 payload = wrapped_lookup_metadata_json(uuid)
                 LOGGER.debug(f"PAYLOAD: \n" + pformat(payload))
-                rslt = calculate_assay_info(payload)
+                rslt = calculate_assay_info(payload, is_human)
+                cached_rslt = wrapped_lookup_rulechain_json(uuid)
+                for elt in rslt:
+                    val = rslt[elt]
+                    cached_val = cached_rslt.get(elt)
+                    if not smart_equality(val, cached_val):
+                        LOGGER.warning(f"DISCORDANT for {uuid} {elt}:"
+                                       f" {val} != {cached_val}")
                 print_rslt(argfile, idx, payload, rslt)
             else:
                 #print(arg_df)
                 for idx, row in arg_df.iterrows():
                     payload = {col: row[col] for col in arg_df.columns}
                     if "parent_sample_id" in payload:
                         # This sample is new enough to have a column of parent
                         # samples, so we can check source type
                         parent_sample_ids = payload["parent_sample_id"].split(",")
                         parent_sample_ids = [elt.strip() for elt in parent_sample_ids]
-                        is_human = source_is_human(parent_sample_ids, wrapped_lookup_json)
+                        is_human = source_is_human(parent_sample_ids, wrapped_lookup_entity_json)
                         LOGGER.info(f"source_is_human {parent_sample_ids} returns {is_human}")
                     else:
                         is_human = True  # legacy data is all human
                     payload["source_is_human"] = is_human
-                    rslt = calculate_assay_info(payload)
+                    rslt = calculate_assay_info(payload, is_human)
                     print_rslt(argfile, idx, payload, rslt)
         elif argfile.endswith('.json'):
             with open(argfile) as jsonfile:
                 payload = json.load(jsonfile)
             # This reloaded payload was captured from a valid assayclassifier
             # version, so the payload should be complete- no added elements
-            # needed.
+            # needed. But we have no way to tell if the source was human,
+            # so assume that it is human.
             LOGGER.debug(f"RELOADED PAYLOAD: \n" + pformat(payload))
-            rslt = calculate_assay_info(payload)
+            rslt = calculate_assay_info(payload, True)
             print_rslt(argfile, 0, payload, rslt)
         else:
             raise RuntimeError(f"Arg file {argfile} is of an"