From 962aa53b5ba7f01d52ca9cf98cdf8bc07024adb9 Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Tue, 10 Sep 2024 09:10:44 -0400 Subject: [PATCH] WIP: don't simply toss extensions --- cumulus_etl/deid/ms-config.json | 2 +- cumulus_etl/deid/scrubber.py | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cumulus_etl/deid/ms-config.json b/cumulus_etl/deid/ms-config.json index 225df6c9..c15fe9d4 100644 --- a/cumulus_etl/deid/ms-config.json +++ b/cumulus_etl/deid/ms-config.json @@ -30,7 +30,7 @@ {"path": "Patient.extension('http://hl7.org/fhir/us/core/StructureDefinition/us-core-sex-for-clinical-use')", "method": "keep"}, {"path": "Patient.extension('http://open.epic.com/FHIR/StructureDefinition/extension/sex-for-clinical-use')", "method": "keep"}, // Epic has used this pre-final-spec URL {"path": "nodesByName('modifierExtension')", "method": "keep"}, // keep these so we can ignore resources with modifiers we don't understand - {"path": "nodesByType('Extension')", "method": "redact"}, // drop all unknown extensions + {"path": "nodesByType('Extension')", "method": "keep"}, // keep all extensions here; unknown ones are stripped down to just their url in scrubber.py // Elements that might be embedded and kept elsewhere -- redact pieces of the whole {"path": "nodesByType('Attachment').title", "method": "redact"}, diff --git a/cumulus_etl/deid/scrubber.py b/cumulus_etl/deid/scrubber.py index c13037f8..6ef79578 100644 --- a/cumulus_etl/deid/scrubber.py +++ b/cumulus_etl/deid/scrubber.py @@ -64,7 +64,7 @@ def scrub_resource(self, node: dict, scrub_attachments: bool = True) -> bool: node.get("resourceType"), "root", node, scrub_attachments=scrub_attachments ) except SkipResource as exc: - logging.warning("Ignoring resource of type %s: %s", node.__class__.__name__, exc) + logging.warning("Ignoring resource: %s", exc) return False except ValueError as exc: logging.warning("Could not parse value: %s", exc) @@ -121,6 +121,7 @@ def _scrub_single_value( """Examines one single property of a node""" # For
now, just manually run each operation. If this grows further, we can abstract it more. self._check_ids(node, key, value) + self._check_extensions(resource_type, node, key, value) self._check_modifier_extensions(key, value) self._check_security(node_path, node, key, value) self._check_text(node, key, value) @@ -139,11 +140,32 @@ def _scrub_single_value( # ############################################################################### + def _check_extensions(self, resource_type: str, node: dict, key: str, value: Any) -> None: + """If there are any unrecognized extensions, strip them down to just their url""" + if key == "extension" and isinstance(value, dict): + known_extensions = [ + # {"path": "Patient.extension('http://hl7.org/fhir/Profile/us-core#ethnicity')", "method": "keep"}, // Old DSTU1 URL, still out there in the wild: https://www.hl7.org/fhir/DSTU1/us-core.html + # {"path": "Patient.extension('http://hl7.org/fhir/Profile/us-core#race')", "method": "keep"}, // Old DSTU1 URL, still out there in the wild: https://www.hl7.org/fhir/DSTU1/us-core.html + # {"path": "Patient.extension('http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex')", "method": "keep"}, + # {"path": "Patient.extension('http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity')", "method": "keep"}, + # {"path": "Patient.extension('http://hl7.org/fhir/us/core/StructureDefinition/us-core-genderIdentity')", "method": "keep"}, + # {"path": "Patient.extension('http://hl7.org/fhir/us/core/StructureDefinition/us-core-race')", "method": "keep"}, + "http://hl7.org/fhir/us/core/StructureDefinition/us-core-sex-for-clinical-use", + "http://open.epic.com/FHIR/StructureDefinition/extension/sex-for-clinical-use", # Epic has used this pre-final-spec URL + ] + url = value.get("url") + if url not in known_extensions: + value.clear() # get rid of any other keys + value["url"] = url # just keep the url, to track that it existed + @staticmethod def _check_modifier_extensions(key: str, value: Any) -> None: 
"""If there's any unrecognized modifierExtensions, raise a SkipResource exception""" if key == "modifierExtension" and isinstance(value, dict): known_extensions = [ + # These NLP extensions are generated by ctakesclient's text2fhir code. + # While we don't anticipate ingesting any resources using these extensions + # (and we don't currently generate them ourselves), we might in the future. "http://fhir-registry.smarthealthit.org/StructureDefinition/nlp-polarity", "http://fhir-registry.smarthealthit.org/StructureDefinition/nlp-source", ]