Merge pull request #2235 from microbiomedata/calibration-change

Move `has_calibration` from `workflow_execution_set` to `data_generation_set`
microbiomedata · Nov 1, 2024 · 51ac45e · 51ac45e
2 parents 5aacd28 + b8e4ecc
commit 51ac45e
Show file tree

Hide file tree

Showing 7 changed files with 356 additions and 98 deletions.
diff --git a/nmdc_schema/migrators/migrator_from_11_0_3_to_11_1_0.py b/nmdc_schema/migrators/migrator_from_11_0_3_to_11_1_0.py
@@ -0,0 +1,33 @@
+from nmdc_schema.migrators.migrator_base import MigratorBase
+from nmdc_schema.migrators.partials.migrator_from_11_0_3_to_11_1_0 import (
+    get_migrator_classes,
+)
+
+
+class Migrator(MigratorBase):
+    r"""
+    Top-level migrator that takes a database from schema version 11.0.3 to 11.1.0.
+
+    The work itself is delegated to the partial migrators listed by `get_migrator_classes`.
+
+    Reference: https://pypi.org/project/nmdc-schema/#history
+    """
+
+    _from_version = "11.0.3"
+    _to_version = "11.1.0"
+
+    def upgrade(self) -> None:
+        r"""
+        Runs every partial migrator, one after another, in the order in which `get_migrator_classes` lists them.
+
+        Each partial migrator receives this migrator's adapter and logger.
+        """
+
+        partial_classes = get_migrator_classes()
+        num_migrators = len(partial_classes)
+        for idx, migrator_class in enumerate(partial_classes):
+            self._run_partial(idx, num_migrators, migrator_class)
+
+    def _run_partial(self, idx, num_migrators, migrator_class) -> None:
+        r"""
+        Logs progress for a single partial migrator, then instantiates it and calls its `upgrade` method.
+
+        `idx` is the zero-based position of `migrator_class` among the `num_migrators` partial migrators.
+        """
+
+        self.logger.info(f"Running migrator {idx + 1} of {num_migrators}")
+        self.logger.debug(
+            f"Migrating from {migrator_class.get_origin_version()} "
+            f"to {migrator_class.get_destination_version()}"
+        )
+        migrator_class(adapter=self.adapter, logger=self.logger).upgrade()
diff --git a/nmdc_schema/migrators/partials/migrator_from_11_0_3_to_11_1_0/__init__.py b/nmdc_schema/migrators/partials/migrator_from_11_0_3_to_11_1_0/__init__.py
@@ -0,0 +1,25 @@
+from typing import List, Type
+
+from nmdc_schema.migrators.migrator_base import MigratorBase
+from nmdc_schema.migrators.partials.migrator_from_11_0_3_to_11_1_0 import (
+    migrator_from_11_0_3_to_11_1_0_part_1
+)
+
+def get_migrator_classes() -> List[Type[MigratorBase]]:
+    r"""
+    Lists the partial migrator classes that make up this migration, ordered so that calling their
+    `upgrade` methods from first to last matches the order in which they were designed to be run.
+
+    >>> migrator_classes = get_migrator_classes()
+    >>> type(migrator_classes) is list and len(migrator_classes) > 0  # the function returns a list
+    True
+    >>> from inspect import isclass
+    >>> all(isclass(c) for c in migrator_classes)  # each list item is a class
+    True
+    >>> all(callable(getattr(c, "upgrade")) for c in migrator_classes)  # each class has an `upgrade` method
+    True
+    """
+
+    ordered_classes = [
+        migrator_from_11_0_3_to_11_1_0_part_1.Migrator,
+    ]
+    return ordered_classes
diff --git a/nmdc_schema/migrators/partials/migrator_from_11_0_3_to_11_1_0/migrator_from_11_0_3_to_11_1_0_part_1.py b/nmdc_schema/migrators/partials/migrator_from_11_0_3_to_11_1_0/migrator_from_11_0_3_to_11_1_0_part_1.py
@@ -0,0 +1,215 @@
+from nmdc_schema.migrators.migrator_base import MigratorBase
+import re
+
+
+class Migrator(MigratorBase):
+    r"""
+    Migrates a database between two schemas.
+
+    This migrator removes the `has_calibration` field from all documents that represent an instance of
+    the `NomAnalysis` or `MetabolomicsAnalysis` class, and moves the information to the `has_calibration`
+    slot of the corresponding `MassSpectrometry` document.
+
+    The creation of this migrator was in response to this issue:
+    https://github.com/microbiomedata/nmdc-schema/issues/2139
+
+    """
+
+    _from_version = "11.0.3"
+    _to_version = "11.1.0.part_1"
+
+    def upgrade(self) -> None:
+        r"""
+        Migrates the database from conforming to the original schema, to conforming to the new schema.
+
+        The migration makes two passes over the database:
+
+        1. Each `workflow_execution_set` document is passed to `store_and_remove_calibrations`.
+        2. Each `data_generation_set` document is passed to `update_data_gen_calibration`.
+
+        The mapping that connects the two passes (`self.calibration_mappings`) is reset at the start of
+        every call, so running `upgrade` works even when no workflow execution references a calibration,
+        and mappings from an earlier run of the same instance are not reused.
+
+        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
+        >>> db = {
+        ...   'workflow_execution_set': [
+        ...     {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1', 'type': 'nmdc:MetabolomicsAnalysis'},
+        ...     {'id': 'nmdc:wfx2', 'has_calibration': 'false', 'was_informed_by': 'nmdc:dgen2', 'type': 'nmdc:NomAnalysis'},
+        ...     {'id': 'nmdc:wfx3', 'was_informed_by': 'nmdc:dgen3', 'type': 'nmdc:MetabolomicsAnalysis'}
+        ...   ],
+        ...   'data_generation_set': [
+        ...     {'id': 'nmdc:dgen1'},
+        ...     {'id': 'nmdc:dgen2'},
+        ...     {'id': 'nmdc:dgen3'}
+        ...   ],
+        ...   'data_object_set': [
+        ...     {'id': 'nmdc:dobj-13-abc123'}
+        ...   ],
+        ...   'calibration_set': [
+        ...     {'id': 'nmdc:calib1', 'calibration_object': 'nmdc:dobj-13-abc123'}
+        ...   ]
+        ... }
+        >>> a = DictionaryAdapter(database=db)
+        >>> m = Migrator(adapter=a)
+        >>> m.upgrade()
+        >>> any('has_calibration' in doc for doc in db['workflow_execution_set'])  # Calibrations removed from workflow
+        False
+        >>> db['data_generation_set'][0]  # Calibration moved to data generation
+        {'id': 'nmdc:dgen1', 'has_calibration': 'nmdc:calib1'}
+        >>> db['data_generation_set'][1]  # No calibration added when value was 'false'
+        {'id': 'nmdc:dgen2'}
+        >>> db['data_generation_set'][2]  # No calibration added when the workflow had none
+        {'id': 'nmdc:dgen3'}
+        >>> db = {
+        ...   'workflow_execution_set': [
+        ...     {'id': 'nmdc:wfx1', 'was_informed_by': 'nmdc:dgen1', 'type': 'nmdc:NomAnalysis'}
+        ...   ],
+        ...   'data_generation_set': [
+        ...     {'id': 'nmdc:dgen1'}
+        ...   ]
+        ... }
+        >>> Migrator(adapter=DictionaryAdapter(database=db)).upgrade()  # No calibration anywhere in the database
+        >>> db['data_generation_set']
+        [{'id': 'nmdc:dgen1'}]
+        """
+
+        # Start from an empty mapping: the second pass reads this attribute unconditionally, and the
+        # first pass only adds to it when a workflow execution references a calibration.
+        self.calibration_mappings = {}
+
+        self.adapter.process_each_document(
+            collection_name="workflow_execution_set",
+            pipeline=[self.store_and_remove_calibrations],
+        )
+        self.adapter.process_each_document(
+            collection_name="data_generation_set",
+            pipeline=[self.update_data_gen_calibration],
+        )
+
+    def check_has_calibration(self, has_calibration_value) -> bool:
+        r"""
+        Checks whether a value has the format of a data object id (i.e. is a string that starts with 'nmdc:dobj').
+
+        Only the prefix is examined. Values that are not strings (e.g. `None` or a boolean) are reported
+        as invalid rather than causing an exception.
+
+        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
+        >>> db = {}
+        >>> a = DictionaryAdapter(database=db)
+        >>> m = Migrator(adapter=a)
+        >>> m.check_has_calibration('nmdc:dobj-13-abc123')  # Valid format
+        True
+        >>> m.check_has_calibration('false')  # Invalid format
+        False
+        >>> m.check_has_calibration('nmdc:something-else')  # Invalid format
+        False
+        >>> m.check_has_calibration(None)  # Not a string
+        False
+        """
+
+        return isinstance(has_calibration_value, str) and has_calibration_value.startswith("nmdc:dobj")
+
+    def check_for_valid_data_object(self, data_obj_id) -> bool:
+        r"""
+        Reports whether the `data_object_set` collection contains a document whose `id` is `data_obj_id`.
+
+        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
+        >>> db = {
+        ...   'data_object_set': [
+        ...     {'id': 'nmdc:dobj-13-abc123'},
+        ...     {'id': 'nmdc:dobj-13-def456'}
+        ...   ]
+        ... }
+        >>> a = DictionaryAdapter(database=db)
+        >>> m = Migrator(adapter=a)
+        >>> m.check_for_valid_data_object('nmdc:dobj-13-abc123')  # Exists in database
+        True
+        >>> m.check_for_valid_data_object('nmdc:dobj-13-nonexistent')  # Doesn't exist
+        False
+        """
+
+        find_document = self.adapter.get_document_having_value_in_field
+        return find_document(
+            collection_name="data_object_set",
+            field_name="id",
+            value=data_obj_id,
+        ) is not None
+
+    def store_and_remove_calibrations(self, workflow_execution_doc) -> dict:
+        r"""
+        Removes the `has_calibration` field from a `WorkflowExecution` document and, when that field referenced
+        a data object, records which calibration belongs to the corresponding `DataGeneration` document.
+
+        The recorded mappings (data generation id -> calibration id) are kept in `self.calibration_mappings`,
+        which `update_data_gen_calibration` reads afterwards.
+
+        Returns the same document, without its `has_calibration` field. A document that lacks the field is
+        returned unchanged.
+
+        Raises a `ValueError` when:
+        - the `has_calibration` value is neither the string 'false' (in any letter case) nor a data object id;
+        - the referenced data object does not exist in `data_object_set`;
+        - the document's `was_informed_by` value does not match any document in `data_generation_set`; or
+        - no document in `calibration_set` has the referenced data object as its `calibration_object`.
+
+        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
+        >>> db = {
+        ...   'workflow_execution_set': [
+        ...     {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1'},
+        ...     {'id': 'nmdc:wfx2', 'has_calibration': 'false', 'was_informed_by': 'nmdc:dgen2'}
+        ...   ],
+        ...   'data_generation_set': [
+        ...     {'id': 'nmdc:dgen1'},
+        ...     {'id': 'nmdc:dgen2'}
+        ...   ],
+        ...   'data_object_set': [
+        ...     {'id': 'nmdc:dobj-13-abc123'}
+        ...   ],
+        ...   'calibration_set': [
+        ...     {'id': 'nmdc:calib1', 'calibration_object': 'nmdc:dobj-13-abc123'}
+        ...   ]
+        ... }
+        >>> a = DictionaryAdapter(database=db)
+        >>> m = Migrator(adapter=a)
+        >>> workflow_execution_doc = {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1'}
+        >>> m.store_and_remove_calibrations(workflow_execution_doc)
+        {'id': 'nmdc:wfx1', 'was_informed_by': 'nmdc:dgen1'}
+        >>> workflow_execution_doc = {'id': 'nmdc:wfx2', 'has_calibration': 'false', 'was_informed_by': 'nmdc:dgen2'}
+        >>> m.store_and_remove_calibrations(workflow_execution_doc)
+        {'id': 'nmdc:wfx2', 'was_informed_by': 'nmdc:dgen2'}
+        >>> workflow_execution_doc = {'id': 'nmdc:wfx3', 'has_calibration': 'invalid', 'was_informed_by': 'nmdc:dgen3'}
+        >>> m.store_and_remove_calibrations(workflow_execution_doc)  # doctest: +IGNORE_EXCEPTION_DETAIL
+        Traceback (most recent call last):
+        ValueError: The 'has_calibration' value (invalid) in document (nmdc:wfx3) is not recognized
+        >>> workflow_execution_doc = {'id': 'nmdc:wfx4', 'has_calibration': False, 'was_informed_by': 'nmdc:dgen4'}
+        >>> m.store_and_remove_calibrations(workflow_execution_doc)  # doctest: +IGNORE_EXCEPTION_DETAIL
+        Traceback (most recent call last):
+        ValueError: The 'has_calibration' value (False) in document (nmdc:wfx4) is not recognized
+        """
+
+        # Make sure the mapping exists after any call, even one that contributes nothing to it.
+        if not hasattr(self, "calibration_mappings"):
+            self.calibration_mappings = {}
+
+        if "has_calibration" not in workflow_execution_doc:
+            return workflow_execution_doc
+
+        has_calibration_data_obj_id = workflow_execution_doc["has_calibration"]
+        is_string = isinstance(has_calibration_data_obj_id, str)
+
+        # If has_calibration has a string value of false, remove the slot altogether from the document.
+        if is_string and has_calibration_data_obj_id.lower() == "false":
+            workflow_execution_doc.pop("has_calibration")
+            return workflow_execution_doc
+
+        # Anything else must look like a data object id. The type check comes first because
+        # a non-string value can be neither "false" nor an id.
+        if not is_string or not self.check_has_calibration(has_calibration_data_obj_id):
+            raise ValueError(f"The 'has_calibration' value ({has_calibration_data_obj_id}) in document "
+                             f"({workflow_execution_doc['id']}) is not recognized")
+
+        if not self.check_for_valid_data_object(has_calibration_data_obj_id):
+            raise ValueError(f"The 'has_calibration' value ({has_calibration_data_obj_id}) in document "
+                             f"({workflow_execution_doc['id']}) is not a valid data object. The data object does not exist")
+
+        data_gen_id = workflow_execution_doc["was_informed_by"]
+        data_gen_doc = self.adapter.get_document_having_value_in_field(
+            collection_name="data_generation_set", field_name="id", value=data_gen_id)
+        if data_gen_doc is None:
+            raise ValueError(f"The 'was_informed_by' value ({data_gen_id}) in document "
+                             f"({workflow_execution_doc['id']}) does not match any document in 'data_generation_set'")
+
+        calibration_doc = self.adapter.get_document_having_value_in_field(
+            collection_name="calibration_set", field_name="calibration_object", value=has_calibration_data_obj_id)
+        if calibration_doc is None:
+            raise ValueError(f"The 'has_calibration' value ({has_calibration_data_obj_id}) in document "
+                             f"({workflow_execution_doc['id']}) is not the 'calibration_object' of any document "
+                             f"in 'calibration_set'")
+
+        # Record the mapping first, and only then remove the slot, so that a failure above leaves
+        # the document untouched.
+        self.calibration_mappings[data_gen_doc["id"]] = calibration_doc["id"]
+        workflow_execution_doc.pop("has_calibration")
+
+        return workflow_execution_doc
+
+    def update_data_gen_calibration(self, data_gen_doc) -> dict:
+        r"""
+        Adds a `has_calibration` field to a `DataGeneration` document when a calibration was recorded for it.
+
+        The calibration id is looked up, by the document's `id`, in `self.calibration_mappings` (the mapping
+        filled by `store_and_remove_calibrations`). A document with no recorded calibration is returned
+        unchanged, as is every document when no mapping has been created on this instance.
+
+        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
+        >>> db = {
+        ...   'workflow_execution_set': [
+        ...     {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1', 'type': 'nmdc:MetabolomicsAnalysis'}
+        ...   ],
+        ...   'data_generation_set': [
+        ...     {'id': 'nmdc:dgen1'},
+        ...     {'id': 'nmdc:dgen2'}  # doc without corresponding calibration
+        ...   ],
+        ...   'data_object_set': [
+        ...     {'id': 'nmdc:dobj-13-abc123'}
+        ...   ],
+        ...   'calibration_set': [
+        ...     {'id': 'nmdc:calib1', 'calibration_object': 'nmdc:dobj-13-abc123'}
+        ...   ]
+        ... }
+        >>> a = DictionaryAdapter(database=db)
+        >>> m = Migrator(adapter=a)
+        >>> # First store calibrations
+        >>> workflow_execution_doc = {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1', 'type': 'nmdc:MetabolomicsAnalysis'}
+        >>> _ = m.store_and_remove_calibrations(workflow_execution_doc)  # Store the calibrations first
+        >>> # Then test update_data_gen_calibration
+        >>> m.update_data_gen_calibration({'id': 'nmdc:dgen1'})  # doc with corresponding calibration
+        {'id': 'nmdc:dgen1', 'has_calibration': 'nmdc:calib1'}
+        >>> # Test document without calibration
+        >>> m.update_data_gen_calibration({'id': 'nmdc:dgen2'})  # doc without corresponding calibration
+        {'id': 'nmdc:dgen2'}
+        >>> # Test a migrator on which no calibration was ever stored
+        >>> Migrator(adapter=a).update_data_gen_calibration({'id': 'nmdc:dgen1'})
+        {'id': 'nmdc:dgen1'}
+        """
+
+        # The attribute only exists once a calibration has been stored on this instance;
+        # treat its absence as "nothing to apply" instead of failing with an AttributeError.
+        calibration_mappings = getattr(self, "calibration_mappings", {})
+
+        data_gen_id = data_gen_doc["id"]
+        if data_gen_id in calibration_mappings:
+            data_gen_doc["has_calibration"] = calibration_mappings[data_gen_id]
+        return data_gen_doc
diff --git a/src/data/invalid/MetabolomicsAnalysis-invalid-has-slot-used.yaml b/src/data/invalid/MetabolomicsAnalysis-invalid-has-slot-used.yaml
@@ -2,7 +2,6 @@ id: nmdc:wfmb-11-547rwq94.1
 ended_at_time: '2021-09-15T10:13:20+00:00'
 execution_resource: NERSC-Cori
 git_url: https://example.org/WorkflowExecutionActivity
-has_calibration: calibration with 0.01% phosphoric acid
 was_informed_by: nmdc:omprc-11-d8a8da
 started_at_time: '2021-08-05T14:48:51+00:00'
 type: nmdc:MetabolomicsAnalysis

diff --git a/src/data/valid/Database-interleaved.yaml b/src/data/valid/Database-interleaved.yaml
@@ -3687,7 +3687,6 @@ workflow_execution_set:
     git_url: https://github.com/microbiomedata/metabolomics_analysis/releases/tag/v0.5.0
     was_informed_by: nmdc:omprc-11-di84md
     started_at_time: '2023-08-02T09:00:00Z'
-    has_calibration: nmdc:calib-l2k-9d6j3
     has_metabolite_identifications:
       - type: nmdc:MetaboliteIdentification
         highest_similarity_score: 0.88
@@ -3901,7 +3900,6 @@ workflow_execution_set:
     git_url: https://github.com/microbiomedata/nom_analysis/releases/tag/v0.3.2
     was_informed_by: nmdc:dgms-12-dfa74b
     started_at_time: '2023-08-08T09:30:00Z'
-    has_calibration: nmdc:calib-99-v9w6
 data_generation_set:
   - id: nmdc:dgms-99-zUCd5N
     type: nmdc:MassSpectrometry