ACED-IDP · quinnwai · Oct 20, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/gen3_tracker/meta/dataframer.py b/gen3_tracker/meta/dataframer.py
@@ -477,27 +477,38 @@ def select_coding(self, resource):
 
     def flattened_research_subjects(self) -> Generator[dict, None, None]:
 
-        # get all observations with a Observation.subject=Patient, mapped from patient ID to observation
+        # setup
         resource_type = "ResearchSubject"
-        conditions_by_patient_id = get_conditions_by_subject(self, "Patient")
+        patient_type = "Patient"
+        cursor = self.connect()
+
+        # grab associated conditions + observations via patient ID at once
+        conditions_by_patient_id = get_conditions_by_subject(self, patient_type)
+        observations_by_patient_id = get_observations_by_focus(self, patient_type)
 
         # get all ResearchSubjects
-        cursor = self.connect()
         cursor.execute(
             "SELECT * FROM resources where resource_type = ?", (resource_type,)
         )
 
-        # get research subject and associated .subject patient
+        # add in new fields to existing research subject
         for _, _, raw_research_subject in cursor.fetchall():
             research_subject = json.loads(raw_research_subject)
             flat_research_subject = SimplifiedResource.build(
                 resource=research_subject
             ).simplified
 
             # return with .subject (ie Patient) fields
-            patient = get_subject(self, research_subject)
+            _, patient = get_subject(self, research_subject)
             flat_research_subject.update(patient)
 
+            # add patient observation values
+            flat_research_subject = update_with_observations(
+                flat_research_subject,
+                patient["patient_id"],
+                observations_by_patient_id,
+            )
+
             # get condition code, eg enrollment diagnosis
             if patient["patient_id"] in conditions_by_patient_id:
                 conditions = conditions_by_patient_id[patient["patient_id"]]
@@ -528,7 +539,7 @@ def flattened_medication_administrations(self) -> Generator[dict, None, None]:
                 resource=medication_administration
             ).simplified
 
-            patient = get_subject(self, medication_administration)
+            _, patient = get_subject(self, medication_administration)
             flat_medication_administration.update(patient)
 
             yield flat_medication_administration
@@ -561,16 +572,17 @@ def flattened_document_reference(
         flat_doc_ref = SimplifiedResource.build(resource=doc_ref).simplified
 
         # extract the corresponding .subject and append its fields
-        flat_doc_ref.update(get_subject(self, doc_ref))
 
-        # populate observation data associated with the document reference document
-        if doc_ref["id"] in observation_by_focus_id:
-            associated_observations = observation_by_focus_id[doc_ref["id"]]
+        raw_subject, simplified_subject = get_subject(self, doc_ref)
+        flat_doc_ref.update(simplified_subject)
 
-            # TODO: assumes there are no duplicate column names in each observation
-            for observation in associated_observations:
-                flat_observation = SimplifiedResource.build(resource=observation).values
-                flat_doc_ref.update(flat_observation)
+        # extract the subject of the .subject and append its fields
+        # eg: a specimen is associated with a patients
+        _, simplified_subject_of_subject = get_subject(self, raw_subject)
+        flat_doc_ref.update(simplified_subject_of_subject)
+
+        # populate observation data associated with the document reference document
+        update_with_observations(flat_doc_ref, doc_ref["id"], observation_by_focus_id)
 
         # TODO: test this based on fhir-gdc
         if "basedOn" in doc_ref:
@@ -606,16 +618,11 @@ def flattened_specimen(self, specimen: dict, observation_by_id: dict) -> dict:
         flat_specimen = SimplifiedResource.build(resource=specimen).simplified
 
         # extract its .subject and append its fields (including id)
-        flat_specimen.update(get_subject(self, specimen))
+        _, simplified_subject = get_subject(self, specimen)
+        flat_specimen.update(simplified_subject)
 
         # populate observation codes for each associated observation
-        if specimen["id"] in observation_by_id:
-            observations = observation_by_id[specimen["id"]]
-
-            # TODO: assumes there are no duplicate column names in each observation
-            for observation in observations:
-                flat_observation = SimplifiedResource.build(resource=observation).values
-                flat_specimen.update(flat_observation)
+        update_with_observations(flat_specimen, specimen["id"], observation_by_id)
 
         return flat_specimen
 
@@ -725,13 +732,21 @@ def is_number(s):
         return False
 
 
+####################
+# MACROS / HELPERS #
+####################
+
+
 def get_subject(db: LocalFHIRDatabase, resource: dict) -> dict:
-    """get the resource's subject field if it exists"""
+    """
+    get the resource's subject if it exists
+    Return both the raw subject and its simplified version
+    """
 
     # ensure resource has subject field
     subject_key = get_nested_value(resource, ["subject", "reference"])
     if subject_key is None:
-        return {}
+        return {}, {}
 
     # traverse the resource of the subject and return its values
     cursor = db.connect()
@@ -740,7 +755,8 @@ def get_subject(db: LocalFHIRDatabase, resource: dict) -> dict:
     assert row, f"{subject_key} not found in database"
     _, _, raw_subject = row
     subject = json.loads(raw_subject)
-    return traverse(subject)
+
+    return subject, traverse(subject)
 
 
 def get_resources_by_reference(
@@ -804,3 +820,16 @@ def get_conditions_by_subject(
 ) -> dict[str, list]:
     """get all Conditions that have a subject of resource type subject_type"""
     return get_resources_by_reference(db, "Condition", "subject", subject_type)
+
+
+def update_with_observations(resource, id, observations_by_id):
+    """update a resource with the observations associated with the provided ID"""
+    if id in observations_by_id:
+        associated_observations = observations_by_id[id]
+
+        # TODO: assumes there are no duplicate column names in each observation
+        for observation in associated_observations:
+            flat_observation = SimplifiedResource.build(resource=observation).values
+            resource.update(flat_observation)
+
+    return resource
diff --git a/gen3_tracker/meta/entities.py b/gen3_tracker/meta/entities.py
@@ -517,6 +517,7 @@ def values(self) -> dict:
                     for parent_dict in self.resource["parent"]
                 ]
             )
+
         return _values
 
 

diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 
 setup(
     name='gen3_tracker',
-    version='0.0.7rc24',
+    version='0.0.7rc27',
     description='A CLI for adding version control to Gen3 data submission projects.',
     long_description=long_description,
     long_description_content_type='text/markdown',

diff --git a/tests/integration/test_end_to_end_workflow.py b/tests/integration/test_end_to_end_workflow.py
@@ -132,7 +132,6 @@ def test_simple_workflow(runner: CliRunner, project_id, tmpdir) -> None:
     # check the files exist in the cloned directory
     run_command("ls -l")
 
-
     assert Path(
         "my-project-data/hello.txt"
     ).exists(), "hello.txt does not exist in the cloned directory."
-Original file line number
+Diff line change
@@ Expand Up / @@ -517,6 +517,7 @@ def values(self) -> dict: @@
                         for parent_dict in self.resource["parent"]
                     ]
                 )
             return _values
@@ Expand Down @@