elixir-europe · cpommier · Jan 3, 2024 · Jan 3, 2024 · Mar 12, 2024
diff --git a/etl/transform/datadiscovery_cards.py b/etl/transform/datadiscovery_cards.py
@@ -69,16 +69,26 @@
     }
 ]
 
+
 documents_dbid_fields_plus_field_type = {
-    "study": [["germplasmDbIds", "germplasm"], ["locationDbId", "location"], ["locationDbIds", "location"],
-              ["trialDbIds", "trial"], ["trialDbId", "trial"], ["programDbId", "program"], ["programDbIds", "program"]],
+    "study": [  # 2-item entries: [DbId field looked up in the document, value passed as entity to get_generated_uri_from_str]
+        ["germplasmDbIds", "germplasm"], ["locationDbId", "location"], ["locationDbIds", "location"],
+        ["trialDbIds", "trial"], ["trialDbId", "trial"], ["programDbId", "program"], ["programDbIds", "program"]
+    ],
     "germplasm": [["locationDbIds", "location"], ["studyDbIds", "study"], ["trialDbIds", "trial"]],
-    "germplasmPedigree": [["germplasmDbId", "germplasm"], ["parent1DbId", "germplasm"], ["parent2DbId", "germplasm"]],
+    "germplasmPedigree":[
+            ["germplasmDbId", "germplasm"], ["parent1DbId", "germplasm"], ["parent2DbId", "germplasm"],
+            ["siblings","germplasmDbId","object-list","germplasm"]  # 4-item form: [list field, DbId key inside each listed object, "object-list" marker, entity]; TODO: same with siblings
+    ],
     "germplasmProgeny": [["germplasmDbId", "germplasm"], ["parent1DbId", "germplasm"], ["parent2DbId", "germplasm"]],
+    "germplasmAttribute": [["germplasmDbId", "germplasm"]],
     "observationVariable": [["studyDbIds", "study"]],
+    "observationUnit": [["studyDbIds", "study"], ["germplasmDbId", "germplasm"], ["studyLocationDbIds", "location"]],
     "location": [["studyDbIds", "study"], ["trialDbIds", "trial"]],
-    "trial": [["germplasmDbIds", "germplasm"], ["locationDbIds", "location"], ["studyDbIds", "study"],
-              ["contactDbIds", "contact"]],
+    "trial": [
+        ["germplasmDbIds", "germplasm"], ["locationDbIds", "location"], ["studyDbIds", "study"],
+        ["contactDbIds", "contact"]
+    ],
     "program": [["trialDbIds", "trial"], ["studyDbIds", "study"]],
     "contact": [["trialDbIds", "trial"]]
 }
@@ -268,25 +278,33 @@ def _handle_DbId_URI(document, document_type, documents_dbid_fields_plus_field_t
         document[document_type + 'DbId'] = get_generated_uri_from_dict(source, document_type, document, True)
     # transform other DbIds , skip observationVariable
     if document_type in documents_dbid_fields_plus_field_type:
-        for fields in documents_dbid_fields_plus_field_type[document_type]:
-            if fields[0] in document:
-                if document[fields[0]] and fields[0].endswith("DbIds"):
+        for current_field in documents_dbid_fields_plus_field_type[document_type]:
+            if current_field[0] in document:
+                if document[current_field[0]] and len(current_field)==4 and current_field[2] == "object-list":
+                    # DbIds
+                    field_ids_transformed = map(
+                        lambda x: dict(x, **{
+                            current_field[1]:get_generated_uri_from_str(source, current_field[3], x[current_field[1]], True)
+                        }),
+                        document[current_field[0]])
+                    document[current_field[0]] = list(field_ids_transformed)
+                elif document[current_field[0]] and current_field[0].endswith("DbIds"):#TODO: could be treated as object-list
                     # URIs
                     field_uris_transformed = map(
-                        lambda x: get_generated_uri_from_str(source, fields[1], x, False), document[fields[0]])
-                    document[fields[0].replace("DbIds", "URIs")] = list(set(field_uris_transformed))
+                        lambda x: get_generated_uri_from_str(source, current_field[1], x, False), document[current_field[0]])
+                    document[current_field[0].replace("DbIds", "URIs")] = list(set(field_uris_transformed))
                     # DbIds
                     field_ids_transformed = map(
-                        lambda x: get_generated_uri_from_str(source, fields[1], x, True), document[fields[0]])
-                    document[fields[0]] = list(field_ids_transformed)
+                        lambda x: get_generated_uri_from_str(source, current_field[1], x, True), document[current_field[0]])
+                    document[current_field[0]] = list(field_ids_transformed)
 
-                elif document[fields[0]] and fields[0].endswith("DbId"):
+                elif document[current_field[0]] and current_field[0].endswith("DbId"):
                     # URI
-                    document[fields[0].replace("DbId", "URI")] = get_generated_uri_from_str(source, fields[1],
-                                                                                            document[fields[0]],
+                    document[current_field[0].replace("DbId", "URI")] = get_generated_uri_from_str(source, current_field[1],
+                                                                                            document[current_field[0]],
                                                                                             False)
                     # DbId
-                    document[fields[0]] = get_generated_uri_from_str(source, fields[1], document[fields[0]],
+                    document[current_field[0]] = get_generated_uri_from_str(source, current_field[1], document[current_field[0]],
                                                                      True)
 
     return document

diff --git a/etl/transform/utils.py b/etl/transform/utils.py
@@ -21,25 +21,25 @@ def get_generated_uri_from_dict(source: dict, entity: str, data: dict, do_base64
     #TODO: this is going to be problematic since in GnpIS studies are using germplasmDbIb(num) and not germplasmDbIb(DOI)
     #TODO (cont): consider using a fully generated dbId, using urn, no matter what.
     #TODO (cont): should be ok, check with Célia, Cyril, Maud, Nico ?
-    data_uri = data.get(pui_field)
+    #data_uri = data.get(pui_field)
 
-    if data_uri and not keep_urn and rfc3987.match(data_uri, rule='URI'):
-        # The original PUI is a valid URI
-        if do_base64:
-            data_uri = base64.b64encode(data_uri.encode('utf-8')).decode('utf-8')
-        return data_uri
+    #if data_uri and not keep_urn and rfc3987.match(data_uri, rule='URI'):
+    #    # The original PUI is a valid URI
+    #    if do_base64:
+    #        data_uri = base64.b64encode(data_uri.encode('utf-8')).decode('utf-8')
+    #    return data_uri
 
     source_id = urllib.parse.quote(source['schema:identifier'])
     data_id = get_identifier(entity, data)
-    if not data_uri or keep_urn:
+    #if not data_uri or keep_urn:
         # Generate URI from source id, entity name and data id
-        encoded_entity = urllib.parse.quote(entity)
-        encoded_id = urllib.parse.quote(data_id)
-        data_uri = f"urn:{source_id}/{encoded_entity}/{encoded_id}"
-    else:
+    encoded_entity = urllib.parse.quote(entity)
+    encoded_id = urllib.parse.quote(data_id)
+    data_uri = f"urn:{source_id}/{encoded_entity}/{encoded_id}"
+    #else:
         # Generate URI by prepending the original URI with the source identifier
-        encoded_uri = urllib.parse.quote(data_uri)
-        data_uri = f"urn:{source_id}/{encoded_uri}"
+    #    encoded_uri = urllib.parse.quote(data_uri)
+    #    data_uri = f"urn:{source_id}/{encoded_uri}"
     if not rfc3987.match(data_uri, rule='URI'):
         raise Exception(f'Could not get or create a correct URI for "{entity}" object id "{data_id}"'
                         f' (malformed URI: "{data_uri}")')