Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/urn study db id in obs unit #67

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 34 additions & 16 deletions etl/transform/datadiscovery_cards.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,26 @@
}
]


documents_dbid_fields_plus_field_type = {
"study": [["germplasmDbIds", "germplasm"], ["locationDbId", "location"], ["locationDbIds", "location"],
["trialDbIds", "trial"], ["trialDbId", "trial"], ["programDbId", "program"], ["programDbIds", "program"]],
"study": [
["germplasmDbIds", "germplasm"], ["locationDbId", "location"], ["locationDbIds", "location"],
["trialDbIds", "trial"], ["trialDbId", "trial"], ["programDbId", "program"], ["programDbIds", "program"]
],
"germplasm": [["locationDbIds", "location"], ["studyDbIds", "study"], ["trialDbIds", "trial"]],
"germplasmPedigree": [["germplasmDbId", "germplasm"], ["parent1DbId", "germplasm"], ["parent2DbId", "germplasm"]],
"germplasmPedigree":[
["germplasmDbId", "germplasm"], ["parent1DbId", "germplasm"], ["parent2DbId", "germplasm"],
["siblings","germplasmDbId","object-list","germplasm"]#TODO: same with siblings
],
"germplasmProgeny": [["germplasmDbId", "germplasm"], ["parent1DbId", "germplasm"], ["parent2DbId", "germplasm"]],
"germplasmAttribute": [["germplasmDbId", "germplasm"]],
"observationVariable": [["studyDbIds", "study"]],
"observationUnit": [["studyDbIds", "study"], ["germplasmDbId", "germplasm"], ["studyLocationDbIds", "location"]],
"location": [["studyDbIds", "study"], ["trialDbIds", "trial"]],
"trial": [["germplasmDbIds", "germplasm"], ["locationDbIds", "location"], ["studyDbIds", "study"],
["contactDbIds", "contact"]],
"trial": [
["germplasmDbIds", "germplasm"], ["locationDbIds", "location"], ["studyDbIds", "study"],
["contactDbIds", "contact"]
],
"program": [["trialDbIds", "trial"], ["studyDbIds", "study"]],
"contact": [["trialDbIds", "trial"]]
}
Expand Down Expand Up @@ -268,25 +278,33 @@ def _handle_DbId_URI(document, document_type, documents_dbid_fields_plus_field_t
document[document_type + 'DbId'] = get_generated_uri_from_dict(source, document_type, document, True)
# transform other DbIds, skip observationVariable
if document_type in documents_dbid_fields_plus_field_type:
for fields in documents_dbid_fields_plus_field_type[document_type]:
if fields[0] in document:
if document[fields[0]] and fields[0].endswith("DbIds"):
for current_field in documents_dbid_fields_plus_field_type[document_type]:
if current_field[0] in document:
if document[current_field[0]] and len(current_field)==4 and current_field[2] == "object-list":
# DbIds
field_ids_transformed = map(
lambda x: dict(x, **{
current_field[1]:get_generated_uri_from_str(source, current_field[3], x[current_field[1]], True)
}),
document[current_field[0]])
document[current_field[0]] = list(field_ids_transformed)
elif document[current_field[0]] and current_field[0].endswith("DbIds"):#TODO: could be treated as object-list
# URIs
field_uris_transformed = map(
lambda x: get_generated_uri_from_str(source, fields[1], x, False), document[fields[0]])
document[fields[0].replace("DbIds", "URIs")] = list(set(field_uris_transformed))
lambda x: get_generated_uri_from_str(source, current_field[1], x, False), document[current_field[0]])
document[current_field[0].replace("DbIds", "URIs")] = list(set(field_uris_transformed))
# DbIds
field_ids_transformed = map(
lambda x: get_generated_uri_from_str(source, fields[1], x, True), document[fields[0]])
document[fields[0]] = list(field_ids_transformed)
lambda x: get_generated_uri_from_str(source, current_field[1], x, True), document[current_field[0]])
document[current_field[0]] = list(field_ids_transformed)

elif document[fields[0]] and fields[0].endswith("DbId"):
elif document[current_field[0]] and current_field[0].endswith("DbId"):
# URI
document[fields[0].replace("DbId", "URI")] = get_generated_uri_from_str(source, fields[1],
document[fields[0]],
document[current_field[0].replace("DbId", "URI")] = get_generated_uri_from_str(source, current_field[1],
document[current_field[0]],
False)
# DbId
document[fields[0]] = get_generated_uri_from_str(source, fields[1], document[fields[0]],
document[current_field[0]] = get_generated_uri_from_str(source, current_field[1], document[current_field[0]],
True)

return document
Expand Down
26 changes: 13 additions & 13 deletions etl/transform/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,25 @@ def get_generated_uri_from_dict(source: dict, entity: str, data: dict, do_base64
#TODO: this is going to be problematic since in GnpIS studies are using germplasmDbId(num) and not germplasmDbId(DOI)
#TODO (cont): consider using a fully generated dbId, using urn, no matter what.
#TODO (cont): should be ok, check with Célia, Cyril, Maud, Nico ?
data_uri = data.get(pui_field)
#data_uri = data.get(pui_field)

if data_uri and not keep_urn and rfc3987.match(data_uri, rule='URI'):
# The original PUI is a valid URI
if do_base64:
data_uri = base64.b64encode(data_uri.encode('utf-8')).decode('utf-8')
return data_uri
#if data_uri and not keep_urn and rfc3987.match(data_uri, rule='URI'):
# # The original PUI is a valid URI
# if do_base64:
# data_uri = base64.b64encode(data_uri.encode('utf-8')).decode('utf-8')
# return data_uri

source_id = urllib.parse.quote(source['schema:identifier'])
data_id = get_identifier(entity, data)
if not data_uri or keep_urn:
#if not data_uri or keep_urn:
# Generate URI from source id, entity name and data id
encoded_entity = urllib.parse.quote(entity)
encoded_id = urllib.parse.quote(data_id)
data_uri = f"urn:{source_id}/{encoded_entity}/{encoded_id}"
else:
encoded_entity = urllib.parse.quote(entity)
encoded_id = urllib.parse.quote(data_id)
data_uri = f"urn:{source_id}/{encoded_entity}/{encoded_id}"
#else:
# Generate URI by prepending the original URI with the source identifier
encoded_uri = urllib.parse.quote(data_uri)
data_uri = f"urn:{source_id}/{encoded_uri}"
# encoded_uri = urllib.parse.quote(data_uri)
# data_uri = f"urn:{source_id}/{encoded_uri}"
if not rfc3987.match(data_uri, rule='URI'):
raise Exception(f'Could not get or create a correct URI for "{entity}" object id "{data_id}"'
f' (malformed URI: "{data_uri}")')
Expand Down