Skip to content

Commit 1b7f9fe

Browse files
alyssadaisurchs
andauthored
[MNT] Deprecate Cognitive Atlas vocab namespace & add check for unsupported namespaces (#410)
* update test data README * update type hint * add test for phenotypic TSV with unrecognized vocab namespace * add check for unrecognized namespaces in data dict * add global var and check for deprecated namespaces * test extraction of unsupported namespaces * test deprecated namespace extraction and move checks to data dict validation * fix outdated docs link in README * add script to regenerate JSONLDs in neurobagel_examples submodule * rework example5 to have unsupported vocabs in data dict - example5 previously wasn't used anywhere and was conceptually a duplicate of example9 * update neurobagel_examples submodule * update tests * update JSONLD regeneration script docstring Co-authored-by: Sebastian Urchs <surchs@users.noreply.github.com> --------- Co-authored-by: Sebastian Urchs <surchs@users.noreply.github.com>
1 parent ec504d5 commit 1b7f9fe

17 files changed

+281
-37
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
The `bagel-cli` is a Python command-line tool to automatically parse and describe subject phenotypic and imaging attributes in an annotated dataset for integration into the Neurobagel graph.
1616

17-
**Please refer to our [official Neurobagel documentation](https://neurobagel.org/cli/) for information on how to install and use the CLI.**
17+
**Please refer to our [official Neurobagel documentation](https://neurobagel.org/user_guide/cli/) for information on how to install and use the CLI.**
1818

1919

2020
## Development environment

bagel/mappings.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,13 @@
1616
NP = Namespace(
1717
"np", "https://github.com/nipoppy/pipeline-catalog/tree/main/processing/"
1818
)
19-
# Store all supported amespaces in a list for easy iteration & testing
20-
ALL_NAMESPACES = [COGATLAS, NB, NCIT, NIDM, SNOMED, NP]
19+
20+
# Store all supported and deprecated namespaces in separate lists for easy iteration & testing
21+
SUPPORTED_NAMESPACES = [NB, NCIT, NIDM, SNOMED, NP]
22+
SUPPORTED_NAMESPACE_PREFIXES = [ns.pf for ns in SUPPORTED_NAMESPACES]
23+
# Keep deprecated namespaces for informative user messages
24+
DEPRECATED_NAMESPACES = [COGATLAS]
25+
DEPRECATED_NAMESPACE_PREFIXES = [ns.pf for ns in DEPRECATED_NAMESPACES]
2126

2227
BIDS = {
2328
"anat": NIDM.pf + ":Anatomical",

bagel/utilities/model_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77
from pydantic import ValidationError
88

99
from bagel import models
10-
from bagel.mappings import ALL_NAMESPACES, NB
10+
from bagel.mappings import NB, SUPPORTED_NAMESPACES
1111
from bagel.utilities import file_utils
1212

1313

1414
def generate_context():
1515
# Adapted from the dandi-schema context generation function
1616
# https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
1717
field_preamble = {
18-
namespace.pf: namespace.url for namespace in ALL_NAMESPACES
18+
namespace.pf: namespace.url for namespace in SUPPORTED_NAMESPACES
1919
}
2020
fields = {}
2121
for klass_name, klass in inspect.getmembers(models):

bagel/utilities/pheno_utils.py

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@
1111
from typer import BadParameter
1212

1313
from bagel import dictionary_models, mappings
14-
from bagel.mappings import NB
14+
from bagel.mappings import (
15+
DEPRECATED_NAMESPACE_PREFIXES,
16+
NB,
17+
SUPPORTED_NAMESPACE_PREFIXES,
18+
)
1519

1620
DICTIONARY_SCHEMA = dictionary_models.DataDictionary.model_json_schema()
1721

@@ -64,7 +68,7 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
6468
]
6569

6670

67-
def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
71+
def get_annotated_columns(data_dict: dict) -> list[tuple[str, dict]]:
6872
"""
6973
Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary,
7074
where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and
@@ -77,6 +81,53 @@ def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
7781
]
7882

7983

84+
def recursive_find_values_for_key(data: dict, target: str) -> list:
    """
    Recursively search a possibly nested data structure for a key and return a list of all values found for that key.

    Dictionaries are searched key by key: when a key equals `target`, its value is collected as-is
    (and is not searched any further); the values of all other keys are searched recursively.
    Lists and tuples are traversed item by item, so that dictionaries nested inside them
    (e.g., controlled terms stored inside list objects) are searched as well.
    Any other kind of object contributes no values.

    Values are returned in the order in which they are encountered.
    """
    target_values = []
    if isinstance(data, dict):
        for key, value in data.items():
            if key == target:
                target_values.append(value)
            else:
                target_values.extend(
                    recursive_find_values_for_key(data=value, target=target)
                )
    elif isinstance(data, (list, tuple)):
        # Strings are deliberately not treated as sequences here: only containers
        # that can hold nested dictionaries are worth descending into.
        for item in data:
            target_values.extend(
                recursive_find_values_for_key(data=item, target=target)
            )
    return target_values
101+
102+
103+
def find_unsupported_namespaces_and_term_urls(
    data_dict: dict,
) -> tuple[list, dict]:
    """
    Scan the annotated columns of a data dictionary for term URLs whose namespace prefix is not supported.

    Returns a tuple of
    - a sorted list of the unsupported namespace prefixes that were found, and
    - a dictionary mapping each offending column name to an unrecognized term URL from its annotations.

    NOTE: Only one term URL is stored per column. If a column's annotations contain several
    unrecognized term URLs, the last one encountered is the one that is kept.
    """
    found_prefixes = set()
    offending_term_urls = {}

    for column_name, column_content in get_annotated_columns(data_dict):
        term_urls = recursive_find_values_for_key(
            column_content["Annotations"], "TermURL"
        )
        for term_url in term_urls:
            # The namespace prefix is the part of the term URL before the first colon
            namespace_prefix = term_url.split(":")[0]
            if namespace_prefix in SUPPORTED_NAMESPACE_PREFIXES:
                continue
            found_prefixes.add(namespace_prefix)
            offending_term_urls[column_name] = term_url

    # Sorting gives the prefixes a predictable order wherever they are reported
    return sorted(found_prefixes), offending_term_urls
124+
125+
126+
def find_deprecated_namespaces(namespaces: list) -> list:
    """
    Filter a list of namespace prefixes down to those that are deprecated.

    The order of the input list (and any duplicates in it) is preserved in the result.
    """
    deprecated = []
    for prefix in namespaces:
        if prefix in DEPRECATED_NAMESPACE_PREFIXES:
            deprecated.append(prefix)
    return deprecated
129+
130+
80131
def map_categories_to_columns(data_dict: dict) -> dict:
81132
"""
82133
Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that
@@ -315,6 +366,26 @@ def validate_data_dict(data_dict: dict) -> None:
315366
"The provided data dictionary must contain at least one column with Neurobagel annotations."
316367
)
317368

369+
unsupported_namespaces, unrecognized_term_urls = (
370+
find_unsupported_namespaces_and_term_urls(data_dict)
371+
)
372+
if unsupported_namespaces:
373+
namespace_deprecation_msg = ""
374+
if deprecated_namespaces := find_deprecated_namespaces(
375+
unsupported_namespaces
376+
):
377+
namespace_deprecation_msg = (
378+
f"\n\nMore info: The following vocabularies have been deprecated by Neurobagel: {deprecated_namespaces}. "
379+
"Please update your data dictionary using the latest version of the annotation tool at https://annotate.neurobagel.org."
380+
)
381+
raise LookupError(
382+
f"The provided data dictionary contains unsupported vocabulary namespace prefixes: {unsupported_namespaces}\n"
383+
f"Unsupported vocabularies are used for terms in the following columns' annotations: {unrecognized_term_urls}\n"
384+
"Please ensure that the data dictionary only includes terms from Neurobagel recognized vocabularies. "
385+
"(See https://neurobagel.org/data_models/dictionaries/.)"
386+
f"{namespace_deprecation_msg}"
387+
)
388+
318389
if (
319390
len(
320391
get_columns_about(
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
#
# Regenerate the example synthetic JSONLD files inside of the tests/neurobagel_examples submodule,
# using a Docker image of the CLI built from the current state of the repository.
#
# Steps to use:
# 1. cd into the tests/neurobagel_examples submodule and create a new branch that will contain the updated example files
# 2. Navigate back to the bagel-cli repository root directory and run this script from there to regenerate
#    the example synthetic JSONLD files inside of the tests/neurobagel_examples submodule.
# 3. Navigate again to tests/neurobagel_examples and from there, commit the changes, push the changes to the submodule origin, and open a PR there to merge the updated examples.

# Stop at the first failing command, so that later steps never run against a stale image,
# from the wrong working directory, or on top of a JSONLD that failed to regenerate.
set -euo pipefail

docker build -t bagel .
cd tests

data_dir=neurobagel_examples/data-upload

# Run a bagel CLI command inside the container, with the current directory (tests/)
# mounted and used as the working directory. All arguments are passed through to the CLI.
# The volume argument is quoted so that a checkout path containing spaces still works.
run_bagel() {
    docker run --rm --volume="$PWD:/data/neurobagel/bagel-cli" -w /data/neurobagel/bagel-cli bagel "$@"
}

# Phenotypic data only JSONLD
run_bagel pheno \
    --pheno "${data_dir}/example_synthetic.tsv" \
    --dictionary "${data_dir}/example_synthetic.json" \
    --name "BIDS synthetic" \
    --output "${data_dir}/example_synthetic.jsonld" \
    --overwrite

# Phenotypic & BIDS data JSONLD
run_bagel bids \
    --jsonld-path "${data_dir}/example_synthetic.jsonld" \
    --bids-dir bids-examples/synthetic \
    --output "${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld" \
    --overwrite

# Phenotypic & derivatives data JSONLD
run_bagel derivatives \
    --tabular "${data_dir}/nipoppy_proc_status_synthetic.tsv" \
    --jsonld-path "${data_dir}/example_synthetic.jsonld" \
    --output "${data_dir}/pheno-derivatives-output/example_synthetic_pheno-derivatives.jsonld" \
    --overwrite

# Phenotypic, BIDS, and derivatives data JSONLD
# (built on top of the phenotypic & BIDS JSONLD generated above)
run_bagel derivatives \
    --tabular "${data_dir}/nipoppy_proc_status_synthetic.tsv" \
    --jsonld-path "${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld" \
    --output "${data_dir}/pheno-bids-derivatives-output/example_synthetic_pheno-bids-derivatives.jsonld" \
    --overwrite

tests/data/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
99
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
1010
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
11-
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
12-
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
11+
| 5 | valid, has assessment tool columns | invalid, has TermURLs from unsupported vocabularies | fail |
12+
| 6 | valid, same as example 5. | valid, contains `"MissingValues"` attribute for categorical variable | pass |
1313
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
1414
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
1515
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |

tests/data/example10.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
"Label": "Assessment tool"
5252
},
5353
"IsPartOf": {
54-
"TermURL": "cogatlas:1234",
54+
"TermURL": "snomed:1234",
5555
"Label": "Imaginary tool"
5656
},
5757
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
@@ -65,7 +65,7 @@
6565
"Label": "Assessment tool"
6666
},
6767
"IsPartOf": {
68-
"TermURL": "cogatlas:1234",
68+
"TermURL": "snomed:1234",
6969
"Label": "Imaginary tool"
7070
},
7171
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
@@ -79,7 +79,7 @@
7979
"Label": "Assessment tool"
8080
},
8181
"IsPartOf": {
82-
"TermURL": "cogatlas:4321",
82+
"TermURL": "snomed:4321",
8383
"Label": "A different imaginary tool"
8484
},
8585
"MissingValues": ["none"]

tests/data/example11.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
"Label": "Assessment tool"
5252
},
5353
"IsPartOf": {
54-
"TermURL": "cogatlas:1234",
54+
"TermURL": "snomed:1234",
5555
"Label": "Imaginary tool"
5656
},
5757
"MissingValues": ["missing"]
@@ -65,7 +65,7 @@
6565
"Label": "Assessment tool"
6666
},
6767
"IsPartOf": {
68-
"TermURL": "cogatlas:1234",
68+
"TermURL": "snomed:1234",
6969
"Label": "Imaginary tool"
7070
},
7171
"MissingValues": ["missing"]
@@ -79,7 +79,7 @@
7979
"Label": "Assessment tool"
8080
},
8181
"IsPartOf": {
82-
"TermURL": "cogatlas:4321",
82+
"TermURL": "snomed:4321",
8383
"Label": "A different imaginary tool"
8484
},
8585
"MissingValues": ["none"]

tests/data/example13.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
"Label": "Assessment tool"
9494
},
9595
"IsPartOf": {
96-
"TermURL": "cogatlas:1234",
96+
"TermURL": "snomed:1234",
9797
"Label": "Imaginary tool"
9898
},
9999
"MissingValues": ["missing"]
@@ -107,7 +107,7 @@
107107
"Label": "Assessment tool"
108108
},
109109
"IsPartOf": {
110-
"TermURL": "cogatlas:1234",
110+
"TermURL": "snomed:1234",
111111
"Label": "Imaginary tool"
112112
},
113113
"MissingValues": ["missing"]
@@ -121,7 +121,7 @@
121121
"Label": "Assessment tool"
122122
},
123123
"IsPartOf": {
124-
"TermURL": "cogatlas:4321",
124+
"TermURL": "snomed:4321",
125125
"Label": "A different imaginary tool"
126126
},
127127
"MissingValues": ["not completed"]

tests/data/example5.json

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,50 @@
3939
"TermURL": "ncit:C94342",
4040
"Label": "Healthy Control"
4141
}
42-
}
42+
},
43+
"MissingValues": ["OTHER"]
44+
}
45+
},
46+
"tool_item1": {
47+
"Description": "item 1 scores for an imaginary tool",
48+
"Annotations": {
49+
"IsAbout": {
50+
"TermURL": "nb:Assessment",
51+
"Label": "Assessment tool"
52+
},
53+
"IsPartOf": {
54+
"TermURL": "unknownvocab:1234",
55+
"Label": "Imaginary tool"
56+
},
57+
"MissingValues": ["missing"]
58+
}
59+
},
60+
"tool_item2": {
61+
"Description": "item 2 scores for an imaginary tool",
62+
"Annotations": {
63+
"IsAbout": {
64+
"TermURL": "nb:Assessment",
65+
"Label": "Assessment tool"
66+
},
67+
"IsPartOf": {
68+
"TermURL": "unknownvocab:1234",
69+
"Label": "Imaginary tool"
70+
},
71+
"MissingValues": ["missing"]
72+
}
73+
},
74+
"other_tool_item1": {
75+
"Description": "item 1 scores for a different imaginary tool",
76+
"Annotations": {
77+
"IsAbout": {
78+
"TermURL": "nb:Assessment",
79+
"Label": "Assessment tool"
80+
},
81+
"IsPartOf": {
82+
"TermURL": "cogatlas:4321",
83+
"Label": "A different imaginary tool"
84+
},
85+
"MissingValues": ["none"]
4386
}
4487
}
4588
}

tests/data/example5.tsv

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
participant_id session_id group
2-
sub-01 ses-01 PAT
3-
sub-01 ses-02 PAT
4-
sub-02 ses-01 OTHER
5-
sub-02 ses-02 CTRL
1+
participant_id session_id group tool_item1 tool_item2 other_tool_item1
2+
sub-01 ses-01 PAT 11.0 "missing" "none"
3+
sub-01 ses-02 PAT "missing" 12.0 "none"
4+
sub-02 ses-01 OTHER "missing" "missing" "none"
5+
sub-02 ses-02 OTHER "missing" "missing" "none"
6+
sub-03 ses-01 CTRL 10.0 8.0 "ok"
7+
sub-03 ses-02 CTRL 10.0 8.0 "bad"

tests/data/example6.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
"Label": "Assessment tool"
5252
},
5353
"IsPartOf": {
54-
"TermURL": "cogatlas:1234",
54+
"TermURL": "snomed:1234",
5555
"Label": "Imaginary tool"
5656
},
5757
"MissingValues": ["missing"]
@@ -65,7 +65,7 @@
6565
"Label": "Assessment tool"
6666
},
6767
"IsPartOf": {
68-
"TermURL": "cogatlas:1234",
68+
"TermURL": "snomed:1234",
6969
"Label": "Imaginary tool"
7070
},
7171
"MissingValues": ["missing"]
@@ -79,7 +79,7 @@
7979
"Label": "Assessment tool"
8080
},
8181
"IsPartOf": {
82-
"TermURL": "cogatlas:4321",
82+
"TermURL": "snomed:4321",
8383
"Label": "A different imaginary tool"
8484
},
8585
"MissingValues": ["none"]

tests/data/example9.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
"Label": "Assessment tool"
5252
},
5353
"IsPartOf": {
54-
"TermURL": "cogatlas:1234",
54+
"TermURL": "snomed:1234",
5555
"Label": "Imaginary tool"
5656
},
5757
"MissingValues": ["missing"]
@@ -65,7 +65,7 @@
6565
"Label": "Assessment tool"
6666
},
6767
"IsPartOf": {
68-
"TermURL": "cogatlas:1234",
68+
"TermURL": "snomed:1234",
6969
"Label": "Imaginary tool"
7070
},
7171
"MissingValues": ["missing"]
@@ -79,7 +79,7 @@
7979
"Label": "Assessment tool"
8080
},
8181
"IsPartOf": {
82-
"TermURL": "cogatlas:4321",
82+
"TermURL": "snomed:4321",
8383
"Label": "A different imaginary tool"
8484
},
8585
"MissingValues": ["none"]

0 commit comments

Comments
 (0)