Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MNT] Deprecate Cognitive Atlas vocab namespace & add check for unsupported namespaces #410

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

The `bagel-cli` is a Python command-line tool to automatically parse and describe subject phenotypic and imaging attributes in an annotated dataset for integration into the Neurobagel graph.

**Please refer to our [official Neurobagel documentation](https://neurobagel.org/cli/) for information on how to install and use the CLI.**
**Please refer to our [official Neurobagel documentation](https://neurobagel.org/user_guide/cli/) for information on how to install and use the CLI.**


## Development environment
Expand Down
9 changes: 7 additions & 2 deletions bagel/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,13 @@
NP = Namespace(
"np", "https://github.com/nipoppy/pipeline-catalog/tree/main/processing/"
)
# Store all supported amespaces in a list for easy iteration & testing
ALL_NAMESPACES = [COGATLAS, NB, NCIT, NIDM, SNOMED, NP]

# Store supported and deprecated namespaces in separate lists for easy iteration & testing
SUPPORTED_NAMESPACES = [NB, NCIT, NIDM, SNOMED, NP]
SUPPORTED_NAMESPACE_PREFIXES = [ns.pf for ns in SUPPORTED_NAMESPACES]
# Keep deprecated namespaces for informative user messages
DEPRECATED_NAMESPACES = [COGATLAS]
DEPRECATED_NAMESPACE_PREFIXES = [ns.pf for ns in DEPRECATED_NAMESPACES]

BIDS = {
"anat": NIDM.pf + ":Anatomical",
Expand Down
4 changes: 2 additions & 2 deletions bagel/utilities/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@
from pydantic import ValidationError

from bagel import models
from bagel.mappings import ALL_NAMESPACES, NB
from bagel.mappings import NB, SUPPORTED_NAMESPACES
from bagel.utilities import file_utils


def generate_context():
# Adapted from the dandi-schema context generation function
# https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
field_preamble = {
namespace.pf: namespace.url for namespace in ALL_NAMESPACES
namespace.pf: namespace.url for namespace in SUPPORTED_NAMESPACES
}
fields = {}
for klass_name, klass in inspect.getmembers(models):
Expand Down
75 changes: 73 additions & 2 deletions bagel/utilities/pheno_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
from typer import BadParameter

from bagel import dictionary_models, mappings
from bagel.mappings import NB
from bagel.mappings import (
DEPRECATED_NAMESPACE_PREFIXES,
NB,
SUPPORTED_NAMESPACE_PREFIXES,
)

DICTIONARY_SCHEMA = dictionary_models.DataDictionary.model_json_schema()

Expand Down Expand Up @@ -64,7 +68,7 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
]


def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
def get_annotated_columns(data_dict: dict) -> list[tuple[str, dict]]:
alyssadai marked this conversation as resolved.
Show resolved Hide resolved
"""
Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary,
where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and
Expand All @@ -77,6 +81,53 @@ def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
]


def recursive_find_values_for_key(data: dict, target: str) -> list:
    """
    Collect every value stored under the key `target` anywhere in a (possibly nested) dictionary.

    Values are gathered in dictionary iteration order. When a key matches `target`, its value is
    recorded as-is and is not searched any further; values under all other keys are searched recursively.
    Anything that is not a dict (including lists and scalars) contributes no values.

    TODO: Only nested dicts are traversed at the moment. This would need to be extended if Neurobagel
    data dictionaries grow to have controlled terms inside list objects.
    """
    # Non-dict input (scalars, lists, None) cannot hold the key, so there is nothing to collect.
    if not isinstance(data, dict):
        return []

    matches = []
    for name, entry in data.items():
        if name == target:
            matches.append(entry)
            continue
        # Not the key we want: descend into the value in case it is a nested dict.
        matches += recursive_find_values_for_key(data=entry, target=target)
    return matches


def find_unsupported_namespaces_and_term_urls(
    data_dict: dict,
) -> tuple[list, dict]:
    """
    Scan the annotated columns of a data dictionary for term URLs whose namespace prefix is not supported.

    The prefix of a term URL is the text before its first ":". Every "TermURL" value found in a
    column's "Annotations" is checked against SUPPORTED_NAMESPACE_PREFIXES.

    Returns a tuple of
    - the sorted list of unsupported prefixes that were encountered, and
    - a dictionary mapping each offending column name to an unrecognized term URL from that column.
      NOTE: a column is stored under a single key, so if it contains several unsupported term URLs
      only the last one found is kept.
    """
    offending_term_urls = {}
    bad_prefixes = set()

    for column_name, column_entry in get_annotated_columns(data_dict):
        term_urls = recursive_find_values_for_key(
            column_entry["Annotations"], "TermURL"
        )
        for term_url in term_urls:
            namespace_prefix = term_url.split(":")[0]
            if namespace_prefix in SUPPORTED_NAMESPACE_PREFIXES:
                continue
            bad_prefixes.add(namespace_prefix)
            offending_term_urls[column_name] = term_url

    # Sorting gives the prefixes a predictable order in the error message.
    return sorted(bad_prefixes), offending_term_urls


def find_deprecated_namespaces(namespaces: list) -> list:
    """
    Pick out the namespace prefixes in `namespaces` that belong to deprecated vocabularies.

    The input order (and any duplicates) is preserved in the returned list.
    """
    deprecated_found = []
    for prefix in namespaces:
        if prefix in DEPRECATED_NAMESPACE_PREFIXES:
            deprecated_found.append(prefix)
    return deprecated_found


def map_categories_to_columns(data_dict: dict) -> dict:
"""
Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that
Expand Down Expand Up @@ -315,6 +366,26 @@ def validate_data_dict(data_dict: dict) -> None:
"The provided data dictionary must contain at least one column with Neurobagel annotations."
)

unsupported_namespaces, unrecognized_term_urls = (
find_unsupported_namespaces_and_term_urls(data_dict)
)
if unsupported_namespaces:
namespace_deprecation_msg = ""
if deprecated_namespaces := find_deprecated_namespaces(
unsupported_namespaces
):
namespace_deprecation_msg = (
f"\n\nMore info: The following vocabularies have been deprecated by Neurobagel: {deprecated_namespaces}. "
"Please update your data dictionary using the latest version of the annotation tool at https://annotate.neurobagel.org."
)
raise LookupError(
f"The provided data dictionary contains unsupported vocabulary namespace prefixes: {unsupported_namespaces}\n"
f"Unsupported vocabularies are used for terms in the following columns' annotations: {unrecognized_term_urls}\n"
"Please ensure that the data dictionary only includes terms from Neurobagel recognized vocabularies. "
"(See https://neurobagel.org/data_models/dictionaries/.)"
f"{namespace_deprecation_msg}"
)

if (
len(
get_columns_about(
Expand Down
41 changes: 41 additions & 0 deletions generate_neurobagel_example_jsonlds.sh
alyssadai marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
#
# Regenerate the example synthetic JSONLD files inside the tests/neurobagel_examples submodule
# by building the bagel Docker image from the current checkout and running the CLI in it.
#
# Steps to use:
# 1. cd into the tests/neurobagel_examples submodule and create a new branch that will contain the updated example files
# 2. Navigate back to the bagel-cli repository root directory and run this script from there to regenerate
#    the example synthetic JSONLD files inside of the tests/neurobagel_examples submodule.
# 3. Navigate again to tests/neurobagel_examples and from there, commit the changes, push the changes to the submodule origin, and open a PR there to merge the updated examples.

# Abort on the first failing command (or unset variable): each later step consumes the output
# of an earlier one, so continuing after a failed build, a failed `cd`, or a failed CLI run
# would operate on a stale image, the wrong mount directory, or a stale JSONLD.
set -euo pipefail

docker build -t bagel .

# All paths below are relative to tests/, which is also the directory mounted into the container.
cd tests

data_dir=neurobagel_examples/data-upload

# Phenotypic data only JSONLD
docker run --rm --volume="$PWD":/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel pheno \
    --pheno "${data_dir}/example_synthetic.tsv" \
    --dictionary "${data_dir}/example_synthetic.json" \
    --name "BIDS synthetic" \
    --output "${data_dir}/example_synthetic.jsonld" \
    --overwrite

# Phenotypic & BIDS data JSONLD
docker run --rm --volume="$PWD":/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel bids \
    --jsonld-path "${data_dir}/example_synthetic.jsonld" \
    --bids-dir bids-examples/synthetic \
    --output "${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld" \
    --overwrite

# Phenotypic & derivatives data JSONLD
docker run --rm --volume="$PWD":/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel derivatives \
    --tabular "${data_dir}/nipoppy_proc_status_synthetic.tsv" \
    --jsonld-path "${data_dir}/example_synthetic.jsonld" \
    --output "${data_dir}/pheno-derivatives-output/example_synthetic_pheno-derivatives.jsonld" \
    --overwrite

# Phenotypic, BIDS, and derivatives data JSONLD
# (built on top of the pheno-bids JSONLD generated above)
docker run --rm --volume="$PWD":/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel derivatives \
    --tabular "${data_dir}/nipoppy_proc_status_synthetic.tsv" \
    --jsonld-path "${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld" \
    --output "${data_dir}/pheno-bids-derivatives-output/example_synthetic_pheno-bids-derivatives.jsonld" \
    --overwrite
4 changes: 2 additions & 2 deletions tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| 5 | valid, has assessment tool columns | invalid, has TermURLs from unsupported vocabularies | fail |
| 6 | valid, same as example 5. | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
Expand Down
6 changes: 3 additions & 3 deletions tests/data/example10.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
Expand All @@ -65,7 +65,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
Expand All @@ -79,7 +79,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
Expand Down
6 changes: 3 additions & 3 deletions tests/data/example11.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -65,7 +65,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -79,7 +79,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
Expand Down
6 changes: 3 additions & 3 deletions tests/data/example13.json
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -107,7 +107,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -121,7 +121,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["not completed"]
Expand Down
45 changes: 44 additions & 1 deletion tests/data/example5.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,50 @@
"TermURL": "ncit:C94342",
"Label": "Healthy Control"
}
}
},
"MissingValues": ["OTHER"]
}
},
"tool_item1": {
"Description": "item 1 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "unknownvocab:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"tool_item2": {
"Description": "item 2 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "unknownvocab:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"other_tool_item1": {
"Description": "item 1 scores for a different imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
}
}
}
12 changes: 7 additions & 5 deletions tests/data/example5.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
participant_id session_id group
sub-01 ses-01 PAT
sub-01 ses-02 PAT
sub-02 ses-01 OTHER
sub-02 ses-02 CTRL
participant_id session_id group tool_item1 tool_item2 other_tool_item1
sub-01 ses-01 PAT 11.0 "missing" "none"
sub-01 ses-02 PAT "missing" 12.0 "none"
sub-02 ses-01 OTHER "missing" "missing" "none"
sub-02 ses-02 OTHER "missing" "missing" "none"
sub-03 ses-01 CTRL 10.0 8.0 "ok"
sub-03 ses-02 CTRL 10.0 8.0 "bad"
6 changes: 3 additions & 3 deletions tests/data/example6.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -65,7 +65,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -79,7 +79,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
Expand Down
6 changes: 3 additions & 3 deletions tests/data/example9.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -65,7 +65,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -79,7 +79,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
Expand Down
Loading
Loading