Skip to content

Commit

Permalink
[REF] Refactor utilities (#385)
Browse files Browse the repository at this point in the history
* move utils into dedicated module

* rename utility.py to model_utils.py

* fix imports

* split test_utility.py into util-specific tests

* rename generic test_utility.py

* remove uninformative test_generate_context()

* refactor out merging of context and graph dataset instance

* refactor out jsonld saving

* move tests of pipeline-related utils

* test add_context_to_graph_dataset()

* organize tests as integration or unit

* use full name of util modules & rename derivative_utils

* expand name of unit tests for clarity
  • Loading branch information
alyssadai authored Nov 11, 2024
1 parent bacaf13 commit 3f33239
Show file tree
Hide file tree
Showing 17 changed files with 859 additions and 870 deletions.
122 changes: 57 additions & 65 deletions bagel/cli.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
import json
from pathlib import Path

import typer
from bids import BIDSLayout

import bagel.bids_utils as butil
import bagel.derivatives_utils as dutil
import bagel.file_utils as futil
import bagel.pheno_utils as putil
from bagel import mappings, models
from bagel.derivatives_utils import PROC_STATUS_COLS
from bagel.utility import (
confirm_subs_match_pheno_data,
extract_and_validate_jsonld_dataset,
generate_context,
get_imaging_session_instances,
get_subject_instances,

from .utilities import (
bids_utils,
derivative_utils,
file_utils,
model_utils,
pheno_utils,
)
from .utilities.derivative_utils import PROC_STATUS_COLS

CUSTOM_SESSION_LABEL = "ses-unnamed"

Expand Down Expand Up @@ -66,7 +62,7 @@ def pheno(
None,
"--portal",
"-u", # for URL
callback=putil.validate_portal_uri,
callback=pheno_utils.validate_portal_uri,
help="URL (HTTP/HTTPS) to a website or page that describes the dataset and access instructions (if available).",
),
output: Path = typer.Option(
Expand Down Expand Up @@ -94,11 +90,11 @@ def pheno(
graph data model for the provided phenotypic file in the .jsonld format.
You can upload this .jsonld file to the Neurobagel graph.
"""
futil.check_overwrite(output, overwrite)
file_utils.check_overwrite(output, overwrite)

data_dictionary = futil.load_json(dictionary)
pheno_df = futil.load_tabular(pheno)
putil.validate_inputs(data_dictionary, pheno_df)
data_dictionary = file_utils.load_json(dictionary)
pheno_df = file_utils.load_tabular(pheno)
pheno_utils.validate_inputs(data_dictionary, pheno_df)

# NOTE: `space` determines the amount of padding (in num. characters) before the file paths in the print statement.
# It is currently calculated as = (length of the longer string, including the 3 leading spaces) + (2 extra spaces)
Expand All @@ -111,8 +107,8 @@ def pheno(

subject_list = []

column_mapping = putil.map_categories_to_columns(data_dictionary)
tool_mapping = putil.map_tools_to_columns(data_dictionary)
column_mapping = pheno_utils.map_categories_to_columns(data_dictionary)
tool_mapping = pheno_utils.map_tools_to_columns(data_dictionary)

# TODO: needs refactoring once we handle multiple participant IDs
participants = column_mapping.get("participant")[0]
Expand Down Expand Up @@ -140,15 +136,15 @@ def pheno(
_ses_pheno = session_row

if "sex" in column_mapping.keys():
_sex_vals = putil.get_transformed_values(
_sex_vals = pheno_utils.get_transformed_values(
column_mapping["sex"], _ses_pheno, data_dictionary
)
if _sex_vals:
# NOTE: Our data model only allows a single sex value, so we only take the first instance if multiple columns are about sex
session.hasSex = models.Sex(identifier=_sex_vals[0])

if "diagnosis" in column_mapping.keys():
_dx_vals = putil.get_transformed_values(
_dx_vals = pheno_utils.get_transformed_values(
column_mapping["diagnosis"], _ses_pheno, data_dictionary
)
if not _dx_vals:
Expand All @@ -166,7 +162,7 @@ def pheno(
]

if "age" in column_mapping.keys():
_age_vals = putil.get_transformed_values(
_age_vals = pheno_utils.get_transformed_values(
column_mapping["age"], _ses_pheno, data_dictionary
)
if _age_vals:
Expand All @@ -177,7 +173,7 @@ def pheno(
_assessments = [
models.Assessment(identifier=tool)
for tool, columns in tool_mapping.items()
if putil.are_any_available(
if pheno_utils.are_any_available(
columns, _ses_pheno, data_dictionary
)
]
Expand All @@ -197,16 +193,10 @@ def pheno(
hasSamples=subject_list,
)

context = generate_context()
# We can't just exclude_unset here because the identifier and schemaKey
# for each instance are created as default values and so technically are never set
# TODO: we should revisit this because there may be reasons to have None be meaningful in the future
context.update(**dataset.dict(exclude_none=True))

with open(output, "w") as f:
f.write(json.dumps(context, indent=2))

print(f"Saved output to: {output}")
file_utils.save_jsonld(
data=model_utils.add_context_to_graph_dataset(dataset),
filename=output,
)


@bagel.command()
Expand Down Expand Up @@ -258,7 +248,7 @@ def bids(
graph data model for the combined metadata in the .jsonld format.
You can upload this .jsonld file to the Neurobagel graph.
"""
futil.check_overwrite(output, overwrite)
file_utils.check_overwrite(output, overwrite)

space = 51
print(
Expand All @@ -267,13 +257,15 @@ def bids(
f" {'BIDS dataset directory:' : <{space}} {bids_dir}"
)

jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset(
jsonld_path
)

existing_subs_dict = get_subject_instances(jsonld_dataset)
existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset)

# TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved
confirm_subs_match_pheno_data(
subjects=butil.get_bids_subjects_simple(bids_dir),
model_utils.confirm_subs_match_pheno_data(
subjects=bids_utils.get_bids_subjects_simple(bids_dir),
subject_source_for_err="BIDS directory",
pheno_subjects=existing_subs_dict.keys(),
)
Expand All @@ -287,7 +279,7 @@ def bids(
print("Merging BIDS metadata with existing subject annotations...\n")
for bids_sub_id in layout.get_subjects():
existing_subject = existing_subs_dict.get(f"sub-{bids_sub_id}")
existing_sessions_dict = get_imaging_session_instances(
existing_sessions_dict = model_utils.get_imaging_session_instances(
existing_subject
)

Expand All @@ -300,7 +292,7 @@ def bids(
# For some reason .get_sessions() doesn't always follow alphanumeric order
# By default (without sorting) the session lists look like ["02", "01"] per subject
for session_id in sorted(bids_sessions):
image_list = butil.create_acquisitions(
image_list = bids_utils.create_acquisitions(
layout=layout,
bids_sub_id=bids_sub_id,
session=session_id,
Expand All @@ -321,7 +313,7 @@ def bids(
if session_id is None
else f"ses-{session_id}"
)
session_path = butil.get_session_path(
session_path = bids_utils.get_session_path(
layout=layout,
bids_dir=bids_dir,
bids_sub_id=bids_sub_id,
Expand All @@ -344,13 +336,10 @@ def bids(
)
existing_subject.hasSession.append(new_imaging_session)

context = generate_context()
merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}

with open(output, "w") as f:
f.write(json.dumps(merged_dataset, indent=2))

print(f"Saved output to: {output}")
file_utils.save_jsonld(
data=model_utils.add_context_to_graph_dataset(jsonld_dataset),
filename=output,
)


@bagel.command()
Expand Down Expand Up @@ -402,7 +391,7 @@ def derivatives(
graph data model for the combined metadata in the .jsonld format.
You can upload this .jsonld file to the Neurobagel graph.
"""
futil.check_overwrite(output, overwrite)
file_utils.check_overwrite(output, overwrite)

space = 51
print(
Expand All @@ -411,10 +400,12 @@ def derivatives(
f" {'Processing status file (.tsv):' : <{space}}{tabular}"
)

status_df = futil.load_tabular(tabular, input_type="processing status")
status_df = file_utils.load_tabular(
tabular, input_type="processing status"
)

# We don't allow empty values in the participant ID column
if row_indices := putil.get_rows_with_empty_strings(
if row_indices := pheno_utils.get_rows_with_empty_strings(
status_df, [PROC_STATUS_COLS["participant"]]
):
raise LookupError(
Expand All @@ -424,21 +415,25 @@ def derivatives(
)

pipelines = status_df[PROC_STATUS_COLS["pipeline_name"]].unique()
dutil.check_pipelines_are_recognized(pipelines)
derivative_utils.check_pipelines_are_recognized(pipelines)

# TODO: Do we need to check all versions across all pipelines first, and report all unrecognized versions together?
for pipeline in pipelines:
versions = status_df[
status_df[PROC_STATUS_COLS["pipeline_name"]] == pipeline
][PROC_STATUS_COLS["pipeline_version"]].unique()

dutil.check_pipeline_versions_are_recognized(pipeline, versions)
derivative_utils.check_pipeline_versions_are_recognized(
pipeline, versions
)

jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset(
jsonld_path
)

existing_subs_dict = get_subject_instances(jsonld_dataset)
existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset)

confirm_subs_match_pheno_data(
model_utils.confirm_subs_match_pheno_data(
subjects=status_df[PROC_STATUS_COLS["participant"]].unique(),
subject_source_for_err="processing status file",
pheno_subjects=existing_subs_dict.keys(),
Expand All @@ -451,14 +446,14 @@ def derivatives(
existing_subject = existing_subs_dict.get(subject)

# Note: Dictionary of existing imaging sessions can be empty if only bagel pheno was run
existing_sessions_dict = get_imaging_session_instances(
existing_sessions_dict = model_utils.get_imaging_session_instances(
existing_subject
)

for session_label, sub_ses_proc_df in sub_proc_df.groupby(
PROC_STATUS_COLS["session"]
):
completed_pipelines = dutil.create_completed_pipelines(
completed_pipelines = derivative_utils.create_completed_pipelines(
sub_ses_proc_df
)

Expand All @@ -480,10 +475,7 @@ def derivatives(
)
existing_subject.hasSession.append(new_img_session)

context = generate_context()
merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}

with open(output, "w") as f:
f.write(json.dumps(merged_dataset, indent=2))

print(f"Saved output to: {output}")
file_utils.save_jsonld(
data=model_utils.add_context_to_graph_dataset(jsonld_dataset),
filename=output,
)
6 changes: 3 additions & 3 deletions bagel/mappings.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from collections import namedtuple
from pathlib import Path

import bagel.file_utils as futil
from .utilities import file_utils

Namespace = namedtuple("Namespace", ["pf", "url"])
COGATLAS = Namespace("cogatlas", "https://www.cognitiveatlas.org/task/id/")
Expand Down Expand Up @@ -51,7 +51,7 @@ def get_pipeline_uris() -> dict:
"""
output_dict = {}
for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"):
pipe = futil.load_json(pipe_file)
pipe = file_utils.load_json(pipe_file)
output_dict[pipe["name"]] = f"{NP.pf}:{pipe['name']}"

return output_dict
Expand All @@ -64,7 +64,7 @@ def get_pipeline_versions() -> dict:
"""
output_dict = {}
for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"):
pipe = futil.load_json(pipe_file)
pipe = file_utils.load_json(pipe_file)
output_dict[pipe["name"]] = pipe["versions"]

return output_dict
Expand Down
Empty file added bagel/utilities/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
6 changes: 6 additions & 0 deletions bagel/file_utils.py → bagel/utilities/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,9 @@ def check_overwrite(output: Path, overwrite: bool):
fg=typer.colors.RED,
)
)


def save_jsonld(data: dict, filename: Path):
    """Serialize a JSONLD dictionary to disk and report the saved path.

    Parameters
    ----------
    data : dict
        JSONLD contents (context + dataset) to write out.
    filename : Path
        Destination path; overwritten if it already exists.
    """
    with open(filename, "w") as f:
        f.write(json.dumps(data, indent=2))
    # Interpolate the actual path (the previous f-string had no placeholder
    # and printed a literal instead of the output location).
    print(f"Saved output to: {filename}")
13 changes: 11 additions & 2 deletions bagel/utility.py → bagel/utilities/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
import typer
from pydantic import ValidationError

import bagel.file_utils as futil
from bagel import models
from bagel.mappings import ALL_NAMESPACES, NB
from bagel.utilities import file_utils


def generate_context():
Expand Down Expand Up @@ -35,6 +35,15 @@ def generate_context():
return {"@context": field_preamble}


def add_context_to_graph_dataset(dataset: models.Dataset) -> dict:
    """Embed a graph-ready dataset in the Neurobagel context, yielding a JSONLD dict."""
    jsonld = dict(generate_context())
    # exclude_unset would not work here: identifier and schemaKey are created
    # as default values on each instance, so they are technically never "set".
    # TODO: revisit — there may be reasons for None to be meaningful later
    jsonld.update(dataset.dict(exclude_none=True))
    return jsonld


def get_subs_missing_from_pheno_data(
subjects: Iterable, pheno_subjects: Iterable
) -> list:
Expand Down Expand Up @@ -68,7 +77,7 @@ def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset:
Strip the context from a user-provided JSONLD and validate the remaining contents
against the data model for a Neurobagel dataset.
"""
jsonld = futil.load_json(file_path)
jsonld = file_utils.load_json(file_path)
jsonld.pop("@context")
try:
jsonld_dataset = models.Dataset.parse_obj(jsonld)
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 3f33239

Please sign in to comment.