Skip to content

Commit

Permalink
rename PRIMARY_SESSION var
Browse files Browse the repository at this point in the history
  • Loading branch information
alyssadai committed Dec 13, 2024
1 parent e622585 commit 08a5812
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 22 deletions.
8 changes: 4 additions & 4 deletions digest/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from . import plotting as plot
from . import utility as util
from .layout import DEFAULT_DATASET_NAME, construct_layout, upload_buttons
from .utility import PRIMARY_SESSION
from .utility import PRIMARY_SESSION_COL

EMPTY_FIGURE_PROPS = {"data": [], "layout": {}, "frames": []}

Expand Down Expand Up @@ -158,8 +158,8 @@ def process_bagel(upload_contents, available_digest_nclicks, filenames):
# Another side effect of allowing NaN sessions is that if this column has integer values, they will be read in as floats
# (before being converted to str) if there are NaNs in the column.
# This should not be a problem after we disallow NaN values in "participant_id" and "session_id" columns, https://github.com/neurobagel/digest/issues/20
bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str)
session_list = bagel[PRIMARY_SESSION].unique().tolist()
bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str)
session_list = bagel[PRIMARY_SESSION_COL].unique().tolist()

overview_df = util.get_pipelines_overview(
bagel=bagel, schema=schema
Expand Down Expand Up @@ -555,7 +555,7 @@ def plot_phenotypic_column(
data_to_plot = virtual_data

if session_switch_value:
color = PRIMARY_SESSION
color = PRIMARY_SESSION_COL
else:
color = None

Expand Down
10 changes: 5 additions & 5 deletions digest/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import plotly.graph_objects as go

from . import utility as util
from .utility import PRIMARY_SESSION
from .utility import PRIMARY_SESSION_COL

CMAP = px.colors.qualitative.Bold
STATUS_COLORS = {
Expand Down Expand Up @@ -61,28 +61,28 @@ def plot_pipeline_status_by_participants(
) -> go.Figure:
status_counts = (
transform_active_data_to_long(data)
.groupby(["pipeline_name", "status", PRIMARY_SESSION])
.groupby(["pipeline_name", "status", PRIMARY_SESSION_COL])
.size()
.reset_index(name="participants")
)

fig = px.bar(
status_counts,
x=PRIMARY_SESSION,
x=PRIMARY_SESSION_COL,
y="participants",
color="status",
text_auto=True,
facet_col="pipeline_name",
category_orders={
"status": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(),
PRIMARY_SESSION: session_list,
PRIMARY_SESSION_COL: session_list,
},
color_discrete_map=STATUS_COLORS,
labels={
"pipeline_name": "Pipeline",
"participants": "Participants (n)",
"status": "Processing status",
PRIMARY_SESSION: "Session",
PRIMARY_SESSION_COL: "Session",
},
title="All participant pipeline statuses by session",
)
Expand Down
24 changes: 14 additions & 10 deletions digest/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"UNAVAILABLE": "Relevant MRI modality for pipeline not available.",
}
# Column to use as the primary session identifier in the data
PRIMARY_SESSION = "session_id"
PRIMARY_SESSION_COL = "session_id"

# TODO:
# Could also use URLs for "imaging" or "phenotypic" locations if fetching from a remote repo doesn't slow things down too much.
Expand Down Expand Up @@ -62,7 +62,9 @@ def reset_column_dtypes(data: pd.DataFrame) -> pd.DataFrame:
stream.close()

# Just in case, convert session labels back to strings (will avoid sessions being undesirably treated as continuous data in e.g., plots)
data_retyped[PRIMARY_SESSION] = data_retyped[PRIMARY_SESSION].astype(str)
data_retyped[PRIMARY_SESSION_COL] = data_retyped[
PRIMARY_SESSION_COL
].astype(str)
return data_retyped


Expand Down Expand Up @@ -94,7 +96,7 @@ def construct_summary_str(data: pd.DataFrame) -> str:
"""Creates summary of key counts for dataset."""
return f"""Total number of participants: {count_unique_subjects(data)}
Total number of unique records (participant-session pairs): {count_unique_records(data)}
Total number of unique sessions: {data[PRIMARY_SESSION].nunique()}"""
Total number of unique sessions: {data[PRIMARY_SESSION_COL].nunique()}"""


def get_required_bagel_columns(schema_file: str) -> list:
Expand Down Expand Up @@ -204,9 +206,9 @@ def count_unique_subjects(data: pd.DataFrame) -> int:

def count_unique_records(data: pd.DataFrame) -> int:
"""Returns number of unique participant-session pairs."""
if set(["participant_id", PRIMARY_SESSION]).issubset(data.columns):
if set(["participant_id", PRIMARY_SESSION_COL]).issubset(data.columns):
return (
data[["participant_id", PRIMARY_SESSION]]
data[["participant_id", PRIMARY_SESSION_COL]]
.drop_duplicates()
.shape[0]
)
Expand Down Expand Up @@ -248,7 +250,8 @@ def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame:
# NOTE: .reindex only works correctly when there are no NaN values in the index level
# (Here, the entire "session_id" column should have already been cast to a string)
pipeline_complete_df.reindex(
index=bagel[PRIMARY_SESSION].unique(), level=PRIMARY_SESSION
index=bagel[PRIMARY_SESSION_COL].unique(),
level=PRIMARY_SESSION_COL,
)
.reindex(col_order, axis=1) # reorder assessments/pipelines if needed
.reset_index()
Expand Down Expand Up @@ -346,24 +349,25 @@ def filter_records(
matching_subs = []
for sub_id, sub in data.groupby("participant_id"):
if all(
session in sub[PRIMARY_SESSION].unique()
session in sub[PRIMARY_SESSION_COL].unique()
for session in session_values
):
if all(
not sub.query(
" and ".join(
[f"{PRIMARY_SESSION} == '{session}'"]
[f"{PRIMARY_SESSION_COL} == '{session}'"]
+ pipeline_queries
)
).empty
for session in session_values
):
matching_subs.append(sub_id)
query = f"participant_id in {matching_subs} and {PRIMARY_SESSION} in {session_values}"
query = f"participant_id in {matching_subs} and {PRIMARY_SESSION_COL} in {session_values}"
else:
if operator_value == "OR":
query = " and ".join(
[f"{PRIMARY_SESSION} in {session_values}"] + pipeline_queries
[f"{PRIMARY_SESSION_COL} in {session_values}"]
+ pipeline_queries
)

data = data.query(query)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import digest.plotting as plot
import digest.utility as util
from digest.utility import PRIMARY_SESSION
from digest.utility import PRIMARY_SESSION_COL


@pytest.mark.parametrize(
Expand Down Expand Up @@ -166,7 +166,7 @@ def test_get_pipelines_overview(
after reshaping data into a wide format.
"""
bagel = pd.read_csv(bagels_path / bagel_path, sep="\t")
bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str)
bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str)
overview_df = util.get_pipelines_overview(bagel=bagel, schema=schema)

assert overview_df.columns.tolist() == expected_columns
Expand Down Expand Up @@ -237,7 +237,7 @@ def test_get_pipelines_overview_handles_nan_correctly(
bagel, expected_overview_df
):
"""Test that get_pipelines_overview() handles NaN values in the original long-format data as expected."""
bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str)
bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str)
overview_df = util.get_pipelines_overview(bagel=bagel, schema="phenotypic")

assert overview_df.equals(expected_overview_df), overview_df
Expand Down

0 comments on commit 08a5812

Please sign in to comment.