diff --git a/digest/app.py b/digest/app.py index c3a5b77..a48a194 100644 --- a/digest/app.py +++ b/digest/app.py @@ -11,7 +11,7 @@ from . import plotting as plot from . import utility as util from .layout import DEFAULT_DATASET_NAME, construct_layout, upload_buttons -from .utility import PRIMARY_SESSION +from .utility import PRIMARY_SESSION_COL EMPTY_FIGURE_PROPS = {"data": [], "layout": {}, "frames": []} @@ -158,8 +158,8 @@ def process_bagel(upload_contents, available_digest_nclicks, filenames): # Another side effect of allowing NaN sessions is that if this column has integer values, they will be read in as floats # (before being converted to str) if there are NaNs in the column. # This should not be a problem after we disallow NaNs value in "participant_id" and "session_id" columns, https://github.com/neurobagel/digest/issues/20 - bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) - session_list = bagel[PRIMARY_SESSION].unique().tolist() + bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str) + session_list = bagel[PRIMARY_SESSION_COL].unique().tolist() overview_df = util.get_pipelines_overview( bagel=bagel, schema=schema @@ -555,7 +555,7 @@ def plot_phenotypic_column( data_to_plot = virtual_data if session_switch_value: - color = PRIMARY_SESSION + color = PRIMARY_SESSION_COL else: color = None diff --git a/digest/plotting.py b/digest/plotting.py index 5b306f7..a122727 100644 --- a/digest/plotting.py +++ b/digest/plotting.py @@ -7,7 +7,7 @@ import plotly.graph_objects as go from . import utility as util -from .utility import PRIMARY_SESSION +from .utility import PRIMARY_SESSION_COL CMAP = px.colors.qualitative.Bold STATUS_COLORS = { @@ -61,28 +61,28 @@ def plot_pipeline_status_by_participants( ) -> go.Figure: status_counts = ( transform_active_data_to_long(data) - .groupby(["pipeline_name", "status", PRIMARY_SESSION]) + .groupby(["pipeline_name", "status", PRIMARY_SESSION_COL]) .size() .reset_index(name="participants") ) fig = px.bar( status_counts, - x=PRIMARY_SESSION, + x=PRIMARY_SESSION_COL, y="participants", color="status", text_auto=True, facet_col="pipeline_name", category_orders={ "status": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(), - PRIMARY_SESSION: session_list, + PRIMARY_SESSION_COL: session_list, }, color_discrete_map=STATUS_COLORS, labels={ "pipeline_name": "Pipeline", "participants": "Participants (n)", "status": "Processing status", - PRIMARY_SESSION: "Session", + PRIMARY_SESSION_COL: "Session", }, title="All participant pipeline statuses by session", ) diff --git a/digest/utility.py b/digest/utility.py index ec32964..d548999 100644 --- a/digest/utility.py +++ b/digest/utility.py @@ -26,7 +26,7 @@ "UNAVAILABLE": "Relevant MRI modality for pipeline not available.", } # Column to use as the primary session identifier in the data -PRIMARY_SESSION = "session_id" +PRIMARY_SESSION_COL = "session_id" # TODO: # Could also use URLs for "imaging" or "phenotypic" locations if fetching from a remote repo doesn't slow things down too much. @@ -62,7 +62,9 @@ def reset_column_dtypes(data: pd.DataFrame) -> pd.DataFrame: stream.close() # Just in case, convert session labels back to strings (will avoid sessions being undesirably treated as continuous data in e.g., plots) - data_retyped[PRIMARY_SESSION] = data_retyped[PRIMARY_SESSION].astype(str) + data_retyped[PRIMARY_SESSION_COL] = data_retyped[ + PRIMARY_SESSION_COL + ].astype(str) return data_retyped @@ -94,7 +96,7 @@ def construct_summary_str(data: pd.DataFrame) -> str: """Creates summary of key counts for dataset.""" return f"""Total number of participants: {count_unique_subjects(data)} Total number of unique records (participant-session pairs): {count_unique_records(data)} -Total number of unique sessions: {data[PRIMARY_SESSION].nunique()}""" +Total number of unique sessions: {data[PRIMARY_SESSION_COL].nunique()}""" def get_required_bagel_columns(schema_file: str) -> list: @@ -204,9 +206,9 @@ def count_unique_subjects(data: pd.DataFrame) -> int: def count_unique_records(data: pd.DataFrame) -> int: """Returns number of unique participant-session pairs.""" - if set(["participant_id", PRIMARY_SESSION]).issubset(data.columns): + if set(["participant_id", PRIMARY_SESSION_COL]).issubset(data.columns): return ( - data[["participant_id", PRIMARY_SESSION]] + data[["participant_id", PRIMARY_SESSION_COL]] .drop_duplicates() .shape[0] ) @@ -248,7 +250,8 @@ def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame: # NOTE: .reindex only works correctly when there are no NaN values in the index level # (Here, the entire "session_id" column should have already been cast to a string) pipeline_complete_df.reindex( - index=bagel[PRIMARY_SESSION].unique(), level=PRIMARY_SESSION + index=bagel[PRIMARY_SESSION_COL].unique(), + level=PRIMARY_SESSION_COL, ) .reindex(col_order, axis=1) # reorder assessments/pipelines if needed .reset_index() @@ -346,24 +349,25 @@ def filter_records( matching_subs = [] for sub_id, sub in data.groupby("participant_id"): if all( - session in sub[PRIMARY_SESSION].unique() + session in sub[PRIMARY_SESSION_COL].unique() for session in session_values ): if all( not sub.query( " and ".join( - [f"{PRIMARY_SESSION} == '{session}'"] + [f"{PRIMARY_SESSION_COL} == '{session}'"] + pipeline_queries ) ).empty for session in session_values ): matching_subs.append(sub_id) - query = f"participant_id in {matching_subs} and {PRIMARY_SESSION} in {session_values}" + query = f"participant_id in {matching_subs} and {PRIMARY_SESSION_COL} in {session_values}" else: if operator_value == "OR": query = " and ".join( - [f"{PRIMARY_SESSION} in {session_values}"] + pipeline_queries + [f"{PRIMARY_SESSION_COL} in {session_values}"] + + pipeline_queries ) data = data.query(query) diff --git a/tests/test_utility.py b/tests/test_utility.py index 31a1edc..5e5e673 100644 --- a/tests/test_utility.py +++ b/tests/test_utility.py @@ -4,7 +4,7 @@ import digest.plotting as plot import digest.utility as util -from digest.utility import PRIMARY_SESSION +from digest.utility import PRIMARY_SESSION_COL @pytest.mark.parametrize( @@ -166,7 +166,7 @@ def test_get_pipelines_overview( after reshaping data into a wide format. """ bagel = pd.read_csv(bagels_path / bagel_path, sep="\t") - bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) + bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str) overview_df = util.get_pipelines_overview(bagel=bagel, schema=schema) assert overview_df.columns.tolist() == expected_columns @@ -237,7 +237,7 @@ def test_get_pipelines_overview_handles_nan_correctly( bagel, expected_overview_df ): """Test that get_pipelines_overview() handles NaN values in the original long-format data as expected.""" - bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str) + bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str) overview_df = util.get_pipelines_overview(bagel=bagel, schema="phenotypic") assert overview_df.equals(expected_overview_df), overview_df