Skip to content

Commit

Permalink
rename PRIMARY_SESSION var
Browse files Browse the repository at this point in the history
  • Loading branch information
alyssadai committed Dec 13, 2024
1 parent e622585 commit 08a5812
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 22 deletions.
8 changes: 4 additions & 4 deletions digest/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from . import plotting as plot
from . import utility as util
from .layout import DEFAULT_DATASET_NAME, construct_layout, upload_buttons
from .utility import PRIMARY_SESSION
from .utility import PRIMARY_SESSION_COL

EMPTY_FIGURE_PROPS = {"data": [], "layout": {}, "frames": []}

Expand Down Expand Up @@ -158,8 +158,8 @@ def process_bagel(upload_contents, available_digest_nclicks, filenames):
# Another side effect of allowing NaN sessions is that if this column has integer values, they will be read in as floats
# (before being converted to str) if there are NaNs in the column.
# This should not be a problem after we disallow NaN values in "participant_id" and "session_id" columns, https://github.com/neurobagel/digest/issues/20
bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str)
session_list = bagel[PRIMARY_SESSION].unique().tolist()
bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str)
session_list = bagel[PRIMARY_SESSION_COL].unique().tolist()

overview_df = util.get_pipelines_overview(
bagel=bagel, schema=schema
Expand Down Expand Up @@ -555,7 +555,7 @@ def plot_phenotypic_column(
data_to_plot = virtual_data

if session_switch_value:
color = PRIMARY_SESSION
color = PRIMARY_SESSION_COL
else:
color = None

Expand Down
10 changes: 5 additions & 5 deletions digest/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import plotly.graph_objects as go

from . import utility as util
from .utility import PRIMARY_SESSION
from .utility import PRIMARY_SESSION_COL

CMAP = px.colors.qualitative.Bold
STATUS_COLORS = {
Expand Down Expand Up @@ -61,28 +61,28 @@ def plot_pipeline_status_by_participants(
) -> go.Figure:
status_counts = (
transform_active_data_to_long(data)
.groupby(["pipeline_name", "status", PRIMARY_SESSION])
.groupby(["pipeline_name", "status", PRIMARY_SESSION_COL])
.size()
.reset_index(name="participants")
)

fig = px.bar(
status_counts,
x=PRIMARY_SESSION,
x=PRIMARY_SESSION_COL,
y="participants",
color="status",
text_auto=True,
facet_col="pipeline_name",
category_orders={
"status": util.PIPE_COMPLETE_STATUS_SHORT_DESC.keys(),
PRIMARY_SESSION: session_list,
PRIMARY_SESSION_COL: session_list,
},
color_discrete_map=STATUS_COLORS,
labels={
"pipeline_name": "Pipeline",
"participants": "Participants (n)",
"status": "Processing status",
PRIMARY_SESSION: "Session",
PRIMARY_SESSION_COL: "Session",
},
title="All participant pipeline statuses by session",
)
Expand Down
24 changes: 14 additions & 10 deletions digest/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"UNAVAILABLE": "Relevant MRI modality for pipeline not available.",
}
# Column to use as the primary session identifier in the data
PRIMARY_SESSION = "session_id"
PRIMARY_SESSION_COL = "session_id"

# TODO:
# Could also use URLs for "imaging" or "phenotypic" locations if fetching from a remote repo doesn't slow things down too much.
Expand Down Expand Up @@ -62,7 +62,9 @@ def reset_column_dtypes(data: pd.DataFrame) -> pd.DataFrame:
stream.close()

# Just in case, convert session labels back to strings (will avoid sessions being undesirably treated as continuous data in e.g., plots)
data_retyped[PRIMARY_SESSION] = data_retyped[PRIMARY_SESSION].astype(str)
data_retyped[PRIMARY_SESSION_COL] = data_retyped[
PRIMARY_SESSION_COL
].astype(str)
return data_retyped


Expand Down Expand Up @@ -94,7 +96,7 @@ def construct_summary_str(data: pd.DataFrame) -> str:
"""Creates summary of key counts for dataset."""
return f"""Total number of participants: {count_unique_subjects(data)}
Total number of unique records (participant-session pairs): {count_unique_records(data)}
Total number of unique sessions: {data[PRIMARY_SESSION].nunique()}"""
Total number of unique sessions: {data[PRIMARY_SESSION_COL].nunique()}"""


def get_required_bagel_columns(schema_file: str) -> list:
Expand Down Expand Up @@ -204,9 +206,9 @@ def count_unique_subjects(data: pd.DataFrame) -> int:

def count_unique_records(data: pd.DataFrame) -> int:
"""Returns number of unique participant-session pairs."""
if set(["participant_id", PRIMARY_SESSION]).issubset(data.columns):
if set(["participant_id", PRIMARY_SESSION_COL]).issubset(data.columns):
return (
data[["participant_id", PRIMARY_SESSION]]
data[["participant_id", PRIMARY_SESSION_COL]]
.drop_duplicates()
.shape[0]
)
Expand Down Expand Up @@ -248,7 +250,8 @@ def get_pipelines_overview(bagel: pd.DataFrame, schema: str) -> pd.DataFrame:
# NOTE: .reindex only works correctly when there are no NaN values in the index level
# (Here, the entire "session_id" column should have already been cast to a string)
pipeline_complete_df.reindex(
index=bagel[PRIMARY_SESSION].unique(), level=PRIMARY_SESSION
index=bagel[PRIMARY_SESSION_COL].unique(),
level=PRIMARY_SESSION_COL,
)
.reindex(col_order, axis=1) # reorder assessments/pipelines if needed
.reset_index()
Expand Down Expand Up @@ -346,24 +349,25 @@ def filter_records(
matching_subs = []
for sub_id, sub in data.groupby("participant_id"):
if all(
session in sub[PRIMARY_SESSION].unique()
session in sub[PRIMARY_SESSION_COL].unique()
for session in session_values
):
if all(
not sub.query(
" and ".join(
[f"{PRIMARY_SESSION} == '{session}'"]
[f"{PRIMARY_SESSION_COL} == '{session}'"]
+ pipeline_queries
)
).empty
for session in session_values
):
matching_subs.append(sub_id)
query = f"participant_id in {matching_subs} and {PRIMARY_SESSION} in {session_values}"
query = f"participant_id in {matching_subs} and {PRIMARY_SESSION_COL} in {session_values}"
else:
if operator_value == "OR":
query = " and ".join(
[f"{PRIMARY_SESSION} in {session_values}"] + pipeline_queries
[f"{PRIMARY_SESSION_COL} in {session_values}"]
+ pipeline_queries
)

data = data.query(query)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import digest.plotting as plot
import digest.utility as util
from digest.utility import PRIMARY_SESSION
from digest.utility import PRIMARY_SESSION_COL


@pytest.mark.parametrize(
Expand Down Expand Up @@ -166,7 +166,7 @@ def test_get_pipelines_overview(
after reshaping data into a wide format.
"""
bagel = pd.read_csv(bagels_path / bagel_path, sep="\t")
bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str)
bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str)
overview_df = util.get_pipelines_overview(bagel=bagel, schema=schema)

assert overview_df.columns.tolist() == expected_columns
Expand Down Expand Up @@ -237,7 +237,7 @@ def test_get_pipelines_overview_handles_nan_correctly(
bagel, expected_overview_df
):
"""Test that get_pipelines_overview() handles NaN values in the original long-format data as expected."""
bagel[PRIMARY_SESSION] = bagel[PRIMARY_SESSION].astype(str)
bagel[PRIMARY_SESSION_COL] = bagel[PRIMARY_SESSION_COL].astype(str)
overview_df = util.get_pipelines_overview(bagel=bagel, schema="phenotypic")

assert overview_df.equals(expected_overview_df), overview_df
Expand Down

0 comments on commit 08a5812

Please sign in to comment.