Skip to content

Commit

Permalink
Merge pull request #194 from AI-SDC/adding_mode
Browse files Browse the repository at this point in the history
adding mode as aggfunc
  • Loading branch information
jim-smith authored Feb 20, 2024
2 parents 4ccf5fb + b8faef1 commit 4c137fc
Show file tree
Hide file tree
Showing 6 changed files with 372 additions and 280 deletions.
9 changes: 6 additions & 3 deletions acro/acro_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def olsr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("olsr()")
command: str = utils.get_command("olsr()", stack())
Expand All @@ -122,7 +123,7 @@ def olsr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down Expand Up @@ -227,6 +228,7 @@ def logitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("logitr()")
command: str = utils.get_command("logitr()", stack())
Expand All @@ -235,7 +237,7 @@ def logitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down Expand Up @@ -340,6 +342,7 @@ def probitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("probitr()")
command: str = utils.get_command("probitr()", stack())
Expand All @@ -348,7 +351,7 @@ def probitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down
159 changes: 112 additions & 47 deletions acro/acro_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import logging
import os
import secrets
from collections.abc import Callable
from inspect import stack

Expand All @@ -20,12 +21,30 @@
logger = logging.getLogger("acro")


AGGFUNC: dict[str, str] = {
def mode_aggfunc(values: Series):
    """Return the mode of a pandas Series.

    If the series is multimodal, one of the modes is selected uniformly at
    random. ``secrets.choice`` is used (rather than ``random``) so the
    selection is not seedable/reproducible, which limits an attacker's
    ability to probe the randomisation.

    Parameters
    ----------
    values : Series
        A pandas Series for which to calculate the mode.

    Returns
    -------
    Any
        A single mode value (a scalar element of the series, not a Series).
        If there are multiple modes, one of them is randomly selected and
        returned.
    """
    # Series.mode() returns all modal values as a Series; pick one of them.
    modes = values.mode()
    return secrets.choice(modes)


# Mapping from user-facing aggregation names to what is passed to pandas as
# ``aggfunc``: either a pandas-recognised string or a callable ("mode" is
# handled by the local mode_aggfunc callable since pandas has no string alias).
AGGFUNC: dict[str, str | Callable] = {
    "mean": "mean",
    "median": "median",
    "sum": "sum",
    "std": "std",
    "count": "count",
    "mode": mode_aggfunc,
}

# aggregation function parameters
Expand Down Expand Up @@ -137,9 +156,11 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals
dropna,
normalize,
)
# delete empty rows and columns from table
table, comments = delete_empty_rows_columns(table)

comments: list[str] = []
# do not delete empty rows and columns from table if the aggfunc is mode
if agg_func is not mode_aggfunc:
# delete empty rows and columns from table
table, comments = delete_empty_rows_columns(table)
masks = create_crosstab_masks(
index,
columns,
Expand Down Expand Up @@ -742,7 +763,7 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals

if agg_func is not None:
# create lists with single entry for when there is only one aggfunc
count_funcs: list[str] = [AGGFUNC["count"]]
count_funcs: list[str | Callable] = [AGGFUNC["count"]]
neg_funcs: list[Callable] = [agg_negative]
pperc_funcs: list[Callable] = [agg_p_percent]
nk_funcs: list[Callable] = [agg_nk]
Expand All @@ -756,49 +777,67 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals
nk_funcs.extend([agg_nk for i in range(1, num)])
missing_funcs.extend([agg_missing for i in range(1, num)])
# threshold check- doesn't matter what we pass for value
if agg_func is mode_aggfunc:
# check that all observations dont have the same value
logger.info(
"If there are multiple modes, one of them is randomly selected and displayed."
)
masks["all-values-are-same"] = pd.crosstab( # type: ignore
index,
columns,
values,
aggfunc=agg_values_are_same,
margins=margins,
dropna=dropna,
)
else:
t_values = pd.crosstab( # type: ignore
index,
columns,
values=values,
rownames=rownames,
colnames=colnames,
aggfunc=count_funcs,
margins=margins,
margins_name=margins_name,
dropna=dropna,
normalize=normalize,
)

t_values = pd.crosstab( # type: ignore
index,
columns,
values=values,
rownames=rownames,
colnames=colnames,
aggfunc=count_funcs,
margins=margins,
margins_name=margins_name,
dropna=dropna,
normalize=normalize,
)

# drop empty columns and rows
if dropna or margins:
empty_cols_mask = t_values.sum(axis=0) == 0
empty_rows_mask = t_values.sum(axis=1) == 0
# drop empty columns and rows
if dropna or margins:
empty_cols_mask = t_values.sum(axis=0) == 0
empty_rows_mask = t_values.sum(axis=1) == 0

t_values = t_values.loc[:, ~empty_cols_mask]
t_values = t_values.loc[~empty_rows_mask, :]
t_values = t_values.loc[:, ~empty_cols_mask]
t_values = t_values.loc[~empty_rows_mask, :]

t_values = t_values < THRESHOLD
masks["threshold"] = t_values
# check for negative values -- currently unsupported
negative = pd.crosstab( # type: ignore
index, columns, values, aggfunc=neg_funcs, margins=margins
)
if negative.to_numpy().sum() > 0:
masks["negative"] = negative
# p-percent check
masks["p-ratio"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=pperc_funcs, margins=margins, dropna=dropna
)
# nk values check
masks["nk-rule"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=nk_funcs, margins=margins, dropna=dropna
)
# check for missing values -- currently unsupported
if CHECK_MISSING_VALUES:
masks["missing"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=missing_funcs, margins=margins
t_values = t_values < THRESHOLD
masks["threshold"] = t_values
# check for negative values -- currently unsupported
negative = pd.crosstab( # type: ignore
index, columns, values, aggfunc=neg_funcs, margins=margins
)
if negative.to_numpy().sum() > 0:
masks["negative"] = negative
# p-percent check
masks["p-ratio"] = pd.crosstab( # type: ignore
index,
columns,
values,
aggfunc=pperc_funcs,
margins=margins,
dropna=dropna,
)
# nk values check
masks["nk-rule"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=nk_funcs, margins=margins, dropna=dropna
)
# check for missing values -- currently unsupported
if CHECK_MISSING_VALUES:
masks["missing"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=missing_funcs, margins=margins
)
else:
# threshold check- doesn't matter what we pass for value
t_values = pd.crosstab( # type: ignore
Expand Down Expand Up @@ -908,7 +947,7 @@ def rounded_survival_table(survival_table):
return survival_table


def get_aggfunc(aggfunc: str | None) -> str | None:
def get_aggfunc(aggfunc: str | None) -> str | Callable | None:
"""Checks whether an aggregation function is allowed and returns the
appropriate function.
Expand Down Expand Up @@ -940,7 +979,7 @@ def get_aggfunc(aggfunc: str | None) -> str | None:

def get_aggfuncs(
aggfuncs: str | list[str] | None,
) -> str | list[str] | None:
) -> str | Callable | list[str | Callable] | None:
"""Checks whether a list of aggregation functions is allowed and returns
the appropriate functions.
Expand All @@ -963,7 +1002,7 @@ def get_aggfuncs(
logger.debug("aggfuncs: %s", function)
return function
if isinstance(aggfuncs, list):
functions: list[str] = []
functions: list[str | Callable] = []
for function_name in aggfuncs:
function = get_aggfunc(function_name)
if function is not None:
Expand Down Expand Up @@ -1076,6 +1115,24 @@ def agg_threshold(vals: Series) -> bool:
return vals.count() < THRESHOLD


def agg_values_are_same(vals: Series) -> bool:
    """Aggregation function that returns whether all observations have
    the same value.

    Parameters
    ----------
    vals : Series
        Series to check for a single unique value.

    Returns
    -------
    bool
        True if every non-NA observation shares one value; False otherwise
        (including when the series contains no non-NA values, since
        ``nunique`` then returns 0).
    """
    # exactly one distinct non-NA value means all observations are the same
    return vals.nunique(dropna=True) == 1


def apply_suppression(
table: DataFrame, masks: dict[str, DataFrame]
) -> tuple[DataFrame, DataFrame]:
Expand Down Expand Up @@ -1147,6 +1204,7 @@ def get_table_sdc(masks: dict[str, DataFrame], suppress: bool) -> dict:
sdc["summary"]["threshold"] = 0
sdc["summary"]["p-ratio"] = 0
sdc["summary"]["nk-rule"] = 0
sdc["summary"]["all-values-are-same"] = 0
for name, mask in masks.items():
sdc["summary"][name] = int(np.nansum(mask.to_numpy()))
# positions of cells to be suppressed
Expand All @@ -1155,6 +1213,7 @@ def get_table_sdc(masks: dict[str, DataFrame], suppress: bool) -> dict:
sdc["cells"]["threshold"] = []
sdc["cells"]["p-ratio"] = []
sdc["cells"]["nk-rule"] = []
sdc["cells"]["all-values-are-same"] = []
for name, mask in masks.items():
true_positions = np.column_stack(np.where(mask.values))
for pos in true_positions:
Expand Down Expand Up @@ -1198,6 +1257,12 @@ def get_summary(sdc: dict) -> tuple[str, str]:
if sdc_summary["nk-rule"] > 0:
summary += f"nk-rule: {sdc_summary['nk-rule']} cells {sup}; "
status = "fail"
if sdc_summary["all-values-are-same"] > 0:
summary += (
f"all-values-are-same: {sdc_summary['all-values-are-same']} "
f"cells {sup}; "
)
status = "fail"
if summary != "":
summary = f"{status}; {summary}"
else:
Expand Down
Binary file added data/nursery_dataset.dta
Binary file not shown.
Loading

0 comments on commit 4c137fc

Please sign in to comment.