Skip to content

Commit

Permalink
Merge pull request #194 from AI-SDC/adding_mode
Browse files Browse the repository at this point in the history
adding mode as aggfunc
  • Loading branch information
jim-smith authored Feb 20, 2024
2 parents 4ccf5fb + b8faef1 commit 4c137fc
Show file tree
Hide file tree
Showing 6 changed files with 372 additions and 280 deletions.
9 changes: 6 additions & 3 deletions acro/acro_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def olsr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("olsr()")
command: str = utils.get_command("olsr()", stack())
Expand All @@ -122,7 +123,7 @@ def olsr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down Expand Up @@ -227,6 +228,7 @@ def logitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("logitr()")
command: str = utils.get_command("logitr()", stack())
Expand All @@ -235,7 +237,7 @@ def logitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down Expand Up @@ -340,6 +342,7 @@ def probitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("probitr()")
command: str = utils.get_command("probitr()", stack())
Expand All @@ -348,7 +351,7 @@ def probitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down
159 changes: 112 additions & 47 deletions acro/acro_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import logging
import os
import secrets
from collections.abc import Callable
from inspect import stack

Expand All @@ -20,12 +21,30 @@
logger = logging.getLogger("acro")


AGGFUNC: dict[str, str] = {
def mode_aggfunc(values: Series):
    """Return the mode of a pandas Series.

    If the series is multimodal, one of the modes is selected uniformly at
    random. ``secrets.choice`` is used (rather than ``random``) so the
    selection is not seedable/reproducible, which limits an attacker's
    ability to probe the randomisation.

    Parameters
    ----------
    values : Series
        A pandas Series for which to calculate the mode.

    Returns
    -------
    Any
        A single mode value (a scalar element of the series, not a Series).
        If there are multiple modes, one of them is randomly selected and
        returned.
    """
    # Series.mode() returns all modal values as a Series; pick one of them.
    modes = values.mode()
    return secrets.choice(modes)


# Mapping from user-facing aggregation names to what is passed to pandas as
# ``aggfunc``: either a pandas-recognised string or a callable ("mode" is
# handled by the local mode_aggfunc callable since pandas has no string alias).
AGGFUNC: dict[str, str | Callable] = {
    "mean": "mean",
    "median": "median",
    "sum": "sum",
    "std": "std",
    "count": "count",
    "mode": mode_aggfunc,
}

# aggregation function parameters
Expand Down Expand Up @@ -137,9 +156,11 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals
dropna,
normalize,
)
# delete empty rows and columns from table
table, comments = delete_empty_rows_columns(table)

comments: list[str] = []
# do not delete empty rows and columns from table if the aggfunc is mode
if agg_func is not mode_aggfunc:
# delete empty rows and columns from table
table, comments = delete_empty_rows_columns(table)
masks = create_crosstab_masks(
index,
columns,
Expand Down Expand Up @@ -742,7 +763,7 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals

if agg_func is not None:
# create lists with single entry for when there is only one aggfunc
count_funcs: list[str] = [AGGFUNC["count"]]
count_funcs: list[str | Callable] = [AGGFUNC["count"]]
neg_funcs: list[Callable] = [agg_negative]
pperc_funcs: list[Callable] = [agg_p_percent]
nk_funcs: list[Callable] = [agg_nk]
Expand All @@ -756,49 +777,67 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals
nk_funcs.extend([agg_nk for i in range(1, num)])
missing_funcs.extend([agg_missing for i in range(1, num)])
# threshold check- doesn't matter what we pass for value
if agg_func is mode_aggfunc:
# check that all observations dont have the same value
logger.info(
"If there are multiple modes, one of them is randomly selected and displayed."
)
masks["all-values-are-same"] = pd.crosstab( # type: ignore
index,
columns,
values,
aggfunc=agg_values_are_same,
margins=margins,
dropna=dropna,
)
else:
t_values = pd.crosstab( # type: ignore
index,
columns,
values=values,
rownames=rownames,
colnames=colnames,
aggfunc=count_funcs,
margins=margins,
margins_name=margins_name,
dropna=dropna,
normalize=normalize,
)

t_values = pd.crosstab( # type: ignore
index,
columns,
values=values,
rownames=rownames,
colnames=colnames,
aggfunc=count_funcs,
margins=margins,
margins_name=margins_name,
dropna=dropna,
normalize=normalize,
)

# drop empty columns and rows
if dropna or margins:
empty_cols_mask = t_values.sum(axis=0) == 0
empty_rows_mask = t_values.sum(axis=1) == 0
# drop empty columns and rows
if dropna or margins:
empty_cols_mask = t_values.sum(axis=0) == 0
empty_rows_mask = t_values.sum(axis=1) == 0

t_values = t_values.loc[:, ~empty_cols_mask]
t_values = t_values.loc[~empty_rows_mask, :]
t_values = t_values.loc[:, ~empty_cols_mask]
t_values = t_values.loc[~empty_rows_mask, :]

t_values = t_values < THRESHOLD
masks["threshold"] = t_values
# check for negative values -- currently unsupported
negative = pd.crosstab( # type: ignore
index, columns, values, aggfunc=neg_funcs, margins=margins
)
if negative.to_numpy().sum() > 0:
masks["negative"] = negative
# p-percent check
masks["p-ratio"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=pperc_funcs, margins=margins, dropna=dropna
)
# nk values check
masks["nk-rule"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=nk_funcs, margins=margins, dropna=dropna
)
# check for missing values -- currently unsupported
if CHECK_MISSING_VALUES:
masks["missing"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=missing_funcs, margins=margins
t_values = t_values < THRESHOLD
masks["threshold"] = t_values
# check for negative values -- currently unsupported
negative = pd.crosstab( # type: ignore
index, columns, values, aggfunc=neg_funcs, margins=margins
)
if negative.to_numpy().sum() > 0:
masks["negative"] = negative
# p-percent check
masks["p-ratio"] = pd.crosstab( # type: ignore
index,
columns,
values,
aggfunc=pperc_funcs,
margins=margins,
dropna=dropna,
)
# nk values check
masks["nk-rule"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=nk_funcs, margins=margins, dropna=dropna
)
# check for missing values -- currently unsupported
if CHECK_MISSING_VALUES:
masks["missing"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=missing_funcs, margins=margins
)
else:
# threshold check- doesn't matter what we pass for value
t_values = pd.crosstab( # type: ignore
Expand Down Expand Up @@ -908,7 +947,7 @@ def rounded_survival_table(survival_table):
return survival_table


def get_aggfunc(aggfunc: str | None) -> str | None:
def get_aggfunc(aggfunc: str | None) -> str | Callable | None:
"""Checks whether an aggregation function is allowed and returns the
appropriate function.
Expand Down Expand Up @@ -940,7 +979,7 @@ def get_aggfunc(aggfunc: str | None) -> str | None:

def get_aggfuncs(
aggfuncs: str | list[str] | None,
) -> str | list[str] | None:
) -> str | Callable | list[str | Callable] | None:
"""Checks whether a list of aggregation functions is allowed and returns
the appropriate functions.
Expand All @@ -963,7 +1002,7 @@ def get_aggfuncs(
logger.debug("aggfuncs: %s", function)
return function
if isinstance(aggfuncs, list):
functions: list[str] = []
functions: list[str | Callable] = []
for function_name in aggfuncs:
function = get_aggfunc(function_name)
if function is not None:
Expand Down Expand Up @@ -1076,6 +1115,24 @@ def agg_threshold(vals: Series) -> bool:
return vals.count() < THRESHOLD


def agg_values_are_same(vals: Series) -> bool:
    """Aggregation function that returns whether all observations have
    the same value.

    Parameters
    ----------
    vals : Series
        Series to check for a single unique value.

    Returns
    -------
    bool
        True if every non-NA observation shares one value; False otherwise
        (including when the series contains no non-NA values, since
        ``nunique`` then returns 0).
    """
    # exactly one distinct non-NA value means all observations are the same
    return vals.nunique(dropna=True) == 1


def apply_suppression(
table: DataFrame, masks: dict[str, DataFrame]
) -> tuple[DataFrame, DataFrame]:
Expand Down Expand Up @@ -1147,6 +1204,7 @@ def get_table_sdc(masks: dict[str, DataFrame], suppress: bool) -> dict:
sdc["summary"]["threshold"] = 0
sdc["summary"]["p-ratio"] = 0
sdc["summary"]["nk-rule"] = 0
sdc["summary"]["all-values-are-same"] = 0
for name, mask in masks.items():
sdc["summary"][name] = int(np.nansum(mask.to_numpy()))
# positions of cells to be suppressed
Expand All @@ -1155,6 +1213,7 @@ def get_table_sdc(masks: dict[str, DataFrame], suppress: bool) -> dict:
sdc["cells"]["threshold"] = []
sdc["cells"]["p-ratio"] = []
sdc["cells"]["nk-rule"] = []
sdc["cells"]["all-values-are-same"] = []
for name, mask in masks.items():
true_positions = np.column_stack(np.where(mask.values))
for pos in true_positions:
Expand Down Expand Up @@ -1198,6 +1257,12 @@ def get_summary(sdc: dict) -> tuple[str, str]:
if sdc_summary["nk-rule"] > 0:
summary += f"nk-rule: {sdc_summary['nk-rule']} cells {sup}; "
status = "fail"
if sdc_summary["all-values-are-same"] > 0:
summary += (
f"all-values-are-same: {sdc_summary['all-values-are-same']} "
f"cells {sup}; "
)
status = "fail"
if summary != "":
summary = f"{status}; {summary}"
else:
Expand Down
Binary file added data/nursery_dataset.dta
Binary file not shown.
Loading

0 comments on commit 4c137fc

Please sign in to comment.