Merge remote-tracking branch 'refs/remotes/origin/update_test.ipynb' …

…into update_test.ipynb changed name instead
AI-SDC · Oct 11, 2023 · bb26815 · bb26815
2 parents 56ead1c + 65e41e2
commit bb26815
Show file tree

Hide file tree

Showing 7 changed files with 516 additions and 266 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
 
   # Standard hooks
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
     hooks:
       - id: check-merge-conflict
       - id: end-of-file-fixer
@@ -26,7 +26,7 @@ repos:
 
   # Check for spelling
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.5
+    rev: v2.2.6
     hooks:
       - id: codespell
         args: ["-L", "tre"]
@@ -39,7 +39,7 @@ repos:
 
   # Upgrade old Python syntax
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.13.0
+    rev: v3.15.0
     hooks:
       - id: pyupgrade
         args: [--py310-plus]

diff --git a/acro/acro_tables.py b/acro/acro_tables.py
@@ -70,6 +70,9 @@ def crosstab(  # pylint: disable=too-many-arguments,too-many-locals
         default, computes a frequency table of the factors unless an array of
         values and an aggregation function are passed.
 
+        To provide consistent behaviour with different aggregation functions,
+        'empty' rows or columns, i.e. those that are all NaN or 0 (count, sum), are removed.
+
         Parameters
         ----------
         index : array-like, Series, or list of arrays/Series
@@ -133,6 +136,28 @@ def crosstab(  # pylint: disable=too-many-arguments,too-many-locals
             dropna,
             normalize,
         )
+        # delete empty rows and columns from table
+        deleted_rows = []
+        deleted_cols = []
+        # define empty columns and rows using boolean masks
+        empty_cols_mask = table.sum(axis=0) == 0
+        empty_rows_mask = table.sum(axis=1) == 0
+
+        deleted_cols = list(table.columns[empty_cols_mask])
+        table = table.loc[:, ~empty_cols_mask]
+        deleted_rows = list(table.index[empty_rows_mask])
+        table = table.loc[~empty_rows_mask, :]
+
+        # create a message listing the names of the deleted columns and rows
+        comments = []
+        if deleted_cols:
+            msg_cols = ", ".join(str(col) for col in deleted_cols)
+            comments.append(f"Empty columns: {msg_cols} were deleted.")
+        if deleted_rows:
+            msg_rows = ", ".join(str(row) for row in deleted_rows)
+            comments.append(f"Empty rows: {msg_rows} were deleted.")
+        if comments:
+            logger.info(" ".join(comments))
 
         masks = create_crosstab_masks(
             index,
@@ -195,6 +220,7 @@ def crosstab(  # pylint: disable=too-many-arguments,too-many-locals
             summary=summary,
             outcome=outcome,
             output=[table],
+            comments=comments,
         )
         return table
 
@@ -548,10 +574,14 @@ def create_crosstab_masks(  # pylint: disable=too-many-arguments,too-many-locals
             normalize=normalize,
         )
 
+        # drop empty columns and rows
         if dropna or margins:
-            for col in t_values.columns:
-                if t_values[col].sum() == 0:
-                    t_values = t_values.drop(col, axis=1)
+            empty_cols_mask = t_values.sum(axis=0) == 0
+            empty_rows_mask = t_values.sum(axis=1) == 0
+
+            t_values = t_values.loc[:, ~empty_cols_mask]
+            t_values = t_values.loc[~empty_rows_mask, :]
+
         t_values = t_values < THRESHOLD
         masks["threshold"] = t_values
         # check for negative values -- currently unsupported

diff --git a/docs/ACRO_For_Researchers.md b/docs/ACRO_For_Researchers.md
@@ -89,7 +89,7 @@ The finalise function will:
 
 ## Frequently Asked Questions
 ### What if I want to run my code many times before I decide exactly what to send for approval?
-ACRO naturally suppors this way of working. It will not produce the output folder until you are satisfied and add acro.finalise() to the end of your script.
+ACRO naturally supports this way of working. It will not produce the output folder until you are satisfied and add acro.finalise() to the end of your script.
 ### Why is my data exported as unformatted .csv files?
 The outputs are saved in row format (as csv files) for the output checkers to check and make decisions. Although you can change the format if you like, the csv files should be there for checking.
 ### Why is ACRO Python-based ‘under-the-hood’?

diff --git a/notebooks/acro_demo.py b/notebooks/acro_demo.py
@@ -5,6 +5,7 @@
 
 # import libraries
 import os
+
 import pandas as pd
 from scipy.io.arff import loadarff
 
@@ -19,12 +20,12 @@
 acro = ACRO(suppress=False)
 
 # Load test data
-# The dataset used in this notebook is the nursery dataset from OpenML.  
-# - In this version, the data can be read directly from the local machine after it has been downloaded. 
+# The dataset used in this notebook is the nursery dataset from OpenML.
+# - In this version, the data can be read directly from the local machine after it has been downloaded.
 # - The code below reads the data from a folder called "data" which we assume is at the same level as the folder where you are working.
 # - The path might need to be changed if the data has been downloaded and stored elsewhere.
-#  - for example use:  
-#     path = os.path.join("data", "nursery.arff")  
+#  - for example use:
+#     path = os.path.join("data", "nursery.arff")
 #     if the data is in a sub-folder of your work folder
 
 path = os.path.join("../data", "nursery.arff")
@@ -36,12 +37,12 @@
 df.head()
 
 # Examples of producing tabular output
-# We rely on the industry-standard package **pandas** for tabulating data.  
+# We rely on the industry-standard package **pandas** for tabulating data.
 # In the next few examples we show:
 # - first, how a researcher would normally make a call in pandas, saving the results in a variable that they can view on screen (or save to file?)
 # - then how the call is identical in SACRO, except that:
 #   - "pd" is replaced by "acro"
-#   - the researcher immediately sees a copy of what the TRE output checker will see.  
+#   - the researcher immediately sees a copy of what the TRE output checker will see.
 
 print(
     "\nThese examples show acro wrappers around "
@@ -50,28 +51,26 @@
 
 
 # Pandas crosstab
-# This is an example of crosstab using pandas.  
+# This is an example of crosstab using pandas.
 # We first make the call, then the second line prints the output to the screen.
 
 print("\nCalling crosstab of recommendation by parents using pandas")
 table = pd.crosstab(df.recommend, df.parents)
 print(table)
 
 # ACRO crosstab
-# - This is an example of crosstab using ACRO.  
+# - This is an example of crosstab using ACRO.
 # - The INFO lines show the researcher what will be reported to the output checkers.
 # - Then the (suppressed as necessary) table is shown via the print command as before.
 
 print("\nNow the same crosstab call using the ACRO interface")
-safe_table = acro.crosstab(
-    df.recommend, df.parents
-)
+safe_table = acro.crosstab(df.recommend, df.parents)
 print("\nand this is the researchers output")
 print(safe_table)
 
 # ACRO crosstab with suppression
 # - This is an example of crosstab with suppressing the cells that violate the disclosure tests.
-# - Note that you need to change the value of the suppress variable in the acro object to True. Then run the crosstab command.  
+# - Note that you need to change the value of the suppress variable in the acro object to True. Then run the crosstab command.
 # - If you wish to continue the research while suppressing the outputs, leave the suppress variable as it is, otherwise turn it off.
 
 print("\nTurn on the suppression variable")
@@ -84,7 +83,7 @@
 acro.suppress = False
 
 # ACRO functionality to let users manage their outputs
-# 
+#
 # 1: List current ACRO outputs
 # This is an example of using the print_outputs function to list all the outputs created so far
 
@@ -95,10 +94,10 @@
 )
 acro.print_outputs()
 
-# 2: Remove some ACRO outputs before finalising 
-# This is an example of deleting some of the ACRO outputs.  
-# The name of the output that needs to be removed should be passed to the function remove_output.  
-# - The output name can be taken from the outputs listed by the print_outputs function, 
+# 2: Remove some ACRO outputs before finalising
+# This is an example of deleting some of the ACRO outputs.
+# The name of the output that needs to be removed should be passed to the function remove_output.
+# - The output name can be taken from the outputs listed by the print_outputs function,
 # - or by listing the results and choosing the specific output that needs to be removed
 
 print("\nNow removing the first output")
@@ -111,22 +110,20 @@
 acro.rename_output("output_1", "cross_tabulation")
 
 # 4: Add a comment to output
-# This is an example to add a comment to outputs.  
+# This is an example to add a comment to outputs.
 # It can be used to provide a description or to pass additional information to the output checkers.
 
 print("\nUsers can add comments which the output checkers will see.")
 acro.add_comments("cross_tabulation", "Please let me have this data.")
 
 # 5: (the big one) Finalise ACRO
-# This is an example of the function _finalise()_ which the users must call at the end of each session.  
-# - It takes each output and saves it to a CSV file.    
-# - It also saves the SDC analysis for each output to a json file or Excel file  
+# This is an example of the function _finalise()_ which the users must call at the end of each session.
+# - It takes each output and saves it to a CSV file.
+# - It also saves the SDC analysis for each output to a json file or Excel file
 #   (depending on the extension of the name of the file provided as an input to the function)
 
 print(
     "\nUsers MUST call finalise to send their outputs to the checkers"
     " If they don't, the SDC analysis, and their outputs, are lost."
 )
 output = acro.finalise("Examples", "json")
-
-