From 7428da8f3718ca07e46b7c4f20bbedadae5ef2d0 Mon Sep 17 00:00:00 2001 From: mahaalbashir Date: Thu, 5 Oct 2023 18:07:43 +0100 Subject: [PATCH 01/11] updating crosstab --- acro/acro_tables.py | 17 + notebooks/test.ipynb | 955 +++++++++++++++++++++++++++++++------------ test/test_initial.py | 38 +- 3 files changed, 728 insertions(+), 282 deletions(-) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index cd50361..4464ab4 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -133,6 +133,22 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals dropna, normalize, ) + # delete empty columns from table + deleted_cols = [] + for col in table.columns: + if table[col].sum() == 0: + table = table.drop(col, axis=1) + deleted_cols.append(col) + # create a message with the deleted column's names + if len(deleted_cols) > 0: + deleted_cols = [ + f"{col}" for col in deleted_cols + ] # to handle column's names of type tuple + msg = ", ".join(deleted_cols) + comments = [f"Empty columns: {msg} were deleted."] + logger.info(comments) + else: # pragma: no cover + comments = None masks = create_crosstab_masks( index, @@ -195,6 +211,7 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals summary=summary, outcome=outcome, output=[table], + comments=comments, ) return table diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb index feeda2b..29ecbef 100644 --- a/notebooks/test.ipynb +++ b/notebooks/test.ipynb @@ -713,10 +713,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "ef42beb6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:[\"Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted.\"]\n", + "INFO:acro:get_summary(): fail; threshold: 14 cells may need suppressing; p-ratio: 8 cells may need suppressing; nk-rule: 7 cells may need suppressing; \n", + "INFO:acro:outcome_df:\n", + "------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "grant_type |G |N |R |R/G |All|\n", + "survivor |Dead in 2015 Alive in 2015 |Alive in 2015 |Dead in 2015 Alive in 2015 |Alive in 2015 | |\n", + "year | | | | | |\n", + "------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "2010 | threshold; p-ratio; nk-rule; ok | threshold; p-ratio; | ok ok | threshold; p-ratio; nk-rule; | ok|\n", + "2011 | threshold; p-ratio; nk-rule; ok | ok | ok ok | threshold; | ok|\n", + "2012 | threshold; p-ratio; nk-rule; ok | ok | ok ok | threshold; | ok|\n", + "2013 | threshold; p-ratio; nk-rule; ok | ok | ok ok | threshold; | ok|\n", + "2014 | threshold; p-ratio; nk-rule; ok | ok | ok ok | threshold; | ok|\n", + "2015 | threshold; p-ratio; nk-rule; threshold; | ok | ok ok | threshold; | ok|\n", + "All | ok ok | ok | ok ok | ok | ok|\n", + "------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "\n", + "INFO:acro:records:add(): output_2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "grant_type G N R \\\n", + "survivor Dead in 2015 Alive in 2015 Alive in 2015 Dead in 2015 \n", + "year \n", + "2010 2 12 5 40 \n", + "2011 3 12 58 45 \n", + "2012 3 12 59 45 \n", + "2013 3 12 59 47 \n", + "2014 3 12 59 43 \n", + "2015 3 9 58 28 \n", + "All 17 69 298 248 \n", + "\n", + "grant_type R/G All \n", + "survivor Alive in 2015 Alive in 2015 \n", + "year \n", + "2010 20 4 83 \n", + "2011 24 8 150 \n", + "2012 24 8 151 \n", + "2013 24 8 153 \n", + "2014 24 8 149 \n", + "2015 23 8 129 \n", + "All 139 44 815 \n" + ] + } + ], "source": [ "acro.suppress = False\n", "table = acro.crosstab(\n", @@ -731,7 +783,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "506135e0", "metadata": {}, "outputs": [], @@ -749,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "4ae844a0", "metadata": {}, "outputs": [ @@ -771,7 +823,7 @@ "2015 | ok | ok | ok | threshold; |\n", "---------------------------------------------------------------------------|\n", "\n", - "INFO:acro:records:add(): output_2\n" + "INFO:acro:records:add(): output_3\n" ] }, { @@ -866,7 +918,7 @@ "2015 11133433.0 146572.187500 10812888.0 18278624.0" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -907,7 +959,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "bf132239", "metadata": {}, "outputs": [ @@ -930,7 +982,7 @@ "All | | | | | |\n", "-------------------------------------------------------|\n", "\n", - "INFO:acro:records:add(): output_3\n" + "INFO:acro:records:add(): output_4\n" ] }, { @@ -1042,7 +1094,7 @@ "All 11412787.0 136158.859375 8006360.5 16648273.0 5968295.5" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1061,7 +1113,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "7cc417a0", "metadata": {}, "outputs": [], @@ -1079,7 +1131,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "15bcdc7c", "metadata": {}, "outputs": [ @@ -1101,7 +1153,7 @@ "2015 | | negative | negative | |\n", "----------------------------------------|\n", "\n", - "INFO:acro:records:add(): output_4\n" + "INFO:acro:records:add(): output_5\n" ] }, { @@ -1196,7 +1248,7 @@ "2015 11133433.0 146572.015625 10388613.0 18278624.0" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1219,7 +1271,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "b13b5f7e", "metadata": {}, "outputs": [ @@ -1241,7 +1293,7 @@ "All |\n", "------------------------------------------------------------------|\n", "\n", - "INFO:acro:records:add(): output_5\n" + "INFO:acro:records:add(): output_6\n" ] }, { @@ -1367,7 +1419,7 @@ "All 839788672.0 4.888204e+09 " ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1386,7 +1438,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "3f016823", "metadata": {}, "outputs": [ @@ -1503,7 +1555,7 @@ "All 839788672.0 4.888204e+09 " ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1521,7 +1573,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "6d4730c4", "metadata": {}, "outputs": [ @@ -1542,7 +1594,7 @@ "R/G missing | missing |\n", "---------------------------------|\n", "\n", - "INFO:acro:records:add(): output_6\n" + "INFO:acro:records:add(): output_7\n" ] }, { @@ -1619,7 +1671,7 @@ "R/G 1.664827e+07 1.583532e+07" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1641,7 +1693,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "f3a87c20", "metadata": {}, "outputs": [ @@ -1662,7 +1714,7 @@ "R/G missing | missing |\n", "---------------------------------|\n", "\n", - "INFO:acro:records:add(): output_7\n" + "INFO:acro:records:add(): output_8\n" ] }, { @@ -1739,7 +1791,7 @@ "R/G 1.664827e+07 1.583532e+07" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1757,7 +1809,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "8b603548", "metadata": {}, "outputs": [], @@ -1775,7 +1827,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "de4266cd-b4d4-417b-ae44-5d972e8bfdde", "metadata": {}, "outputs": [ @@ -1796,7 +1848,7 @@ "R/G | |\n", "---------------------------------|\n", "\n", - "INFO:acro:records:add(): output_8\n" + "INFO:acro:records:add(): output_9\n" ] }, { @@ -1873,7 +1925,7 @@ "R/G 1.664827e+07 1.583532e+07" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1897,7 +1949,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "a521cb83", "metadata": {}, "outputs": [ @@ -1905,8 +1957,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:ols() outcome: pass; dof=807.0 >= 10\n", - "INFO:acro:records:add(): output_9\n" + "INFO:acro:ols() outcome: pass; dof=807.0 >= 10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:records:add(): output_10\n" ] }, { @@ -1924,10 +1982,10 @@ " Method: Least Squares F-statistic: 2261. \n", "\n", "\n", - " Date: Wed, 04 Oct 2023 Prob (F-statistic): 0.00 \n", + " Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.00 \n", "\n", "\n", - " Time: 16:08:40 Log-Likelihood: -14495. \n", + " Time: 18:04:09 Log-Likelihood: -14495. \n", "\n", "\n", " No. Observations: 811 AIC: 2.900e+04\n", @@ -1982,8 +2040,8 @@ "Dep. Variable: inc_activity R-squared: 0.894\n", "Model: OLS Adj. R-squared: 0.893\n", "Method: Least Squares F-statistic: 2261.\n", - "Date: Wed, 04 Oct 2023 Prob (F-statistic): 0.00\n", - "Time: 16:08:40 Log-Likelihood: -14495.\n", + "Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.00\n", + "Time: 18:04:09 Log-Likelihood: -14495.\n", "No. Observations: 811 AIC: 2.900e+04\n", "Df Residuals: 807 BIC: 2.902e+04\n", "Df Model: 3 \n", @@ -2009,7 +2067,7 @@ "\"\"\"" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2036,7 +2094,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "cc90f7c9", "metadata": {}, "outputs": [ @@ -2044,8 +2102,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:olsr() outcome: pass; dof=807.0 >= 10\n", - "INFO:acro:records:add(): output_10\n" + "INFO:acro:olsr() outcome: pass; dof=807.0 >= 10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:records:add(): output_11\n" ] }, { @@ -2063,10 +2127,10 @@ " Method: Least Squares F-statistic: 2261. \n", "\n", "\n", - " Date: Wed, 04 Oct 2023 Prob (F-statistic): 0.00 \n", + " Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.00 \n", "\n", "\n", - " Time: 16:08:40 Log-Likelihood: -14495. \n", + " Time: 18:04:09 Log-Likelihood: -14495. \n", "\n", "\n", " No. Observations: 811 AIC: 2.900e+04\n", @@ -2121,8 +2185,8 @@ "Dep. Variable: inc_activity R-squared: 0.894\n", "Model: OLS Adj. R-squared: 0.893\n", "Method: Least Squares F-statistic: 2261.\n", - "Date: Wed, 04 Oct 2023 Prob (F-statistic): 0.00\n", - "Time: 16:08:40 Log-Likelihood: -14495.\n", + "Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.00\n", + "Time: 18:04:09 Log-Likelihood: -14495.\n", "No. Observations: 811 AIC: 2.900e+04\n", "Df Residuals: 807 BIC: 2.902e+04\n", "Df Model: 3 \n", @@ -2148,7 +2212,7 @@ "\"\"\"" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2170,7 +2234,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "5b1a1611", "metadata": {}, "outputs": [ @@ -2178,8 +2242,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:probit() outcome: pass; dof=806.0 >= 10\n", - "INFO:acro:records:add(): output_11\n" + "INFO:acro:probit() outcome: pass; dof=806.0 >= 10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:records:add(): output_12\n" ] }, { @@ -2206,10 +2276,10 @@ " Method: MLE Df Model: 4 \n", "\n", "\n", - " Date: Wed, 04 Oct 2023 Pseudo R-squ.: 0.2140 \n", + " Date: Thu, 05 Oct 2023 Pseudo R-squ.: 0.2140 \n", "\n", "\n", - " Time: 16:08:40 Log-Likelihood: -400.46 \n", + " Time: 18:04:09 Log-Likelihood: -400.46 \n", "\n", "\n", " converged: True LL-Null: -509.50 \n", @@ -2247,8 +2317,8 @@ "Dep. Variable: survivor No. Observations: 811\n", "Model: Probit Df Residuals: 806\n", "Method: MLE Df Model: 4\n", - "Date: Wed, 04 Oct 2023 Pseudo R-squ.: 0.2140\n", - "Time: 16:08:40 Log-Likelihood: -400.46\n", + "Date: Thu, 05 Oct 2023 Pseudo R-squ.: 0.2140\n", + "Time: 18:04:09 Log-Likelihood: -400.46\n", "converged: True LL-Null: -509.50\n", "Covariance Type: nonrobust LLR p-value: 4.875e-46\n", "=================================================================================\n", @@ -2267,7 +2337,7 @@ "\"\"\"" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -2295,7 +2365,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "dcf30f8f", "metadata": {}, "outputs": [ @@ -2303,8 +2373,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:logit() outcome: pass; dof=806.0 >= 10\n", - "INFO:acro:records:add(): output_12\n" + "INFO:acro:logit() outcome: pass; dof=806.0 >= 10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:records:add(): output_13\n" ] }, { @@ -2331,10 +2407,10 @@ " Method: MLE Df Model: 4 \n", "\n", "\n", - " Date: Wed, 04 Oct 2023 Pseudo R-squ.: 0.2187 \n", + " Date: Thu, 05 Oct 2023 Pseudo R-squ.: 0.2187 \n", "\n", "\n", - " Time: 16:08:40 Log-Likelihood: -398.07 \n", + " Time: 18:04:09 Log-Likelihood: -398.07 \n", "\n", "\n", " converged: True LL-Null: -509.50 \n", @@ -2372,8 +2448,8 @@ "Dep. Variable: survivor No. Observations: 811\n", "Model: Logit Df Residuals: 806\n", "Method: MLE Df Model: 4\n", - "Date: Wed, 04 Oct 2023 Pseudo R-squ.: 0.2187\n", - "Time: 16:08:40 Log-Likelihood: -398.07\n", + "Date: Thu, 05 Oct 2023 Pseudo R-squ.: 0.2187\n", + "Time: 18:04:09 Log-Likelihood: -398.07\n", "converged: True LL-Null: -509.50\n", "Covariance Type: nonrobust LLR p-value: 4.532e-47\n", "=================================================================================\n", @@ -2392,7 +2468,7 @@ "\"\"\"" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2412,7 +2488,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "id": "ec960039", "metadata": { "scrolled": true @@ -2445,7 +2521,7 @@ "2013 15 59 71 8\n", "2014 15 59 71 8\n", "2015 15 59 71 8]\n", - "timestamp: 2023-10-04T16:08:29.606127\n", + "timestamp: 2023-10-05T18:03:53.064163\n", "comments: []\n", "exception: \n", "\n", @@ -2472,7 +2548,7 @@ "2013 13557147.0 147937.796875 7202273.5 NaN\n", "2014 13748147.0 133198.250000 8277525.5 NaN\n", "2015 11133433.0 146572.187500 10812888.0 NaN]\n", - "timestamp: 2023-10-04T16:08:29.806068\n", + "timestamp: 2023-10-05T18:03:54.913352\n", "comments: []\n", "exception: \n", "\n", @@ -2480,6 +2556,59 @@ "status: fail\n", "type: table\n", "properties: {'method': 'crosstab'}\n", + "sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 14, 'p-ratio': 8, 'nk-rule': 7}, 'cells': {'negative': [], 'missing': [], 'threshold': [[0, 0], [0, 2], [0, 5], [1, 0], [1, 5], [2, 0], [2, 5], [3, 0], [3, 5], [4, 0], [4, 5], [5, 0], [5, 1], [5, 5]], 'p-ratio': [[0, 0], [0, 2], [0, 5], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0]], 'nk-rule': [[0, 0], [0, 5], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0]]}}\n", + "command: table = acro.crosstab(\n", + "summary: fail; threshold: 14 cells may need suppressing; p-ratio: 8 cells may need suppressing; nk-rule: 7 cells may need suppressing; \n", + "outcome: grant_type G N \\\n", + "survivor Dead in 2015 Alive in 2015 Alive in 2015 \n", + "year \n", + "2010 threshold; p-ratio; nk-rule; ok threshold; p-ratio; \n", + "2011 threshold; p-ratio; nk-rule; ok ok \n", + "2012 threshold; p-ratio; nk-rule; ok ok \n", + "2013 threshold; p-ratio; nk-rule; ok ok \n", + "2014 threshold; p-ratio; nk-rule; ok ok \n", + "2015 threshold; p-ratio; nk-rule; threshold; ok \n", + "All ok ok ok \n", + "\n", + "grant_type R R/G All \n", + "survivor Dead in 2015 Alive in 2015 Alive in 2015 \n", + "year \n", + "2010 ok ok threshold; p-ratio; nk-rule; ok \n", + "2011 ok ok threshold; ok \n", + "2012 ok ok threshold; ok \n", + "2013 ok ok threshold; ok \n", + "2014 ok ok threshold; ok \n", + "2015 ok ok threshold; ok \n", + "All ok ok ok ok \n", + "output: [grant_type G N R \\\n", + "survivor Dead in 2015 Alive in 2015 Alive in 2015 Dead in 2015 \n", + "year \n", + "2010 2 12 5 40 \n", + "2011 3 12 58 45 \n", + "2012 3 12 59 45 \n", + "2013 3 12 59 47 \n", + "2014 3 12 59 43 \n", + "2015 3 9 58 28 \n", + "All 17 69 298 248 \n", + "\n", + "grant_type R/G All \n", + "survivor Alive in 2015 Alive in 2015 \n", + "year \n", + "2010 20 4 83 \n", + "2011 24 8 150 \n", + "2012 24 8 151 \n", + "2013 24 8 153 \n", + "2014 24 8 149 \n", + "2015 23 8 129 \n", + "All 139 44 815 ]\n", + "timestamp: 2023-10-05T18:03:56.973956\n", + "comments: [\"Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted.\"]\n", + "exception: \n", + "\n", + "uid: output_3\n", + "status: fail\n", + "type: table\n", + "properties: {'method': 'crosstab'}\n", "sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 7, 'p-ratio': 2, 'nk-rule': 1}, 'cells': {'negative': [], 'missing': [], 'threshold': [[0, 1], [0, 3], [1, 3], [2, 3], [3, 3], [4, 3], [5, 3]], 'p-ratio': [[0, 1], [0, 3]], 'nk-rule': [[0, 3]]}}\n", "command: safe_table = acro.crosstab(df.year, df.grant_type, values=df.inc_grants, aggfunc=\"mean\")\n", "summary: fail; threshold: 7 cells may need suppressing; p-ratio: 2 cells may need suppressing; nk-rule: 1 cells may need suppressing; \n", @@ -2499,11 +2628,11 @@ "2013 13557147.0 147937.796875 7202273.5 16765625.0\n", "2014 13748147.0 133198.250000 8277525.5 17845750.0\n", "2015 11133433.0 146572.187500 10812888.0 18278624.0]\n", - "timestamp: 2023-10-04T16:08:34.251644\n", + "timestamp: 2023-10-05T18:04:01.751627\n", "comments: []\n", "exception: \n", "\n", - "uid: output_3\n", + "uid: output_4\n", "status: review\n", "type: table\n", "properties: {'method': 'crosstab'}\n", @@ -2528,11 +2657,11 @@ "2014 13748147.0 135494.781250 8118565.0 17845750.0 6072600.0\n", "2015 11133433.0 149143.625000 10596385.0 18278624.0 6442131.0\n", "All 11412787.0 136158.859375 8006360.5 16648273.0 5968295.5]\n", - "timestamp: 2023-10-04T16:08:39.363301\n", + "timestamp: 2023-10-05T18:04:05.126101\n", "comments: []\n", "exception: \n", "\n", - "uid: output_4\n", + "uid: output_5\n", "status: review\n", "type: table\n", "properties: {'method': 'crosstab'}\n", @@ -2555,11 +2684,11 @@ "2013 13557147.0 147937.625000 6988263.5 16765625.0\n", "2014 13748147.0 133198.078125 7997392.5 17845750.0\n", "2015 11133433.0 146572.015625 10388613.0 18278624.0]\n", - "timestamp: 2023-10-04T16:08:39.557302\n", + "timestamp: 2023-10-05T18:04:08.961665\n", "comments: []\n", "exception: \n", "\n", - "uid: output_5\n", + "uid: output_6\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -2591,11 +2720,11 @@ "R 551457280.0 3.134120e+09 \n", "R/G 146228992.0 7.325240e+08 \n", "All 839788672.0 4.888204e+09 ]\n", - "timestamp: 2023-10-04T16:08:39.760564\n", + "timestamp: 2023-10-05T18:04:09.105670\n", "comments: []\n", "exception: \n", "\n", - "uid: output_6\n", + "uid: output_7\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -2616,11 +2745,11 @@ "N 1.344319e+05 1.988737e+05\n", "R 8.098502e+06 3.204495e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-04T16:08:39.951135\n", + "timestamp: 2023-10-05T18:04:09.203761\n", "comments: []\n", "exception: \n", "\n", - "uid: output_7\n", + "uid: output_8\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -2641,11 +2770,11 @@ "N 1.364700e+05 1.999335e+05\n", "R 8.006360e+06 3.228216e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-04T16:08:40.063744\n", + "timestamp: 2023-10-05T18:04:09.264100\n", "comments: []\n", "exception: \n", "\n", - "uid: output_8\n", + "uid: output_9\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -2666,11 +2795,11 @@ "N 1.341800e+05 1.990196e+05\n", "R 7.882231e+06 3.204558e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-04T16:08:40.206681\n", + "timestamp: 2023-10-05T18:04:09.342995\n", "comments: []\n", "exception: \n", "\n", - "uid: output_9\n", + "uid: output_10\n", "status: pass\n", "type: regression\n", "properties: {'method': 'ols', 'dof': 807.0}\n", @@ -2684,8 +2813,8 @@ "Dep. Variable: \n", "Model: OLS Adj. R-squared: 0.893\n", "Method: Least Squares F-statistic: 2261.000\n", - "Date: Wed, 04 Oct 2023 Prob (F-statistic): 0.000\n", - "Time: 16:08:40 Log-Likelihood: -14495.000\n", + "Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.000\n", + "Time: 18:04:09 Log-Likelihood: -14495.000\n", "No. Observations: 811 AIC: 29000.000\n", "Df Residuals: 807 BIC: 29020.000\n", "Df Model: 3 NaN NaN\n", @@ -2698,11 +2827,11 @@ "Prob(Omnibus): 0.000 Jarque-Bera (JB): 1.253318e+06\n", "Skew: 9.899 Prob(JB): 0.000000e+00\n", "Kurtosis: 194.566 Cond. No. 1.050000e+08]\n", - "timestamp: 2023-10-04T16:08:40.319370\n", + "timestamp: 2023-10-05T18:04:09.406745\n", "comments: []\n", "exception: \n", "\n", - "uid: output_10\n", + "uid: output_11\n", "status: pass\n", "type: regression\n", "properties: {'method': 'olsr', 'dof': 807.0}\n", @@ -2716,8 +2845,8 @@ "Dep. Variable: \n", "Model: OLS Adj. R-squared: 0.893\n", "Method: Least Squares F-statistic: 2261.000\n", - "Date: Wed, 04 Oct 2023 Prob (F-statistic): 0.000\n", - "Time: 16:08:40 Log-Likelihood: -14495.000\n", + "Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.000\n", + "Time: 18:04:09 Log-Likelihood: -14495.000\n", "No. Observations: 811 AIC: 29000.000\n", "Df Residuals: 807 BIC: 29020.000\n", "Df Model: 3 NaN NaN\n", @@ -2730,11 +2859,11 @@ "Prob(Omnibus): 0.000 Jarque-Bera (JB): 1.253318e+06\n", "Skew: 9.899 Prob(JB): 0.000000e+00\n", "Kurtosis: 194.566 Cond. No. 1.050000e+08]\n", - "timestamp: 2023-10-04T16:08:40.431387\n", + "timestamp: 2023-10-05T18:04:09.449726\n", "comments: []\n", "exception: \n", "\n", - "uid: output_11\n", + "uid: output_12\n", "status: pass\n", "type: regression\n", "properties: {'method': 'probit', 'dof': 806.0}\n", @@ -2748,8 +2877,8 @@ "Dep. Variable: \n", "Model: Probit Df Residuals: 8.060000e+02\n", "Method: MLE Df Model: 4.000000e+00\n", - "Date: Wed, 04 Oct 2023 Pseudo R-squ.: 2.140000e-01\n", - "Time: 16:08:40 Log-Likelihood: -4.004600e+02\n", + "Date: Thu, 05 Oct 2023 Pseudo R-squ.: 2.140000e-01\n", + "Time: 18:04:09 Log-Likelihood: -4.004600e+02\n", "converged: True LL-Null: -5.095000e+02\n", "Covariance Type: nonrobust LLR p-value: 4.875000e-46, coef std err z P>|z| [0.025 \\\n", "const 4.740000e-02 5.700000e-02 0.838 0.402 -6.300000e-02 \n", @@ -2764,11 +2893,11 @@ "inc_grants 1.620000e-07 \n", "inc_donations 3.300000e-07 \n", "total_costs -1.440000e-08 ]\n", - "timestamp: 2023-10-04T16:08:40.540353\n", + "timestamp: 2023-10-05T18:04:09.499724\n", "comments: []\n", "exception: \n", "\n", - "uid: output_12\n", + "uid: output_13\n", "status: pass\n", "type: regression\n", "properties: {'method': 'logit', 'dof': 806.0}\n", @@ -2782,8 +2911,8 @@ "Dep. Variable: \n", "Model: Logit Df Residuals: 8.060000e+02\n", "Method: MLE Df Model: 4.000000e+00\n", - "Date: Wed, 04 Oct 2023 Pseudo R-squ.: 2.187000e-01\n", - "Time: 16:08:40 Log-Likelihood: -3.980700e+02\n", + "Date: Thu, 05 Oct 2023 Pseudo R-squ.: 2.187000e-01\n", + "Time: 18:04:09 Log-Likelihood: -3.980700e+02\n", "converged: True LL-Null: -5.095000e+02\n", "Covariance Type: nonrobust LLR p-value: 4.532000e-47, coef std err z P>|z| [0.025 \\\n", "const 5.120000e-02 9.100000e-02 0.561 0.575 -1.280000e-01 \n", @@ -2798,7 +2927,7 @@ "inc_grants 2.660000e-07 \n", "inc_donations 7.160000e-07 \n", "total_costs -2.150000e-08 ]\n", - "timestamp: 2023-10-04T16:08:40.616654\n", + "timestamp: 2023-10-05T18:04:09.537725\n", "comments: []\n", "exception: \n", "\n", @@ -2820,7 +2949,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "id": "b1f77749", "metadata": {}, "outputs": [ @@ -2848,7 +2977,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "id": "45ec04ef", "metadata": {}, "outputs": [ @@ -2874,7 +3003,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "id": "0c826271", "metadata": {}, "outputs": [ @@ -2902,7 +3031,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "id": "2816eac7", "metadata": {}, "outputs": [ @@ -2910,7 +3039,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:records:add_custom(): output_13\n" + "INFO:acro:records:add_custom(): output_14\n" ] } ], @@ -2930,7 +3059,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "id": "f38b4334", "metadata": {}, "outputs": [ @@ -2962,7 +3091,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "id": "9e554eea", "metadata": {}, "outputs": [ @@ -2989,10 +3118,39 @@ " inc_grants inc_grants\n", "grant_type \n", "G 1.141279e+07 2.283220e+07\n", + "N 1.344319e+05 1.988737e+05\n", + "R 8.098502e+06 3.204495e+07\n", + "R/G 1.664827e+07 1.583532e+07]\n", + "timestamp: 2023-10-05T18:04:09.203761\n", + "comments: []\n", + "exception: \n", + "\n", + "The status of the record above is: review.\n", + "Please explain why an exception should be granted.\n", + "\n", + "INFO:acro:records:\n", + "uid: output_8\n", + "status: review\n", + "type: table\n", + "properties: {'method': 'pivot_table'}\n", + "sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 8, 'threshold': 0, 'p-ratio': 0, 'nk-rule': 0}, 'cells': {'negative': [], 'missing': [[0, 0], [0, 1], [1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1]], 'threshold': [], 'p-ratio': [], 'nk-rule': []}}\n", + "command: table = acro.pivot_table(\n", + "summary: review; missing values found\n", + "outcome: mean std\n", + " inc_grants inc_grants\n", + "grant_type \n", + "G missing missing\n", + "N missing missing\n", + "R missing missing\n", + "R/G missing missing\n", + "output: [ mean std\n", + " inc_grants inc_grants\n", + "grant_type \n", + "G 1.141279e+07 2.283220e+07\n", "N 1.364700e+05 1.999335e+05\n", "R 8.006360e+06 3.228216e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-04T16:08:40.063744\n", + "timestamp: 2023-10-05T18:04:09.264100\n", "comments: []\n", "exception: \n", "\n", @@ -3000,7 +3158,7 @@ "Please explain why an exception should be granted.\n", "\n", "INFO:acro:records:\n", - "uid: output_8\n", + "uid: output_9\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -3021,7 +3179,7 @@ "N 1.341800e+05 1.990196e+05\n", "R 7.882231e+06 3.204558e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-04T16:08:40.206681\n", + "timestamp: 2023-10-05T18:04:09.342995\n", "comments: []\n", "exception: \n", "\n", @@ -3033,34 +3191,60 @@ "status: fail\n", "type: table\n", "properties: {'method': 'crosstab'}\n", - "sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 7, 'p-ratio': 2, 'nk-rule': 1}, 'cells': {'negative': [], 'missing': [], 'threshold': [[0, 1], [0, 3], [1, 3], [2, 3], [3, 3], [4, 3], [5, 3]], 'p-ratio': [[0, 1], [0, 3]], 'nk-rule': [[0, 3]]}}\n", - "command: safe_table = acro.crosstab(df.year, df.grant_type, values=df.inc_grants, aggfunc=\"mean\")\n", - "summary: fail; threshold: 7 cells may need suppressing; p-ratio: 2 cells may need suppressing; nk-rule: 1 cells may need suppressing; \n", - "outcome: grant_type G N R R/G\n", - "year \n", - "2010 ok threshold; p-ratio; ok threshold; p-ratio; nk-rule; \n", - "2011 ok ok ok threshold; \n", - "2012 ok ok ok threshold; \n", - "2013 ok ok ok threshold; \n", - "2014 ok ok ok threshold; \n", - "2015 ok ok ok threshold; \n", - "output: [grant_type G N R R/G\n", - "year \n", - "2010 9921906.0 0.000000 8402284.0 11636000.0\n", - "2011 8502247.0 124013.859375 7716880.0 16047500.0\n", - "2012 11458580.0 131859.062500 6958050.5 16810000.0\n", - "2013 13557147.0 147937.796875 7202273.5 16765625.0\n", - "2014 13748147.0 133198.250000 8277525.5 17845750.0\n", - "2015 11133433.0 146572.187500 10812888.0 18278624.0]\n", - "timestamp: 2023-10-04T16:08:34.251644\n", - "comments: []\n", + "sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 14, 'p-ratio': 8, 'nk-rule': 7}, 'cells': {'negative': [], 'missing': [], 'threshold': [[0, 0], [0, 2], [0, 5], [1, 0], [1, 5], [2, 0], [2, 5], [3, 0], [3, 5], [4, 0], [4, 5], [5, 0], [5, 1], [5, 5]], 'p-ratio': [[0, 0], [0, 2], [0, 5], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0]], 'nk-rule': [[0, 0], [0, 5], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0]]}}\n", + "command: table = acro.crosstab(\n", + "summary: fail; threshold: 14 cells may need suppressing; p-ratio: 8 cells may need suppressing; nk-rule: 7 cells may need suppressing; \n", + "outcome: grant_type G N \\\n", + "survivor Dead in 2015 Alive in 2015 Alive in 2015 \n", + "year \n", + "2010 threshold; p-ratio; nk-rule; ok threshold; p-ratio; \n", + "2011 threshold; p-ratio; nk-rule; ok ok \n", + "2012 threshold; p-ratio; nk-rule; ok ok \n", + "2013 threshold; p-ratio; nk-rule; ok ok \n", + "2014 threshold; p-ratio; nk-rule; ok ok \n", + "2015 threshold; p-ratio; nk-rule; threshold; ok \n", + "All ok ok ok \n", + "\n", + "grant_type R R/G All \n", + "survivor Dead in 2015 Alive in 2015 Alive in 2015 \n", + "year \n", + "2010 ok ok threshold; p-ratio; nk-rule; ok \n", + "2011 ok ok threshold; ok \n", + "2012 ok ok threshold; ok \n", + "2013 ok ok threshold; ok \n", + "2014 ok ok threshold; ok \n", + "2015 ok ok threshold; ok \n", + "All ok ok ok ok \n", + "output: [grant_type G N R \\\n", + "survivor Dead in 2015 Alive in 2015 Alive in 2015 Dead in 2015 \n", + "year \n", + "2010 2 12 5 40 \n", + "2011 3 12 58 45 \n", + "2012 3 12 59 45 \n", + "2013 3 12 59 47 \n", + "2014 3 12 59 43 \n", + "2015 3 9 58 28 \n", + "All 17 69 298 248 \n", + "\n", + "grant_type R/G All \n", + "survivor Alive in 2015 Alive in 2015 \n", + "year \n", + "2010 20 4 83 \n", + "2011 24 8 150 \n", + "2012 24 8 151 \n", + "2013 24 8 153 \n", + "2014 24 8 149 \n", + "2015 23 8 129 \n", + "All 139 44 815 ]\n", + "timestamp: 2023-10-05T18:03:56.973956\n", + "comments: [\"Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted.\"]\n", "exception: \n", "\n", "The status of the record above is: fail.\n", "Please explain why an exception should be granted.\n", "\n", "INFO:acro:records:\n", - "uid: output_13\n", + "uid: output_14\n", "status: review\n", "type: custom\n", "properties: {}\n", @@ -3071,7 +3255,7 @@ "Columns: []\n", "Index: []\n", "output: ['XandY.jpeg']\n", - "timestamp: 2023-10-04T16:08:40.800408\n", + "timestamp: 2023-10-05T18:04:09.660560\n", "comments: ['This output is an image showing the relationship between X and Y']\n", "exception: \n", "\n", @@ -3099,7 +3283,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "id": "f78b5a08", "metadata": {}, "outputs": [ @@ -3115,16 +3299,17 @@ "output_10_2.csv\n", "output_11_0.csv\n", "output_11_1.csv\n", + "output_11_2.csv\n", "output_12_0.csv\n", "output_12_1.csv\n", + "output_13_0.csv\n", + "output_13_1.csv\n", "output_3_0.csv\n", "output_5_0.csv\n", "output_6_0.csv\n", "output_7_0.csv\n", "output_8_0.csv\n", "output_9_0.csv\n", - "output_9_1.csv\n", - "output_9_2.csv\n", "pivot_table_0.csv\n", "results.json\n" ] @@ -3150,7 +3335,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "id": "df2a02e0", "metadata": {}, "outputs": [ @@ -3165,16 +3350,17 @@ "output_10_2.csv.txt\n", "output_11_0.csv.txt\n", "output_11_1.csv.txt\n", + "output_11_2.csv.txt\n", "output_12_0.csv.txt\n", "output_12_1.csv.txt\n", + "output_13_0.csv.txt\n", + "output_13_1.csv.txt\n", "output_3_0.csv.txt\n", "output_5_0.csv.txt\n", "output_6_0.csv.txt\n", "output_7_0.csv.txt\n", "output_8_0.csv.txt\n", "output_9_0.csv.txt\n", - "output_9_1.csv.txt\n", - "output_9_2.csv.txt\n", "pivot_table_0.csv.txt\n", "results.json.txt\n" ] @@ -3201,7 +3387,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "id": "56d2b6a1", "metadata": {}, "outputs": [ @@ -3302,7 +3488,7 @@ " },\n", " \"command\": \"safe_table = acro.crosstab(df.year, df.grant_type)\",\n", " \"summary\": \"fail; threshold: 6 cells may need suppressing; \",\n", - " \"timestamp\": \"2023-10-04T16:08:29.606127\",\n", + " \"timestamp\": \"2023-10-05T18:03:53.064163\",\n", " \"comments\": [\n", " \"This is a cross table between year and grant_type\",\n", " \"6 cells were suppressed in this table\"\n", @@ -3311,7 +3497,7 @@ " },\n", " \"output_3\": {\n", " \"uid\": \"output_3\",\n", - " \"status\": \"review\",\n", + " \"status\": \"fail\",\n", " \"type\": \"table\",\n", " \"properties\": {\n", " \"method\": \"crosstab\"\n", @@ -3323,64 +3509,205 @@ " \"summary\": {\n", " \"suppressed\": false,\n", " \"negative\": 0,\n", - " \"missing\": 14,\n", + " \"missing\": 0,\n", " \"threshold\": 7,\n", " \"p-ratio\": 2,\n", " \"nk-rule\": 1\n", " },\n", " \"cells\": {\n", " \"negative\": [],\n", - " \"missing\": [\n", - " [\n", - " 0,\n", - " 0\n", - " ],\n", + " \"missing\": [],\n", + " \"threshold\": [\n", " [\n", " 0,\n", " 1\n", " ],\n", " [\n", " 0,\n", - " 2\n", - " ],\n", - " [\n", - " 0,\n", " 3\n", " ],\n", " [\n", " 1,\n", - " 1\n", - " ],\n", - " [\n", - " 1,\n", - " 2\n", + " 3\n", " ],\n", " [\n", " 2,\n", - " 2\n", - " ],\n", - " [\n", - " 3,\n", - " 1\n", + " 3\n", " ],\n", " [\n", " 3,\n", - " 2\n", - " ],\n", - " [\n", - " 4,\n", - " 1\n", + " 3\n", " ],\n", " [\n", " 4,\n", - " 2\n", + " 3\n", " ],\n", " [\n", " 5,\n", - " 0\n", - " ],\n", + " 3\n", + " ]\n", + " ],\n", + " \"p-ratio\": [\n", " [\n", - " 5,\n", + " 0,\n", + " 1\n", + " ],\n", + " [\n", + " 0,\n", + " 3\n", + " ]\n", + " ],\n", + " \"nk-rule\": [\n", + " [\n", + " 0,\n", + " 3\n", + " ]\n", + " ]\n", + " }\n", + " }\n", + " }\n", + " ],\n", + " \"outcome\": {\n", + " \"G\": {\n", + " \"2010\": \"ok\",\n", + " \"2011\": \"ok\",\n", + " \"2012\": \"ok\",\n", + " \"2013\": \"ok\",\n", + " \"2014\": \"ok\",\n", + " \"2015\": \"ok\"\n", + " },\n", + " \"N\": {\n", + " \"2010\": \"threshold; p-ratio; \",\n", + " \"2011\": \"ok\",\n", + " \"2012\": \"ok\",\n", + " \"2013\": \"ok\",\n", + " \"2014\": \"ok\",\n", + " \"2015\": \"ok\"\n", + " },\n", + " \"R\": {\n", + " \"2010\": \"ok\",\n", + " \"2011\": \"ok\",\n", + " \"2012\": \"ok\",\n", + " \"2013\": \"ok\",\n", + " \"2014\": \"ok\",\n", + " \"2015\": \"ok\"\n", + " },\n", + " \"R/G\": {\n", + " \"2010\": \"threshold; p-ratio; nk-rule; \",\n", + " \"2011\": \"threshold; \",\n", + " \"2012\": \"threshold; \",\n", + " \"2013\": \"threshold; \",\n", + " \"2014\": \"threshold; \",\n", + " \"2015\": \"threshold; \"\n", + " }\n", + " },\n", + " \"command\": \"safe_table = acro.crosstab(df.year, df.grant_type, values=df.inc_grants, aggfunc=\\\"mean\\\")\",\n", + " \"summary\": \"fail; threshold: 7 cells may need suppressing; p-ratio: 2 cells may need suppressing; nk-rule: 1 cells may need suppressing; \",\n", + " \"timestamp\": \"2023-10-05T18:04:01.751627\",\n", + " \"comments\": [],\n", + " \"exception\": \"This one is safe. Trust me, I'm a professor.\"\n", + " },\n", + " \"output_5\": {\n", + " \"uid\": \"output_5\",\n", + " \"status\": \"review\",\n", + " \"type\": \"table\",\n", + " \"properties\": {\n", + " \"method\": \"crosstab\"\n", + " },\n", + " \"files\": [\n", + " {\n", + " \"name\": \"output_5_0.csv\",\n", + " \"sdc\": {\n", + " \"summary\": {\n", + " \"suppressed\": false,\n", + " \"negative\": 10,\n", + " \"missing\": 11,\n", + " \"threshold\": 7,\n", + " \"p-ratio\": 2,\n", + " \"nk-rule\": 1\n", + " },\n", + " \"cells\": {\n", + " \"negative\": [\n", + " [\n", + " 0,\n", + " 2\n", + " ],\n", + " [\n", + " 1,\n", + " 1\n", + " ],\n", + " [\n", + " 1,\n", + " 2\n", + " ],\n", + " [\n", + " 2,\n", + " 2\n", + " ],\n", + " [\n", + " 3,\n", + " 1\n", + " ],\n", + " [\n", + " 3,\n", + " 2\n", + " ],\n", + " [\n", + " 4,\n", + " 1\n", + " ],\n", + " [\n", + " 4,\n", + " 2\n", + " ],\n", + " [\n", + " 5,\n", + " 1\n", + " ],\n", + " [\n", + " 5,\n", + " 2\n", + " ]\n", + " ],\n", + " \"missing\": [\n", + " [\n", + " 0,\n", + " 0\n", + " ],\n", + " [\n", + " 0,\n", + " 1\n", + " ],\n", + " [\n", + " 0,\n", + " 2\n", + " ],\n", + " [\n", + " 0,\n", + " 3\n", + " ],\n", + " [\n", + " 1,\n", + " 1\n", + " ],\n", + " [\n", + " 1,\n", + " 2\n", + " ],\n", + " [\n", + " 2,\n", + " 2\n", + " ],\n", + " [\n", + " 4,\n", + " 2\n", + " ],\n", + " [\n", + " 5,\n", + " 0\n", + " ],\n", + " [\n", + " 5,\n", " 1\n", " ],\n", " [\n", @@ -3440,59 +3767,46 @@ " ],\n", " \"outcome\": {\n", " \"G\": {\n", - " \"2010\": \"missing\",\n", + " \"2010\": \"\",\n", " \"2011\": \"\",\n", " \"2012\": \"\",\n", " \"2013\": \"\",\n", " \"2014\": \"\",\n", - " \"2015\": \"missing\",\n", - " \"All\": \"\"\n", + " \"2015\": \"\"\n", " },\n", " \"N\": {\n", - " \"2010\": \"missing\",\n", - " \"2011\": \"missing\",\n", + " \"2010\": \"\",\n", + " \"2011\": \"negative\",\n", " \"2012\": \"\",\n", - " \"2013\": \"missing\",\n", - " \"2014\": \"missing\",\n", - " \"2015\": \"missing\",\n", - " \"All\": \"\"\n", + " \"2013\": \"negative\",\n", + " \"2014\": \"negative\",\n", + " \"2015\": \"negative\"\n", " },\n", " \"R\": {\n", - " \"2010\": \"missing\",\n", - " \"2011\": \"missing\",\n", - " \"2012\": \"missing\",\n", - " \"2013\": \"missing\",\n", - " \"2014\": \"missing\",\n", - " \"2015\": \"missing\",\n", - " \"All\": \"\"\n", + " \"2010\": \"negative\",\n", + " \"2011\": \"negative\",\n", + " \"2012\": \"negative\",\n", + " \"2013\": \"negative\",\n", + " \"2014\": \"negative\",\n", + " \"2015\": \"negative\"\n", " },\n", " \"R/G\": {\n", - " \"2010\": \"missing\",\n", - " \"2011\": \"\",\n", - " \"2012\": \"\",\n", - " \"2013\": \"\",\n", - " \"2014\": \"\",\n", - " \"2015\": \"\",\n", - " \"All\": \"\"\n", - " },\n", - " \"All\": {\n", " \"2010\": \"\",\n", " \"2011\": \"\",\n", " \"2012\": \"\",\n", " \"2013\": \"\",\n", " \"2014\": \"\",\n", - " \"2015\": \"\",\n", - " \"All\": \"\"\n", + " \"2015\": \"\"\n", " }\n", " },\n", - " \"command\": \"safe_table = acro.crosstab(\",\n", - " \"summary\": \"review; missing values found\",\n", - " \"timestamp\": \"2023-10-04T16:08:39.363301\",\n", + " \"command\": \"safe_table = acro.crosstab(df.year, df.grant_type, values=negative, aggfunc=\\\"mean\\\")\",\n", + " \"summary\": \"review; negative values found\",\n", + " \"timestamp\": \"2023-10-05T18:04:08.961665\",\n", " \"comments\": [],\n", - " \"exception\": \"This one is safe. Trust me, I'm a professor.\"\n", + " \"exception\": \"It's not disclosive, I promise.\"\n", " },\n", - " \"output_5\": {\n", - " \"uid\": \"output_5\",\n", + " \"output_6\": {\n", + " \"uid\": \"output_6\",\n", " \"status\": \"review\",\n", " \"type\": \"table\",\n", " \"properties\": {\n", @@ -3500,7 +3814,7 @@ " },\n", " \"files\": [\n", " {\n", - " \"name\": \"output_5_0.csv\",\n", + " \"name\": \"output_6_0.csv\",\n", " \"sdc\": {\n", " \"summary\": {\n", " \"suppressed\": false,\n", @@ -3661,12 +3975,12 @@ " },\n", " \"command\": \"table = acro.pivot_table(\",\n", " \"summary\": \"review; missing values found\",\n", - " \"timestamp\": \"2023-10-04T16:08:39.760564\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.105670\",\n", " \"comments\": [],\n", - " \"exception\": \"It's not disclosive, I promise.\"\n", + " \"exception\": \"I need this one too\"\n", " },\n", - " \"output_6\": {\n", - " \"uid\": \"output_6\",\n", + " \"output_7\": {\n", + " \"uid\": \"output_7\",\n", " \"status\": \"review\",\n", " \"type\": \"table\",\n", " \"properties\": {\n", @@ -3674,7 +3988,7 @@ " },\n", " \"files\": [\n", " {\n", - " \"name\": \"output_6_0.csv\",\n", + " \"name\": \"output_7_0.csv\",\n", " \"sdc\": {\n", " \"summary\": {\n", " \"suppressed\": false,\n", @@ -3743,12 +4057,12 @@ " },\n", " \"command\": \"table = acro.pivot_table(\",\n", " \"summary\": \"review; missing values found\",\n", - " \"timestamp\": \"2023-10-04T16:08:39.951135\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.203761\",\n", " \"comments\": [],\n", - " \"exception\": \"I need this one too\"\n", + " \"exception\": \"yes\"\n", " },\n", - " \"output_7\": {\n", - " \"uid\": \"output_7\",\n", + " \"output_8\": {\n", + " \"uid\": \"output_8\",\n", " \"status\": \"review\",\n", " \"type\": \"table\",\n", " \"properties\": {\n", @@ -3756,7 +4070,7 @@ " },\n", " \"files\": [\n", " {\n", - " \"name\": \"output_7_0.csv\",\n", + " \"name\": \"output_8_0.csv\",\n", " \"sdc\": {\n", " \"summary\": {\n", " \"suppressed\": false,\n", @@ -3825,12 +4139,12 @@ " },\n", " \"command\": \"table = acro.pivot_table(\",\n", " \"summary\": \"review; missing values found\",\n", - " \"timestamp\": \"2023-10-04T16:08:40.063744\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.264100\",\n", " \"comments\": [],\n", - " \"exception\": \"y\"\n", + " \"exception\": \"yes\"\n", " },\n", - " \"output_8\": {\n", - " \"uid\": \"output_8\",\n", + " \"output_9\": {\n", + " \"uid\": \"output_9\",\n", " \"status\": \"review\",\n", " \"type\": \"table\",\n", " \"properties\": {\n", @@ -3838,7 +4152,7 @@ " },\n", " \"files\": [\n", " {\n", - " \"name\": \"output_8_0.csv\",\n", + " \"name\": \"output_9_0.csv\",\n", " \"sdc\": {\n", " \"summary\": {\n", " \"suppressed\": false,\n", @@ -3924,12 +4238,12 @@ " },\n", " \"command\": \"table = acro.pivot_table(\",\n", " \"summary\": \"review; negative values found\",\n", - " \"timestamp\": \"2023-10-04T16:08:40.206681\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.342995\",\n", " \"comments\": [],\n", - " \"exception\": \"y\"\n", + " \"exception\": \"yes\"\n", " },\n", - " \"output_9\": {\n", - " \"uid\": \"output_9\",\n", + " \"output_10\": {\n", + " \"uid\": \"output_10\",\n", " \"status\": \"pass\",\n", " \"type\": \"regression\",\n", " \"properties\": {\n", @@ -3938,27 +4252,27 @@ " },\n", " \"files\": [\n", " {\n", - " \"name\": \"output_9_0.csv\",\n", + " \"name\": \"output_10_0.csv\",\n", " \"sdc\": {}\n", " },\n", " {\n", - " \"name\": \"output_9_1.csv\",\n", + " \"name\": \"output_10_1.csv\",\n", " \"sdc\": {}\n", " },\n", " {\n", - " \"name\": \"output_9_2.csv\",\n", + " \"name\": \"output_10_2.csv\",\n", " \"sdc\": {}\n", " }\n", " ],\n", " \"outcome\": {},\n", " \"command\": \"results = acro.ols(y, x)\",\n", " \"summary\": \"pass; dof=807.0 >= 10\",\n", - " \"timestamp\": \"2023-10-04T16:08:40.319370\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.406745\",\n", " \"comments\": [],\n", " \"exception\": \"\"\n", " },\n", - " \"output_10\": {\n", - " \"uid\": \"output_10\",\n", + " \"output_11\": {\n", + " \"uid\": \"output_11\",\n", " \"status\": \"pass\",\n", " \"type\": \"regression\",\n", " \"properties\": {\n", @@ -3967,27 +4281,27 @@ " },\n", " \"files\": [\n", " {\n", - " \"name\": \"output_10_0.csv\",\n", + " \"name\": \"output_11_0.csv\",\n", " \"sdc\": {}\n", " },\n", " {\n", - " \"name\": \"output_10_1.csv\",\n", + " \"name\": \"output_11_1.csv\",\n", " \"sdc\": {}\n", " },\n", " {\n", - " \"name\": \"output_10_2.csv\",\n", + " \"name\": \"output_11_2.csv\",\n", " \"sdc\": {}\n", " }\n", " ],\n", " \"outcome\": {},\n", " \"command\": \"results = acro.olsr(\",\n", " \"summary\": \"pass; dof=807.0 >= 10\",\n", - " \"timestamp\": \"2023-10-04T16:08:40.431387\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.449726\",\n", " \"comments\": [],\n", " \"exception\": \"\"\n", " },\n", - " \"output_11\": {\n", - " \"uid\": \"output_11\",\n", + " \"output_12\": {\n", + " \"uid\": \"output_12\",\n", " \"status\": \"pass\",\n", " \"type\": \"regression\",\n", " \"properties\": {\n", @@ -3996,23 +4310,23 @@ " },\n", " \"files\": [\n", " {\n", - " \"name\": \"output_11_0.csv\",\n", + " \"name\": \"output_12_0.csv\",\n", " \"sdc\": {}\n", " },\n", " {\n", - " \"name\": \"output_11_1.csv\",\n", + " \"name\": \"output_12_1.csv\",\n", " \"sdc\": {}\n", " }\n", " ],\n", " \"outcome\": {},\n", " \"command\": \"results = acro.probit(y, x)\",\n", " \"summary\": \"pass; dof=806.0 >= 10\",\n", - " \"timestamp\": \"2023-10-04T16:08:40.540353\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.499724\",\n", " \"comments\": [],\n", " \"exception\": \"\"\n", " },\n", - " \"output_12\": {\n", - " \"uid\": \"output_12\",\n", + " \"output_13\": {\n", + " \"uid\": \"output_13\",\n", " \"status\": \"pass\",\n", " \"type\": \"regression\",\n", " \"properties\": {\n", @@ -4021,18 +4335,18 @@ " },\n", " \"files\": [\n", " {\n", - " \"name\": \"output_12_0.csv\",\n", + " \"name\": \"output_13_0.csv\",\n", " \"sdc\": {}\n", " },\n", " {\n", - " \"name\": \"output_12_1.csv\",\n", + " \"name\": \"output_13_1.csv\",\n", " \"sdc\": {}\n", " }\n", " ],\n", " \"outcome\": {},\n", " \"command\": \"results = acro.logit(y, x)\",\n", " \"summary\": \"pass; dof=806.0 >= 10\",\n", - " \"timestamp\": \"2023-10-04T16:08:40.616654\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.537725\",\n", " \"comments\": [],\n", " \"exception\": \"\"\n", " },\n", @@ -4051,9 +4365,9 @@ " \"suppressed\": false,\n", " \"negative\": 0,\n", " \"missing\": 0,\n", - " \"threshold\": 7,\n", - " \"p-ratio\": 2,\n", - " \"nk-rule\": 1\n", + " \"threshold\": 14,\n", + " \"p-ratio\": 8,\n", + " \"nk-rule\": 7\n", " },\n", " \"cells\": {\n", " \"negative\": [],\n", @@ -4061,47 +4375,123 @@ " \"threshold\": [\n", " [\n", " 0,\n", - " 1\n", + " 0\n", " ],\n", " [\n", " 0,\n", - " 3\n", + " 2\n", + " ],\n", + " [\n", + " 0,\n", + " 5\n", " ],\n", " [\n", " 1,\n", - " 3\n", + " 0\n", + " ],\n", + " [\n", + " 1,\n", + " 5\n", " ],\n", " [\n", " 2,\n", - " 3\n", + " 0\n", + " ],\n", + " [\n", + " 2,\n", + " 5\n", " ],\n", " [\n", " 3,\n", - " 3\n", + " 0\n", + " ],\n", + " [\n", + " 3,\n", + " 5\n", " ],\n", " [\n", " 4,\n", - " 3\n", + " 0\n", + " ],\n", + " [\n", + " 4,\n", + " 5\n", " ],\n", " [\n", " 5,\n", - " 3\n", + " 0\n", + " ],\n", + " [\n", + " 5,\n", + " 1\n", + " ],\n", + " [\n", + " 5,\n", + " 5\n", " ]\n", " ],\n", " \"p-ratio\": [\n", " [\n", " 0,\n", - " 1\n", + " 0\n", " ],\n", " [\n", " 0,\n", - " 3\n", + " 2\n", + " ],\n", + " [\n", + " 0,\n", + " 5\n", + " ],\n", + " [\n", + " 1,\n", + " 0\n", + " ],\n", + " [\n", + " 2,\n", + " 0\n", + " ],\n", + " [\n", + " 3,\n", + " 0\n", + " ],\n", + " [\n", + " 4,\n", + " 0\n", + " ],\n", + " [\n", + " 5,\n", + " 0\n", " ]\n", " ],\n", " \"nk-rule\": [\n", " [\n", " 0,\n", - " 3\n", + " 0\n", + " ],\n", + " [\n", + " 0,\n", + " 5\n", + " ],\n", + " [\n", + " 1,\n", + " 0\n", + " ],\n", + " [\n", + " 2,\n", + " 0\n", + " ],\n", + " [\n", + " 3,\n", + " 0\n", + " ],\n", + " [\n", + " 4,\n", + " 0\n", + " ],\n", + " [\n", + " 5,\n", + " 0\n", " ]\n", " ]\n", " }\n", @@ -4109,47 +4499,80 @@ " }\n", " ],\n", " \"outcome\": {\n", - " \"G\": {\n", + " \"('G', 'Dead in 2015')\": {\n", + " \"2010\": \"threshold; p-ratio; nk-rule; \",\n", + " \"2011\": \"threshold; p-ratio; nk-rule; \",\n", + " \"2012\": \"threshold; p-ratio; nk-rule; \",\n", + " \"2013\": \"threshold; p-ratio; nk-rule; \",\n", + " \"2014\": \"threshold; p-ratio; nk-rule; \",\n", + " \"2015\": \"threshold; p-ratio; nk-rule; \",\n", + " \"All\": \"ok\"\n", + " },\n", + " \"('G', 'Alive in 2015')\": {\n", " \"2010\": \"ok\",\n", " \"2011\": \"ok\",\n", " \"2012\": \"ok\",\n", " \"2013\": \"ok\",\n", " \"2014\": \"ok\",\n", - " \"2015\": \"ok\"\n", + " \"2015\": \"threshold; \",\n", + " \"All\": \"ok\"\n", " },\n", - " \"N\": {\n", + " \"('N', 'Alive in 2015')\": {\n", " \"2010\": \"threshold; p-ratio; \",\n", " \"2011\": \"ok\",\n", " \"2012\": \"ok\",\n", " \"2013\": \"ok\",\n", " \"2014\": \"ok\",\n", - " \"2015\": \"ok\"\n", + " \"2015\": \"ok\",\n", + " \"All\": \"ok\"\n", " },\n", - " \"R\": {\n", + " \"('R', 'Dead in 2015')\": {\n", " \"2010\": \"ok\",\n", " \"2011\": \"ok\",\n", " \"2012\": \"ok\",\n", " \"2013\": \"ok\",\n", " \"2014\": \"ok\",\n", - " \"2015\": \"ok\"\n", + " \"2015\": \"ok\",\n", + " \"All\": \"ok\"\n", " },\n", - " \"R/G\": {\n", + " \"('R', 'Alive in 2015')\": {\n", + " \"2010\": \"ok\",\n", + " \"2011\": \"ok\",\n", + " \"2012\": \"ok\",\n", + " \"2013\": \"ok\",\n", + " \"2014\": \"ok\",\n", + " \"2015\": \"ok\",\n", + " \"All\": \"ok\"\n", + " },\n", + " \"('R/G', 'Alive in 2015')\": {\n", " \"2010\": \"threshold; p-ratio; nk-rule; \",\n", " \"2011\": \"threshold; \",\n", " \"2012\": \"threshold; \",\n", " \"2013\": \"threshold; \",\n", " \"2014\": \"threshold; \",\n", - " \"2015\": \"threshold; \"\n", + " \"2015\": \"threshold; \",\n", + " \"All\": \"ok\"\n", + " },\n", + " \"('All', '')\": {\n", + " \"2010\": \"ok\",\n", + " \"2011\": \"ok\",\n", + " \"2012\": \"ok\",\n", + " \"2013\": \"ok\",\n", + " \"2014\": \"ok\",\n", + " \"2015\": \"ok\",\n", + " \"All\": \"ok\"\n", " }\n", " },\n", - " \"command\": \"safe_table = acro.crosstab(df.year, df.grant_type, values=df.inc_grants, aggfunc=\\\"mean\\\")\",\n", - " \"summary\": \"fail; threshold: 7 cells may need suppressing; p-ratio: 2 cells may need suppressing; nk-rule: 1 cells may need suppressing; \",\n", - " \"timestamp\": \"2023-10-04T16:08:34.251644\",\n", - " \"comments\": [],\n", - " \"exception\": \"y\"\n", + " \"command\": \"table = acro.crosstab(\",\n", + " \"summary\": \"fail; threshold: 14 cells may need suppressing; p-ratio: 8 cells may need suppressing; nk-rule: 7 cells may need suppressing; \",\n", + " \"timestamp\": \"2023-10-05T18:03:56.973956\",\n", + " \"comments\": [\n", + " \"Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted.\"\n", + " ],\n", + " \"exception\": \"yes\"\n", " },\n", - " \"output_13\": {\n", - " \"uid\": \"output_13\",\n", + " \"output_14\": {\n", + " \"uid\": \"output_14\",\n", " \"status\": \"review\",\n", " \"type\": \"custom\",\n", " \"properties\": {},\n", @@ -4162,11 +4585,11 @@ " \"outcome\": {},\n", " \"command\": \"custom\",\n", " \"summary\": \"review\",\n", - " \"timestamp\": \"2023-10-04T16:08:40.800408\",\n", + " \"timestamp\": \"2023-10-05T18:04:09.660560\",\n", " \"comments\": [\n", " \"This output is an image showing the relationship between X and Y\"\n", " ],\n", - " \"exception\": \"y\"\n", + " \"exception\": \"yes\"\n", " }\n", " }\n", "}\n" diff --git a/test/test_initial.py b/test/test_initial.py index 9987aa8..aad4a3f 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -42,6 +42,24 @@ def test_crosstab_without_suppression(data): assert 48 == output.output[0]["R/G"].sum() +def test_crosstab_with_aggfunc_sum(data, acro): + """Test the crosstab with two columns and aggfunc sum.""" + acro = ACRO(suppress=False) + _ = acro.crosstab( + data.year, + [data.grant_type, data.survivor], + values=data.inc_grants, + aggfunc="sum", + ) + acro.add_exception("output_0", "Let me have it") + results: Records = acro.finalise() + output_0 = results.get_index(0) + comment = ( + "Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted." + ) + assert output_0.comments == [comment] + + def test_crosstab_threshold(data, acro): """Crosstab threshold test.""" _ = acro.crosstab(data.year, data.grant_type) @@ -572,7 +590,7 @@ def test_crosstab_with_totals_with_suppression(data, acro): assert "R/G" not in output.output[0].columns -def test_crosstab_with_totals_with_suppression_herichical(data, acro): +def test_crosstab_with_totals_with_suppression_hierarchical(data, acro): """Test the crosstab with both margins and suppression are true.""" _ = acro.crosstab( [data.year, data.survivor], [data.grant_type, data.status], margins=True @@ -641,7 +659,7 @@ def test_crosstab_with_manual_totals_with_suppression(data, acro): assert "R/G" in output.output[0].columns -def test_crosstab_with_manual_totals_with_suppression_herichical(data, acro): +def test_crosstab_with_manual_totals_with_suppression_hierarchical(data, acro): """Test the crosstab with both margins and suppression are true with multilevel indexes and columns while using the total manual function. """ @@ -682,7 +700,7 @@ def test_crosstab_with_manual_totals_with_suppression_with_aggfunc_mean(data, ac assert "R/G" in output.output[0].columns -def test_herichical_crosstab_with_manual_totals_with_mean(data, acro): +def test_hierarchical_crosstab_with_manual_totals_with_mean(data, acro): """Test the crosstab with both margins and suppression are true, with aggfunc mean and with multilevel columns and rows while using the total manual function. """ @@ -748,18 +766,6 @@ def test_pivot_table_with_totals_with_suppression(data, acro): if RUN_TEST: - def test_crosstab_with_sum(data, acro): - """Test the crosstab with two columns and aggfunc sum.""" - acro = ACRO(suppress=False) - _ = acro.crosstab( - data.year, - [data.grant_type, data.survivor], - values=data.inc_grants, - aggfunc="sum", - ) - output = acro.results.get_index(0) - assert (6, 8) == output.output[0].shape - def test_crosstab_multiple_aggregate_function(data, acro): """Crosstab with multiple agg funcs.""" acro = ACRO(suppress=False) @@ -814,7 +820,7 @@ def test_crosstab_with_totals_with_suppression_with_two_aggfuncs(data, acro): output_4 = (output.output[0]).droplevel(0, axis=1) assert output_3.equals(output_4) - def test_crosstab_with_totals_with_suppression_with_two_aggfuncs_herichical( + def test_crosstab_with_totals_with_suppression_with_two_aggfuncs_hierarchical( data, acro ): """Test the crosstab with both margins and suppression are true From 47003c4b07f43bd70b7f3e8a58e69c3354b2c344 Mon Sep 17 00:00:00 2001 From: Maha Albashir Date: Sun, 8 Oct 2023 20:16:43 +0100 Subject: [PATCH 02/11] delete folders after tests --- test/test_initial.py | 16 ++++++++++++++++ test/test_stata_interface.py | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/test/test_initial.py b/test/test_initial.py index 9987aa8..1b63d36 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -32,6 +32,15 @@ def acro() -> ACRO: return ACRO(suppress=True) +def clean_up(name): + """Removes unwanted files or directory.""" + if os.path.exists(name): + if os.path.isfile(name): + os.remove(name) + elif os.path.isdir(name): + shutil.rmtree(name) + + def test_crosstab_without_suppression(data): """Crosstab threshold without automatic suppression.""" acro = ACRO(suppress=False) @@ -849,3 +858,10 @@ def test_crosstab_with_manual_totals_with_suppression_with_two_aggfunc( "We can not calculate the margins with a list of aggregation functions. " "Please create a table for each aggregation function" in caplog.text ) + + +def test_cleanup(): + """Gets rid of files created during tests.""" + names = ["test_outputs", "test_add_to_acro", "sdc_results", "RES_PYTEST"] + for name in names: + clean_up(name) diff --git a/test/test_stata_interface.py b/test/test_stata_interface.py index 5cc3253..561b15e 100644 --- a/test/test_stata_interface.py +++ b/test/test_stata_interface.py @@ -1,6 +1,7 @@ """This module contains unit tests for the stata interface.""" import os +import shutil import numpy as np import pandas as pd @@ -33,6 +34,15 @@ def data() -> pd.DataFrame: return data +def clean_up(name): + """Removes unwanted files or directory.""" + if os.path.exists(name): + if os.path.isfile(name): + os.remove(name) + elif os.path.isdir(name): + shutil.rmtree(name) + + def dummy_acrohandler( data, command, varlist, exclusion, exp, weights, options ): # pylint:disable=too-many-arguments @@ -736,3 +746,10 @@ def test_stata_unknown(data): ) correct = "acro command not recognised: foo" assert ret == correct, f"got:\n{ret}\nexpected:\n{correct}\n" + + +def test_cleanup(): + """Gets rid of files created during tests.""" + names = ["test_outputs"] + for name in names: + clean_up(name) From 6580386a00eba2a60e98310b089781224c0f0654 Mon Sep 17 00:00:00 2001 From: Maha Albashir Date: Sun, 8 Oct 2023 20:25:17 +0100 Subject: [PATCH 03/11] fixing pylint issues --- test/test_initial.py | 16 ---------------- test/test_stata_interface.py | 2 +- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index 1b63d36..9987aa8 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -32,15 +32,6 @@ def acro() -> ACRO: return ACRO(suppress=True) -def clean_up(name): - """Removes unwanted files or directory.""" - if os.path.exists(name): - if os.path.isfile(name): - os.remove(name) - elif os.path.isdir(name): - shutil.rmtree(name) - - def test_crosstab_without_suppression(data): """Crosstab threshold without automatic suppression.""" acro = ACRO(suppress=False) @@ -858,10 +849,3 @@ def test_crosstab_with_manual_totals_with_suppression_with_two_aggfunc( "We can not calculate the margins with a list of aggregation functions. " "Please create a table for each aggregation function" in caplog.text ) - - -def test_cleanup(): - """Gets rid of files created during tests.""" - names = ["test_outputs", "test_add_to_acro", "sdc_results", "RES_PYTEST"] - for name in names: - clean_up(name) diff --git a/test/test_stata_interface.py b/test/test_stata_interface.py index 561b15e..059091b 100644 --- a/test/test_stata_interface.py +++ b/test/test_stata_interface.py @@ -750,6 +750,6 @@ def test_stata_unknown(data): def test_cleanup(): """Gets rid of files created during tests.""" - names = ["test_outputs"] + names = ["test_outputs", "test_add_to_acro", "sdc_results", "RES_PYTEST"] for name in names: clean_up(name) From 3d47dc33b4af6110d7c240aa5fd06d2c6f68b4ae Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 17:22:42 +0000 Subject: [PATCH 04/11] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.4.0 → v4.5.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.4.0...v4.5.0) - [github.com/codespell-project/codespell: v2.2.5 → v2.2.6](https://github.com/codespell-project/codespell/compare/v2.2.5...v2.2.6) - [github.com/asottile/pyupgrade: v3.13.0 → v3.15.0](https://github.com/asottile/pyupgrade/compare/v3.13.0...v3.15.0) --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 910fe12..05ce7bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: # Standard hooks - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-merge-conflict - id: end-of-file-fixer @@ -26,7 +26,7 @@ repos: # Check for spelling - repo: https://github.com/codespell-project/codespell - rev: v2.2.5 + rev: v2.2.6 hooks: - id: codespell args: ["-L", "tre"] @@ -39,7 +39,7 @@ repos: # Upgrade old Python syntax - repo: https://github.com/asottile/pyupgrade - rev: v3.13.0 + rev: v3.15.0 hooks: - id: pyupgrade args: [--py310-plus] From c0b17cfde2bac84017f29d92a751a551415d9de9 Mon Sep 17 00:00:00 2001 From: Maha Albashir Date: Mon, 9 Oct 2023 19:53:47 +0100 Subject: [PATCH 05/11] deleting empty rows from table --- acro/acro_tables.py | 44 +++++++---- test/test_initial.py | 181 ++++++++++++++++++++++--------------------- 2 files changed, 118 insertions(+), 107 deletions(-) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index 4464ab4..a53ec37 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -133,22 +133,28 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals dropna, normalize, ) - # delete empty columns from table + # delete empty rows and columns from table + deleted_rows = [] deleted_cols = [] - for col in table.columns: - if table[col].sum() == 0: - table = table.drop(col, axis=1) - deleted_cols.append(col) + # define empty columns and rows using boolean masks + empty_cols_mask = table.sum(axis=0) == 0 + empty_rows_mask = table.sum(axis=1) == 0 + + deleted_cols = list(table.columns[empty_cols_mask]) + table = table.loc[:, ~empty_cols_mask] + deleted_rows = list(table.index[empty_rows_mask]) + table = table.loc[~empty_rows_mask, :] + # create a message with the deleted column's names - if len(deleted_cols) > 0: - deleted_cols = [ - f"{col}" for col in deleted_cols - ] # to handle column's names of type tuple - msg = ", ".join(deleted_cols) - comments = [f"Empty columns: {msg} were deleted."] - logger.info(comments) - else: # pragma: no cover - comments = None + comments = [] + if deleted_cols: + msg_cols = ", ".join(str(col) for col in deleted_cols) + comments.append(f"Empty columns: {msg_cols} were deleted.") + if deleted_rows: + msg_rows = ", ".join(str(row) for row in deleted_rows) + comments.append(f"Empty rows: {msg_rows} were deleted.") + if comments: + logger.info(" ".join(comments)) masks = create_crosstab_masks( index, @@ -565,10 +571,14 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals normalize=normalize, ) + # drop empty columns and rows if dropna or margins: - for col in t_values.columns: - if t_values[col].sum() == 0: - t_values = t_values.drop(col, axis=1) + empty_cols_mask = t_values.sum(axis=0) == 0 + empty_rows_mask = t_values.sum(axis=1) == 0 + + t_values = t_values.loc[:, ~empty_cols_mask] + t_values = t_values.loc[~empty_rows_mask, :] + t_values = t_values < THRESHOLD masks["threshold"] = t_values # check for negative values -- currently unsupported diff --git a/test/test_initial.py b/test/test_initial.py index aad4a3f..53dd57b 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -15,7 +15,6 @@ # pylint: disable=redefined-outer-name PATH: str = "RES_PYTEST" -RUN_TEST = False @pytest.fixture @@ -764,94 +763,96 @@ def test_pivot_table_with_totals_with_suppression(data, acro): assert "R/G" not in output.output[0].columns -if RUN_TEST: +def test_crosstab_multiple_aggregate_function(data, acro): + """Crosstab with multiple agg funcs.""" + acro = ACRO(suppress=False) - def test_crosstab_multiple_aggregate_function(data, acro): - """Crosstab with multiple agg funcs.""" - acro = ACRO(suppress=False) + _ = acro.crosstab( + data.year, data.grant_type, values=data.inc_grants, aggfunc=["mean", "std"] + ) + output = acro.results.get_index(0) + correct_summary: str = ( + "fail; threshold: 14 cells may need suppressing;" + " p-ratio: 4 cells may need suppressing; " + "nk-rule: 2 cells may need suppressing; " + ) + assert ( + output.summary == correct_summary + ), f"\n{output.summary}\n should be \n{correct_summary}\n" + print(f"{output.output[0]['mean'][ 'R/G'].sum()}") + correctval = 97383496.0 + errmsg = f"{output.output[0]['mean']['R/G'].sum()} should be {correctval}" + assert correctval == output.output[0]["mean"]["R/G"].sum(), errmsg - _ = acro.crosstab( - data.year, data.grant_type, values=data.inc_grants, aggfunc=["mean", "std"] - ) - output = acro.results.get_index(0) - correct_summary: str = ( - "fail; threshold: 14 cells may need suppressing;" - " p-ratio: 4 cells may need suppressing; " - "nk-rule: 2 cells may need suppressing; " - ) - assert ( - output.summary == correct_summary - ), f"\n{output.summary}\n should be \n{correct_summary}\n" - print(f"{output.output[0]['mean'][ 'R/G'].sum()}") - correctval = 97383496.0 - errmsg = f"{output.output[0]['mean']['R/G'].sum()} should be {correctval}" - assert correctval == output.output[0]["mean"]["R/G"].sum(), errmsg - - def test_crosstab_with_totals_with_suppression_with_two_aggfuncs(data, acro): - """Test the crosstab with both margins and suppression are true - and with a list of aggfuncs while using the total manual function. - """ - _ = acro.crosstab( - data.year, - data.grant_type, - values=data.inc_grants, - aggfunc=["count", "std"], - margins=True, - ) - _ = acro.crosstab( - data.year, - data.grant_type, - values=data.inc_grants, - aggfunc="count", - margins=True, - ) - _ = acro.crosstab( - data.year, - data.grant_type, - values=data.inc_grants, - aggfunc="std", - margins=True, - ) - output = acro.results.get_index(0) - assert 8 == output.output[0].shape[1] - output_1 = acro.results.get_index(1) - output_2 = acro.results.get_index(2) - output_3 = pd.concat([output_1.output[0], output_2.output[0]], axis=1) - output_4 = (output.output[0]).droplevel(0, axis=1) - assert output_3.equals(output_4) - - def test_crosstab_with_totals_with_suppression_with_two_aggfuncs_hierarchical( - data, acro - ): - """Test the crosstab with both margins and suppression are true - and with a list of aggfuncs and a list of columns while using - the total manual function. - """ - _ = acro.crosstab( - data.year, - [data.grant_type, data.survivor], - values=data.inc_grants, - aggfunc=["count", "std"], - margins=True, - ) - output = acro.results.get_index(0) - assert ("G", "Dead in 2015") in output.output[0].columns - - def test_crosstab_with_manual_totals_with_suppression_with_two_aggfunc( - data, acro, caplog - ): - """Test the crosstab with both margins and suppression are true - and with a list of aggfuncs while using the total manual function. - """ - _ = acro.crosstab( - data.year, - data.grant_type, - values=data.inc_grants, - aggfunc=["count", "std"], - margins=True, - show_suppressed=True, - ) - assert ( - "We can not calculate the margins with a list of aggregation functions. " - "Please create a table for each aggregation function" in caplog.text - ) + +def test_crosstab_with_totals_with_suppression_with_two_aggfuncs(data, acro): + """Test the crosstab with both margins and suppression are true + and with a list of aggfuncs while using the total manual function. + """ + _ = acro.crosstab( + data.year, + data.grant_type, + values=data.inc_grants, + aggfunc=["count", "std"], + margins=True, + ) + _ = acro.crosstab( + data.year, + data.grant_type, + values=data.inc_grants, + aggfunc="count", + margins=True, + ) + _ = acro.crosstab( + data.year, + data.grant_type, + values=data.inc_grants, + aggfunc="std", + margins=True, + ) + output = acro.results.get_index(0) + assert 8 == output.output[0].shape[1] + output_1 = acro.results.get_index(1) + output_2 = acro.results.get_index(2) + output_3 = pd.concat([output_1.output[0], output_2.output[0]], axis=1) + output_4 = (output.output[0]).droplevel(0, axis=1) + assert output_3.equals(output_4) + + +def test_crosstab_with_totals_with_suppression_with_two_aggfuncs_hierarchical( + data, acro +): + """Test the crosstab with both margins and suppression are true + and with a list of aggfuncs and a list of columns while using + the total manual function. + """ + _ = acro.crosstab( + data.year, + [data.grant_type, data.survivor], + values=data.inc_grants, + aggfunc=["count", "std"], + margins=True, + ) + output = acro.results.get_index(0) + assert ("count", "G", "Alive in 2015") in output.output[0].columns + assert ("std", "G", "Alive in 2015") in output.output[0].columns + + +def test_crosstab_with_manual_totals_with_suppression_with_two_aggfunc( + data, acro, caplog +): + """Test the crosstab with both margins and suppression are true + and with a list of aggfuncs while using the total manual function. + """ + _ = acro.crosstab( + data.year, + data.grant_type, + values=data.inc_grants, + aggfunc=["count", "std"], + margins=True, + show_suppressed=True, + ) + assert ( + "We can not calculate the margins with a list of aggregation functions. " + "Please create a table for each aggregation function" in caplog.text + ) From 05c49ca50aa658902c6dc4894c560fd648004522 Mon Sep 17 00:00:00 2001 From: Maha Albashir Date: Mon, 9 Oct 2023 20:00:13 +0100 Subject: [PATCH 06/11] fixing code coverage --- test/test_initial.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index 53dd57b..ef46863 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -50,13 +50,25 @@ def test_crosstab_with_aggfunc_sum(data, acro): values=data.inc_grants, aggfunc="sum", ) + _ = acro.crosstab( + [data.grant_type, data.survivor], + data.year, + values=data.inc_grants, + aggfunc="sum", + ) acro.add_exception("output_0", "Let me have it") + acro.add_exception("output_1", "I need this output") results: Records = acro.finalise() output_0 = results.get_index(0) - comment = ( + output_1 = results.get_index(1) + comment_0 = ( "Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted." ) - assert output_0.comments == [comment] + comment_1 = ( + "Empty rows: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted." + ) + assert output_0.comments == [comment_0] + assert output_1.comments == [comment_1] def test_crosstab_threshold(data, acro): From f3396315f4bfb3a8c926abaeee7b0d9c23bbf53c Mon Sep 17 00:00:00 2001 From: Richard Preen Date: Tue, 10 Oct 2023 13:21:29 +0100 Subject: [PATCH 07/11] fix spelling --- docs/ACRO_For_Researchers.md | 2 +- test/test_stata_interface.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ACRO_For_Researchers.md b/docs/ACRO_For_Researchers.md index 5b778e1..4a42bd3 100644 --- a/docs/ACRO_For_Researchers.md +++ b/docs/ACRO_For_Researchers.md @@ -89,7 +89,7 @@ The finalise function will: ## Frequently Asked Questions ### What if I want to run my code many times before I decide exactly what to send for approval? -ACRO naturally suppors this way of working. It will not produce the output folder until you are satisfied and add acro.finalise() to the end of your script. +ACRO naturally supports this way of working. It will not produce the output folder until you are satisfied and add acro.finalise() to the end of your script. ### Why is my data exported as unformatted .csv files? The outputs are saved in row format (as csv files) for the output checkers to check and make decisions. Although, you can change the format, if you like, the csv files should be there for the checking. ### Why is ACRO Python-based ‘under-the-hood’? diff --git a/test/test_stata_interface.py b/test/test_stata_interface.py index 059091b..5cec20e 100644 --- a/test/test_stata_interface.py +++ b/test/test_stata_interface.py @@ -290,7 +290,7 @@ def test_stata_rename_outputs(): def test_stata_incomplete_output_commands(): - """Tests handling incomplete or wony outpu commands + """Tests handling incomplete or wrong output commands assumes simple table has been created by earlier tests. """ # output to change not provided From 146b704812004af4f6a01394e248aeeab86ffc3d Mon Sep 17 00:00:00 2001 From: Jim-smith Date: Tue, 10 Oct 2023 17:35:41 +0100 Subject: [PATCH 08/11] Update acro_tables.py Added text to docstring Signed-off-by: Jim-smith --- acro/acro_tables.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index a53ec37..dc18f8b 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -69,6 +69,9 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals """Compute a simple cross tabulation of two (or more) factors. By default, computes a frequency table of the factors unless an array of values and an aggregation function are passed. + + To provide consistent behaviour with different aggregation functions, + 'empty' rows or columns -i.e. that are all NaN or 0 (count,sum) are removed. Parameters ---------- From 628c8e1cbe3d2fdec055d5f2febf876e04418664 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Oct 2023 16:36:22 +0000 Subject: [PATCH 09/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- acro/acro_tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index dc18f8b..07aa229 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -69,7 +69,7 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals """Compute a simple cross tabulation of two (or more) factors. By default, computes a frequency table of the factors unless an array of values and an aggregation function are passed. - + To provide consistent behaviour with different aggregation functions, 'empty' rows or columns -i.e. that are all NaN or 0 (count,sum) are removed. From 79e28526e4c19b7fae0d0e1a05b8167165bfa4bf Mon Sep 17 00:00:00 2001 From: Jim-smith Date: Tue, 10 Oct 2023 17:40:08 +0100 Subject: [PATCH 10/11] Update test.ipynb fixing typos Signed-off-by: Jim-smith --- notebooks/test.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb index 29ecbef..b37af78 100644 --- a/notebooks/test.ipynb +++ b/notebooks/test.ipynb @@ -569,7 +569,7 @@ "id": "6d4730c4", "metadata": {}, "source": [ - "### ACRO crosstab with supression" + "### ACRO crosstab with suppression" ] }, { @@ -708,7 +708,7 @@ "id": "0c695e09", "metadata": {}, "source": [ - "### ACRO crosstab with supression and totals" + "### ACRO crosstab with suppression and totals" ] }, { From 65e41e228fe580e2021596ecf4c79e053633b542 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 23:06:14 +0000 Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- notebooks/acro_demo.py | 43 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/notebooks/acro_demo.py b/notebooks/acro_demo.py index 8aaa7ec..8c46a7f 100644 --- a/notebooks/acro_demo.py +++ b/notebooks/acro_demo.py @@ -5,6 +5,7 @@ # import libraries import os + import pandas as pd from scipy.io.arff import loadarff @@ -19,12 +20,12 @@ acro = ACRO(suppress=False) # Load test data -# The dataset used in this notebook is the nursery dataset from OpenML. -# - In this version, the data can be read directly from the local machine after it has been downloaded. +# The dataset used in this notebook is the nursery dataset from OpenML. +# - In this version, the data can be read directly from the local machine after it has been downloaded. # - The code below reads the data from a folder called "data" which we assume is at the same level as the folder where you are working. # - The path might need to be changed if the data has been downloaded and stored elsewhere. -# - for example use: -# path = os.path.join("data", "nursery.arff") +# - for example use: +# path = os.path.join("data", "nursery.arff") # if the data is in a sub-folder of your work folder path = os.path.join("../data", "nursery.arff") @@ -36,12 +37,12 @@ df.head() # Examples of producing tabular output -# We rely on the industry-standard package **pandas** for tabulating data. +# We rely on the industry-standard package **pandas** for tabulating data. # In the next few examples we show: # - first, how a researcher would normally make a call in pandas, saving the results in a variable that they can view on screen (or save to file?) # - then how the call is identical in SACRO, except that: # - "pd" is replaced by "acro" -# - the researcher immediately sees a copy of what the TRE output checker will see. +# - the researcher immediately sees a copy of what the TRE output checker will see. print( "\nThese examples show acro wrappers around " @@ -50,7 +51,7 @@ # Pandas crosstab -# This is an example of crosstab using pandas. +# This is an example of crosstab using pandas. # We first make the call, then the second line print the outputs to screen. print("\nCalling crosstab of recommendation by parents using pandas") @@ -58,20 +59,18 @@ print(table) # ACRO crosstab -# - This is an example of crosstab using ACRO. +# - This is an example of crosstab using ACRO. # - The INFO lines show the researcher what will be reported to the output checkers. # - Then the (suppressed as necessary) table is shown via the print command as before. print("\nNow the same crosstab call using the ACRO interface") -safe_table = acro.crosstab( - df.recommend, df.parents -) +safe_table = acro.crosstab(df.recommend, df.parents) print("\nand this is the researchers output") print(safe_table) # ACRO crosstab with suppression # - This is an example of crosstab with suppressing the cells that violate the disclosure tests. -# - Note that you need to change the value of the suppress variable in the acro object to True. Then run the crosstab command. +# - Note that you need to change the value of the suppress variable in the acro object to True. Then run the crosstab command. # - If you wish to continue the research while suppressing the outputs, leave the suppress variable as it is, otherwise turn it off. print("\nTurn on the suppression variable") @@ -84,7 +83,7 @@ acro.suppress = False # ACRO functionality to let users manage their outputs -# +# # 1: List current ACRO outputs # This is an example of using the print_output function to list all the outputs created so far @@ -95,10 +94,10 @@ ) acro.print_outputs() -# 2: Remove some ACRO outputs before finalising -# This is an example of deleting some of the ACRO outputs. -# The name of the output that needs to be removed should be passed to the function remove_output. -# - The output name can be taken from the outputs listed by the print_outputs function, +# 2: Remove some ACRO outputs before finalising +# This is an example of deleting some of the ACRO outputs. +# The name of the output that needs to be removed should be passed to the function remove_output. +# - The output name can be taken from the outputs listed by the print_outputs function, # - or by listing the results and choosing the specific output that needs to be removed print("\nNow removing the first output") @@ -111,16 +110,16 @@ acro.rename_output("output_1", "cross_tabulation") # 4: Add a comment to output -# This is an example to add a comment to outputs. +# This is an example to add a comment to outputs. # It can be used to provide a description or to pass additional information to the output checkers. print("\nUsers can add comments which the output checkers will see.") acro.add_comments("cross_tabulation", "Please let me have this data.") # 5: (the big one) Finalise ACRO -# This is an example of the function _finalise()_ which the users must call at the end of each session. -# - It takes each output and saves it to a CSV file. -# - It also saves the SDC analysis for each output to a json file or Excel file +# This is an example of the function _finalise()_ which the users must call at the end of each session. +# - It takes each output and saves it to a CSV file. +# - It also saves the SDC analysis for each output to a json file or Excel file # (depending on the extension of the name of the file provided as an input to the function) print( @@ -128,5 +127,3 @@ " If they don't, the SDC analysis, and their outputs, are lost." ) output = acro.finalise("Examples", "json") - -