From bbe6fe2874f2e390789f9757483ba50a011444ce Mon Sep 17 00:00:00 2001
From: Frances Hartwell
Date: Mon, 2 Dec 2024 13:19:34 -0500
Subject: [PATCH] Add integration tests

---
 .../privacy/disclosure_protection.py      |   5 +
 .../privacy/test_disclosure_protection.py | 144 ++++++++++++++++++
 .../privacy/test_disclosure_protection.py |   8 +-
 3 files changed, 152 insertions(+), 5 deletions(-)
 create mode 100644 tests/integration/single_table/privacy/test_disclosure_protection.py

diff --git a/sdmetrics/single_table/privacy/disclosure_protection.py b/sdmetrics/single_table/privacy/disclosure_protection.py
index bad5b794..05807260 100644
--- a/sdmetrics/single_table/privacy/disclosure_protection.py
+++ b/sdmetrics/single_table/privacy/disclosure_protection.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 
+from sdmetrics.goal import Goal
 from sdmetrics.single_table.base import SingleTableMetric
 from sdmetrics.single_table.privacy.cap import (
     CategoricalCAP,
@@ -20,6 +21,10 @@
 class DisclosureProtection(SingleTableMetric):
     """The DisclosureProtection metric."""
 
+    goal = Goal.MAXIMIZE
+    min_value = 0
+    max_value = 1
+
     @classmethod
     def _validate_inputs(
         cls,
diff --git a/tests/integration/single_table/privacy/test_disclosure_protection.py b/tests/integration/single_table/privacy/test_disclosure_protection.py
new file mode 100644
index 00000000..da1af6c3
--- /dev/null
+++ b/tests/integration/single_table/privacy/test_disclosure_protection.py
@@ -0,0 +1,144 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+
+
+@pytest.fixture
+def real_data():
+    return pd.DataFrame({
+        'key1': ['a', 'b', 'c', 'd', 'e'] * 20,
+        'key2': range(100),
+        'sensitive1': ['a', 'b', 'c', 'd', 'e'] * 20,
+        'sensitive2': range(100),
+    })
+
+
+@pytest.fixture
+def perfect_synthetic_data():
+    random_state = np.random.RandomState(42)
+
+    return pd.DataFrame({
+        'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
+        'key2': range(20),
+        'sensitive1': random_state.choice(['f', 'g', 'h', 'i', 'j'], 20),
+        'sensitive2': random_state.randint(5, 10, size=20),
+    })
+
+
+@pytest.fixture
+def good_synthetic_data():
+    random_state = np.random.RandomState(42)
+    return pd.DataFrame({
+        'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
+        'key2': random_state.randint(0, 5, size=20),
+        'sensitive1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
+        'sensitive2': random_state.randint(0, 5, size=20),
+    })
+
+
+@pytest.fixture
+def bad_synthetic_data():
+    return pd.DataFrame({
+        'key1': ['a', 'b', 'c', 'd', 'e'] * 20,
+        'key2': range(100),
+        'sensitive1': ['a', 'b', 'c', 'e', 'd'] * 20,
+        'sensitive2': range(100),
+    })
+
+
+class TestDisclosureProtection:
+    def test_end_to_end_perfect(self, real_data, perfect_synthetic_data):
+        """Test DisclosureProtection metric end to end with perfect synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtection.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+        )
+
+        # Assert
+        assert score_breakdown == {'score': 1, 'cap_protection': 1, 'baseline_protection': 0.98}
+
+    def test_end_to_end_good(self, real_data, good_synthetic_data):
+        """Test DisclosureProtection metric end to end with good synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtection.compute_breakdown(
+            real_data,
+            good_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 0.8979591836734694,
+            'cap_protection': 0.88,
+            'baseline_protection': 0.98,
+        }
+
+    def test_end_to_end_bad(self, real_data, bad_synthetic_data):
+        """Test DisclosureProtection metric end to end with bad synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtection.compute_breakdown(
+            real_data,
+            bad_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 0.40816326530612246,
+            'cap_protection': 0.4,
+            'baseline_protection': 0.98,
+        }
+
+    @pytest.mark.parametrize('cap_method', ['cap', 'zero_cap', 'generalized_cap'])
+    def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
+        """Test DisclosureProtection metric with all possible CAP methods."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtection.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            computation_method=cap_method,
+            num_discrete_bins=10,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1.0,
+            'cap_protection': 1.0,
+            'baseline_protection': 0.98,
+        }
diff --git a/tests/unit/single_table/privacy/test_disclosure_protection.py b/tests/unit/single_table/privacy/test_disclosure_protection.py
index bc8f3bb5..d1803c8c 100644
--- a/tests/unit/single_table/privacy/test_disclosure_protection.py
+++ b/tests/unit/single_table/privacy/test_disclosure_protection.py
@@ -1,4 +1,4 @@
-"""Test for the disclosure protection metrics."""
+"""Test for the disclosure metrics."""
 
 import re
 from unittest.mock import Mock, patch
@@ -128,11 +128,9 @@ def test__discreteize_column(self):
         # Assert
         expected_real = pd.Series(pd.Categorical(['0', '0', '2', '3', '4']))
-        pd.testing.assert_series_equal(binned_real, expected_real, check_categorical=False)
+        np.testing.assert_array_equal(binned_real, expected_real)
 
         expected_synthetic = pd.Series(pd.Categorical(['0', '0', '1', '2', '3', '4', '4']))
-        pd.testing.assert_series_equal(
-            binned_synthetic, expected_synthetic, check_categorical=False
-        )
+        np.testing.assert_array_equal(binned_synthetic, expected_synthetic)
 
     def test__compute_baseline(self):
         """Test computing the baseline score for random data."""
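
Note for reviewers: below is a minimal standalone sketch of the API the new integration tests exercise, for trying the same code path outside pytest. The toy data and column names are illustrative only and are not part of the patch.

import pandas as pd

from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection

# Illustrative toy data: 'key' is treated as known to an attacker, 'sensitive' as the target.
real = pd.DataFrame({
    'key': ['a', 'b', 'c', 'd'] * 25,
    'sensitive': range(100),
})
synthetic = pd.DataFrame({
    'key': ['a', 'b', 'c', 'd'] * 5,
    'sensitive': range(20),
})

# Same call signature the new integration tests use; continuous columns are discretized
# into num_discrete_bins categories before the CAP-based protection score is computed.
breakdown = DisclosureProtection.compute_breakdown(
    real,
    synthetic,
    known_column_names=['key'],
    sensitive_column_names=['sensitive'],
    continuous_column_names=['sensitive'],
    num_discrete_bins=10,
)
print(breakdown)  # expected keys: 'score', 'cap_protection', 'baseline_protection'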