From bbe6fe2874f2e390789f9757483ba50a011444ce Mon Sep 17 00:00:00 2001
From: Frances Hartwell
Date: Mon, 2 Dec 2024 13:19:34 -0500
Subject: [PATCH] Add integration tests

---
 .../privacy/disclosure_protection.py      |   5 +
 .../privacy/test_disclosure_protection.py | 144 ++++++++++++++++++
 .../privacy/test_disclosure_protection.py |   8 +-
 3 files changed, 152 insertions(+), 5 deletions(-)
 create mode 100644 tests/integration/single_table/privacy/test_disclosure_protection.py

diff --git a/sdmetrics/single_table/privacy/disclosure_protection.py b/sdmetrics/single_table/privacy/disclosure_protection.py
index bad5b794..05807260 100644
--- a/sdmetrics/single_table/privacy/disclosure_protection.py
+++ b/sdmetrics/single_table/privacy/disclosure_protection.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 
+from sdmetrics.goal import Goal
 from sdmetrics.single_table.base import SingleTableMetric
 from sdmetrics.single_table.privacy.cap import (
     CategoricalCAP,
@@ -20,6 +21,10 @@
 class DisclosureProtection(SingleTableMetric):
     """The DisclosureProtection metric."""
 
+    goal = Goal.MAXIMIZE
+    min_value = 0
+    max_value = 1
+
     @classmethod
     def _validate_inputs(
         cls,
diff --git a/tests/integration/single_table/privacy/test_disclosure_protection.py b/tests/integration/single_table/privacy/test_disclosure_protection.py
new file mode 100644
index 00000000..da1af6c3
--- /dev/null
+++ b/tests/integration/single_table/privacy/test_disclosure_protection.py
@@ -0,0 +1,144 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+
+
+@pytest.fixture
+def real_data():
+    return pd.DataFrame({
+        'key1': ['a', 'b', 'c', 'd', 'e'] * 20,
+        'key2': range(100),
+        'sensitive1': ['a', 'b', 'c', 'd', 'e'] * 20,
+        'sensitive2': range(100),
+    })
+
+
+@pytest.fixture
+def perfect_synthetic_data():
+    random_state = np.random.RandomState(42)
+
+    return pd.DataFrame({
+        'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
+        'key2': range(20),
+        'sensitive1': random_state.choice(['f', 'g', 'h', 'i', 'j'], 20),
+        'sensitive2': random_state.randint(5, 10, size=20),
+    })
+
+
+@pytest.fixture
+def good_synthetic_data():
+    random_state = np.random.RandomState(42)
+    return pd.DataFrame({
+        'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
+        'key2': random_state.randint(0, 5, size=20),
+        'sensitive1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
+        'sensitive2': random_state.randint(0, 5, size=20),
+    })
+
+
+@pytest.fixture
+def bad_synthetic_data():
+    return pd.DataFrame({
+        'key1': ['a', 'b', 'c', 'd', 'e'] * 20,
+        'key2': range(100),
+        'sensitive1': ['a', 'b', 'c', 'e', 'd'] * 20,
+        'sensitive2': range(100),
+    })
+
+
+class TestDisclosureProtection:
+    def test_end_to_end_perfect(self, real_data, perfect_synthetic_data):
+        """Test DisclosureProtection metric end to end with perfect synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtection.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+        )
+
+        # Assert
+        assert score_breakdown == {'score': 1, 'cap_protection': 1, 'baseline_protection': 0.98}
+
+    def test_end_to_end_good(self, real_data, good_synthetic_data):
+        """Test DisclosureProtection metric end to end with good synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtection.compute_breakdown(
+            real_data,
+            good_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 0.8979591836734694,
+            'cap_protection': 0.88,
+            'baseline_protection': 0.98,
+        }
+
+    def test_end_to_end_bad(self, real_data, bad_synthetic_data):
+        """Test DisclosureProtection metric end to end with bad synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtection.compute_breakdown(
+            real_data,
+            bad_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 0.40816326530612246,
+            'cap_protection': 0.4,
+            'baseline_protection': 0.98,
+        }
+
+    @pytest.mark.parametrize('cap_method', ['cap', 'zero_cap', 'generalized_cap'])
+    def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
+        """Test DisclosureProtection metric with all possible CAP methods."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtection.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            computation_method=cap_method,
+            num_discrete_bins=10,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1.0,
+            'cap_protection': 1.0,
+            'baseline_protection': 0.98,
+        }
diff --git a/tests/unit/single_table/privacy/test_disclosure_protection.py b/tests/unit/single_table/privacy/test_disclosure_protection.py
index bc8f3bb5..d1803c8c 100644
--- a/tests/unit/single_table/privacy/test_disclosure_protection.py
+++ b/tests/unit/single_table/privacy/test_disclosure_protection.py
@@ -1,4 +1,4 @@
-"""Test for the disclosure protection metrics."""
+"""Test for the disclosure metrics."""
 
 import re
 from unittest.mock import Mock, patch
@@ -128,11 +128,9 @@ def test__discreteize_column(self):
         # Assert
         expected_real = pd.Series(pd.Categorical(['0', '0', '2', '3', '4']))
-        pd.testing.assert_series_equal(binned_real, expected_real, check_categorical=False)
+        np.testing.assert_array_equal(binned_real, expected_real)
 
         expected_synthetic = pd.Series(pd.Categorical(['0', '0', '1', '2', '3', '4', '4']))
-        pd.testing.assert_series_equal(
-            binned_synthetic, expected_synthetic, check_categorical=False
-        )
+        np.testing.assert_array_equal(binned_synthetic, expected_synthetic)
 
     def test__compute_baseline(self):
         """Test computing the baseline score for random data."""
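
Note for reviewers: below is a minimal standalone sketch of the API the new integration tests exercise, for trying the same code path outside pytest. The toy data and column names are illustrative only and are not part of the patch.

import pandas as pd

from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection

# Illustrative toy data: 'key' is treated as known to an attacker, 'sensitive' as the target.
real = pd.DataFrame({
    'key': ['a', 'b', 'c', 'd'] * 25,
    'sensitive': range(100),
})
synthetic = pd.DataFrame({
    'key': ['a', 'b', 'c', 'd'] * 5,
    'sensitive': range(20),
})

# Same call signature the new integration tests use; continuous columns are discretized
# into num_discrete_bins categories before the CAP-based protection score is computed.
breakdown = DisclosureProtection.compute_breakdown(
    real,
    synthetic,
    known_column_names=['key'],
    sensitive_column_names=['sensitive'],
    continuous_column_names=['sensitive'],
    num_discrete_bins=10,
)
print(breakdown)  # expected keys: 'score', 'cap_protection', 'baseline_protection'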