
Commit

Add integration tests
frances-h committed Dec 4, 2024
1 parent 34c1e82 commit bbe6fe2
Showing 3 changed files with 152 additions and 5 deletions.
5 changes: 5 additions & 0 deletions sdmetrics/single_table/privacy/disclosure_protection.py
@@ -3,6 +3,7 @@
import numpy as np
import pandas as pd

+from sdmetrics.goal import Goal
from sdmetrics.single_table.base import SingleTableMetric
from sdmetrics.single_table.privacy.cap import (
CategoricalCAP,
@@ -20,6 +21,10 @@
class DisclosureProtection(SingleTableMetric):
"""The DisclosureProtection metric."""

+    goal = Goal.MAXIMIZE
+    min_value = 0
+    max_value = 1

@classmethod
def _validate_inputs(
cls,
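Note: goal, min_value, and max_value are the class attributes that SDMetrics' base metric machinery reads when interpreting a raw score. A minimal sketch of what they enable (assuming the standard normalize classmethod that SDMetrics base metrics expose, which maps a raw score onto [0, 1] using these three attributes):

    from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection

    # With goal=MAXIMIZE over a [0, 1] range, normalization is effectively
    # the identity: higher raw scores already mean stronger protection.
    normalized = DisclosureProtection.normalize(0.88)
    assert 0 <= normalized <= 1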
144 changes: 144 additions & 0 deletions tests/integration/single_table/privacy/test_disclosure_protection.py
@@ -0,0 +1,144 @@
import numpy as np
import pandas as pd
import pytest

from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection


@pytest.fixture
def real_data():
return pd.DataFrame({
'key1': ['a', 'b', 'c', 'd', 'e'] * 20,
'key2': range(100),
'sensitive1': ['a', 'b', 'c', 'd', 'e'] * 20,
'sensitive2': range(100),
})


@pytest.fixture
def perfect_synthetic_data():
random_state = np.random.RandomState(42)

return pd.DataFrame({
'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
'key2': range(20),
'sensitive1': random_state.choice(['f', 'g', 'h', 'i', 'j'], 20),
'sensitive2': random_state.randint(5, 10, size=20),
})


@pytest.fixture
def good_synthetic_data():
random_state = np.random.RandomState(42)
return pd.DataFrame({
'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
'key2': random_state.randint(0, 5, size=20),
'sensitive1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
'sensitive2': random_state.randint(0, 5, size=20),
})


@pytest.fixture
def bad_synthetic_data():
return pd.DataFrame({
'key1': ['a', 'b', 'c', 'd', 'e'] * 20,
'key2': range(100),
'sensitive1': ['a', 'b', 'c', 'e', 'd'] * 20,
'sensitive2': range(100),
})


class TestDisclosureProtection:
def test_end_to_end_perfect(self, real_data, perfect_synthetic_data):
"""Test DisclosureProtection metric end to end with perfect synthetic data."""
# Setup
sensitive_columns = ['sensitive1', 'sensitive2']
known_columns = ['key1', 'key2']
        continuous_columns = ['key2', 'sensitive2']

# Run
score_breakdown = DisclosureProtection.compute_breakdown(
real_data,
perfect_synthetic_data,
sensitive_column_names=sensitive_columns,
known_column_names=known_columns,
            continuous_column_names=continuous_columns,
num_discrete_bins=10,
)

# Assert
assert score_breakdown == {'score': 1, 'cap_protection': 1, 'baseline_protection': 0.98}
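(The 0.98 baseline is consistent with random guessing over the discretized sensitive columns: sensitive1 has 5 categories and sensitive2 is binned into 10 discrete bins, so a random guess matches with probability 1/(5 * 10) = 0.02, giving a baseline protection of 1 - 0.02 = 0.98. This is an inference from the fixtures above, not necessarily the exact implementation.)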

def test_end_to_end_good(self, real_data, good_synthetic_data):
"""Test DisclosureProtection metric end to end with good synthetic data."""
# Setup
sensitive_columns = ['sensitive1', 'sensitive2']
known_columns = ['key1', 'key2']
continuous_columns = ['key2', 'sensitive2']

# Run
score_breakdown = DisclosureProtection.compute_breakdown(
real_data,
good_synthetic_data,
sensitive_column_names=sensitive_columns,
known_column_names=known_columns,
continuous_column_names=continuous_columns,
num_discrete_bins=10,
)

# Assert
assert score_breakdown == {
'score': 0.8979591836734694,
'cap_protection': 0.88,
'baseline_protection': 0.98,
}
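(A quick consistency check on the asserted values: 0.88 / 0.98 ≈ 0.8979591836734694, so the reported score behaves like the CAP protection scaled by the baseline protection; the bad-data case below fits the same pattern, 0.4 / 0.98 ≈ 0.40816326530612246. This is an observed relationship between the asserted numbers, not a statement of the exact formula.)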

def test_end_to_end_bad(self, real_data, bad_synthetic_data):
"""Test DisclosureProtection metric end to end with bad synthetic data."""
# Setup
sensitive_columns = ['sensitive1', 'sensitive2']
known_columns = ['key1', 'key2']
continuous_columns = ['key2', 'sensitive2']

# Run
score_breakdown = DisclosureProtection.compute_breakdown(
real_data,
bad_synthetic_data,
sensitive_column_names=sensitive_columns,
known_column_names=known_columns,
continuous_column_names=continuous_columns,
num_discrete_bins=10,
)

# Assert
assert score_breakdown == {
'score': 0.40816326530612246,
'cap_protection': 0.4,
'baseline_protection': 0.98,
}

@pytest.mark.parametrize('cap_method', ['cap', 'zero_cap', 'generalized_cap'])
def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
"""Test DisclosureProtection metric with all possible CAP methods."""
# Setup
sensitive_columns = ['sensitive1', 'sensitive2']
known_columns = ['key1', 'key2']
continuous_columns = ['key2', 'sensitive2']

# Run
score_breakdown = DisclosureProtection.compute_breakdown(
real_data,
perfect_synthetic_data,
sensitive_column_names=sensitive_columns,
known_column_names=known_columns,
continuous_column_names=continuous_columns,
computation_method=cap_method,
num_discrete_bins=10,
)

# Assert
assert score_breakdown == {
'score': 1.0,
'cap_protection': 1.0,
'baseline_protection': 0.98,
}
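These tests can be run directly, e.g. python -m pytest tests/integration/single_table/privacy/test_disclosure_protection.py.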
8 changes: 3 additions & 5 deletions tests/unit/single_table/privacy/test_disclosure_protection.py
@@ -1,4 +1,4 @@
"""Test for the disclosure protection metrics."""
"""Test for the disclosure metrics."""

import re
from unittest.mock import Mock, patch
@@ -128,11 +128,9 @@ def test__discreteize_column(self):

# Assert
expected_real = pd.Series(pd.Categorical(['0', '0', '2', '3', '4']))
-        pd.testing.assert_series_equal(binned_real, expected_real, check_categorical=False)
+        np.testing.assert_array_equal(binned_real, expected_real)
expected_synthetic = pd.Series(pd.Categorical(['0', '0', '1', '2', '3', '4', '4']))
-        pd.testing.assert_series_equal(
-            binned_synthetic, expected_synthetic, check_categorical=False
-        )
+        np.testing.assert_array_equal(binned_synthetic, expected_synthetic)

def test__compute_baseline(self):
"""Test computing the baseline score for random data."""
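The assertion swap above replaces a pandas Series comparison, which also checks dtype, index, and category metadata, with a plain element-wise value comparison. A standalone illustration of the difference (not code from this repository):

    import numpy as np
    import pandas as pd

    left = pd.Series(pd.Categorical(['0', '0', '2']))
    right = pd.Series(['0', '0', '2'])  # same values, plain object dtype

    # Passes: only the element values are compared.
    np.testing.assert_array_equal(left, right)

    # pd.testing.assert_series_equal(left, right) would raise instead,
    # because the dtypes (category vs. object) differ even though the
    # values are equal.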
