Add tests
frances-h committed Dec 5, 2024
1 parent 677a5af commit 5a624e6
Showing 3 changed files with 306 additions and 23 deletions.
25 changes: 14 additions & 11 deletions sdmetrics/single_table/privacy/disclosure_protection.py
@@ -339,15 +339,18 @@ def _compute_estimated_cap_metric(

         cap_metric = CAP_METHODS.get(computation_method)
         estimated_score_sum = 0
-        for iter in estimation_iterator:
+        for i in estimation_iterator:
+            real_data_samp = real_data.sample(min(num_rows_subsample, len(real_data)))
+            synth_data_samp = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
+
             estimated_cap_protection = cap_metric.compute(
-                real_data.sample(min(num_rows_subsample, len(real_data))),
-                synthetic_data.sample(min(num_rows_subsample, len(synthetic_data))),
+                real_data_samp,
+                synth_data_samp,
                 key_fields=known_column_names,
                 sensitive_fields=sensitive_column_names,
             )
             estimated_score_sum += estimated_cap_protection
-            average_computed_score = estimated_score_sum / (iter + 1.0)
+            average_computed_score = estimated_score_sum / (i + 1.0)
             if baseline_protection == 0:
                 average_score = 0 if average_computed_score == 0 else 1
             else:
@@ -438,13 +441,13 @@ def compute_breakdown(
         average_score, average_computed_score = cls._compute_estimated_cap_metric(
             real_data,
             synthetic_data,
-            baseline_protection,
-            known_column_names,
-            sensitive_column_names,
-            computation_method,
-            num_rows_subsample,
-            num_iterations,
-            verbose,
+            baseline_protection=baseline_protection,
+            known_column_names=known_column_names,
+            sensitive_column_names=sensitive_column_names,
+            computation_method=computation_method,
+            num_rows_subsample=num_rows_subsample,
+            num_iterations=num_iterations,
+            verbose=verbose,
         )

         return {
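For context, the refactored loop estimates CAP protection by repeatedly subsampling both tables, scoring each pair of subsamples with the selected CAP variant, and keeping a running mean. A minimal standalone sketch of that pattern, where cap_compute is a hypothetical stand-in for cap_metric.compute:

def estimate_cap_protection(real_data, synthetic_data, cap_compute, num_rows_subsample, num_iterations):
    """Average CAP protection over repeated subsamples (illustrative sketch only)."""
    estimated_score_sum = 0
    for i in range(num_iterations):
        # Subsample both tables, capped at their actual number of rows.
        real_samp = real_data.sample(min(num_rows_subsample, len(real_data)))
        synth_samp = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
        estimated_score_sum += cap_compute(real_samp, synth_samp)
        average_computed_score = estimated_score_sum / (i + 1.0)
    return average_computed_score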
@@ -2,7 +2,10 @@
 import pandas as pd
 import pytest

-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)


 @pytest.fixture
@@ -23,7 +26,7 @@ def perfect_synthetic_data():
         'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
         'key2': range(20),
         'sensitive1': random_state.choice(['f', 'g', 'h', 'i', 'j'], 20),
-        'sensitive2': random_state.randint(5, 10, size=20),
+        'sensitive2': random_state.randint(100, 200, size=20),
     })

@@ -142,3 +145,53 @@ def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
             'cap_protection': 1.0,
             'baseline_protection': 0.98,
         }
+
+
+class TestDisclosureProtectionEstimate:
+    def test_end_to_end_perfect(self, real_data, perfect_synthetic_data):
+        """Test DisclosureProtectionEstimate metric end to end with perfect synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continous_columns,
+            num_discrete_bins=10,
+            num_rows_subsample=20,
+        )
+
+        # Assert
+        assert score_breakdown == {'score': 1, 'cap_protection': 1, 'baseline_protection': 0.98}
+
+    @pytest.mark.parametrize('cap_method', ['cap', 'zero_cap', 'generalized_cap'])
+    def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
+        """Test DisclosureProtectionEstimate metric with all possible CAP methods."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            computation_method=cap_method,
+            num_discrete_bins=10,
+            num_rows_subsample=20,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1.0,
+            'cap_protection': 1.0,
+            'baseline_protection': 0.98,
+        }
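The 0.98 baseline_protection asserted above is consistent with a baseline of 1 - 1/(number of possible sensitive-value combinations): sensitive1 has 5 categories and sensitive2 is binned into num_discrete_bins=10 buckets, so 1 - 1/50 = 0.98. This formula is inferred from the expected values here and from the 0.5 and 0.875 baselines in the unit tests below, not quoted from the implementation; widening the fixture's sensitive2 range to randint(100, 200) plausibly ensures the binned values actually spread across the requested bins. A quick check under that assumption:

# Assumed baseline formula, inferred from the expected test values.
num_combinations = 5 * 10  # 5 sensitive1 categories x 10 discrete bins for sensitive2
baseline_protection = 1 - 1 / num_combinations
assert round(baseline_protection, 2) == 0.98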
247 changes: 237 additions & 10 deletions tests/unit/single_table/privacy/test_disclosure_protection.py
@@ -1,13 +1,17 @@
"""Test for the disclosure metrics."""

import re
from unittest.mock import Mock, patch
from unittest.mock import MagicMock, Mock, call, patch

import numpy as np
import pandas as pd
import pytest

from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
from sdmetrics.single_table.privacy.disclosure_protection import (
DisclosureProtection,
DisclosureProtectionEstimate,
)
from tests.utils import DataFrameMatcher


class TestDisclosureProtection:
@@ -151,21 +155,51 @@ def test__discretize_column_float_dtypes(self, dtype):
         expected_synthetic = np.array(['0', '0', '1', np.nan, '3', np.nan, '4'], dtype='object')
         assert list(binned_synthetic) == list(expected_synthetic)

+    def test__discretize_and_fillna(self):
+        """Test helper method to discretize continous columns and fill nan values."""
+        # Setup
+        real_data = pd.DataFrame({
+            'known': ['A', 'A', pd.NA, 'B', 'B'],
+            'continous': [0, 1, 3, 8, 10],
+            'continous_nan': [0, 7, 2, np.nan, 10],
+            'extra': [None, pd.NA, 0, 10, 100],
+        })
+        synthetic_data = pd.DataFrame({
+            'known': ['A', 'A', 'B', 'B', None],
+            'continous': [-1, 0, 3, 5, 11],
+            'continous_nan': [0, 1, 2, np.nan, 100],
+            'extra': [None, pd.NA, 0, 10, 100],
+        })
+        known_column_names = ['known']
+        sensitive_column_names = ['continous', 'continous_nan']
+        continuous_column_names = ['continous', 'continous_nan']
+        num_discrete_bins = 5
+
+        # Run
+        processed_real, processed_synthetic = DisclosureProtection._discretize_and_fillna(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            continuous_column_names,
+            num_discrete_bins,
+        )
+
+        # Assert
+        expected_real = pd.DataFrame({
+            'known': ['A', 'A', '__NULL_VALUE__', 'B', 'B'],
+            'continous': ['0', '0', '1', '3', '4'],
+            'continous_nan': ['0', '3', '0', '__NULL_VALUE__', '4'],
+            'extra': real_data['extra'],
+        })
+        expected_synthetic = pd.DataFrame({
+            'known': ['A', 'A', 'B', 'B', '__NULL_VALUE__'],
+            'continous': ['0', '0', '1', '2', '4'],
+            'continous_nan': ['0', '0', '0', '__NULL_VALUE__', '4'],
+            'extra': synthetic_data['extra'],
+        })
+        pd.testing.assert_frame_equal(expected_real, processed_real)
+        pd.testing.assert_frame_equal(expected_synthetic, processed_synthetic)
+
     def test__compute_baseline(self):
         """Test computing the baseline score for random data."""
         # Setup
         real_data = pd.DataFrame({
             'col1': ['A', 'A', 'A', 'A', 'A'],
             'col2': ['A', 'B', 'A', 'B', 'A'],
             'col3': range(5),
         })
         sensitive_column_names = ['col1', 'col2']

         # Run
         baseline_score = DisclosureProtection._compute_baseline(real_data, sensitive_column_names)

         # Assert
         assert baseline_score == 0.5
@@ -287,3 +321,196 @@ def test_compute(self, compute_breakdown_mock):

         # Assert
         assert score == 0.8
+
+
+class TestDisclosureProtectionEstimate:
+    def test__validate_inputs(self):
+        """Test input validation."""
+        # Setup
+        default_kwargs = {
+            'real_data': pd.DataFrame({'col1': range(5), 'col2': range(5)}),
+            'synthetic_data': pd.DataFrame({'col1': range(10), 'col2': range(10)}),
+            'known_column_names': ['col1'],
+            'sensitive_column_names': ['col2'],
+            'computation_method': 'cap',
+            'continuous_column_names': ['col2'],
+            'num_discrete_bins': 10,
+            'num_rows_subsample': 1000,
+            'num_iterations': 10,
+        }
+        bad_rows_subsample = 0
+        bad_num_iterations = 0
+
+        # Run and Assert
+        DisclosureProtectionEstimate._validate_inputs(**default_kwargs)
+
+        bad_rows_subsample_error = re.escape(
+            '`num_rows_subsample` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=bad_rows_subsample_error):
+            DisclosureProtectionEstimate._validate_inputs(**{
+                **default_kwargs,
+                'num_rows_subsample': bad_rows_subsample,
+            })
+
+        bad_num_iterations_error = re.escape(
+            '`num_iterations` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=bad_num_iterations_error):
+            DisclosureProtectionEstimate._validate_inputs(**{
+                **default_kwargs,
+                'num_iterations': bad_num_iterations,
+            })
+
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.tqdm')
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    def test__compute_estimated_cap_metric(self, CAPMethodsMock, mock_tqdm):
+        """Test the ``_compute_estimated_cap_metric`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
+            'col2': np.random.choice(['X', 'Y'], size=5),
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
+            'col2': np.random.choice(['X', 'Y'], size=100),
+        })
+        CAPMock = Mock()
+        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+        progress_bar = MagicMock()
+        progress_bar.__iter__.return_value = range(5)
+        mock_tqdm.tqdm.return_value = progress_bar
+
+        # Run
+        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=0.5,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2'],
+            computation_method='CAP',
+            num_rows_subsample=10,
+            num_iterations=5,
+            verbose=True,
+        )
+
+        # Assert
+        assert avg_score == 0.76
+        assert avg_computed_score == 0.38
+        progress_bar.set_description.assert_has_calls([
+            call('Estimating Disclosure Protection (Score=0.000)'),
+            call('Estimating Disclosure Protection (Score=0.800)'),
+            call('Estimating Disclosure Protection (Score=0.900)'),
+            call('Estimating Disclosure Protection (Score=0.733)'),
+            call('Estimating Disclosure Protection (Score=0.850)'),
+            call('Estimating Disclosure Protection (Score=0.760)'),
+        ])
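The expected values in this test follow directly from the mocked CAP results: the estimated protection is the running mean of CAPMock.compute's side effects, and the reported score appears to be that mean divided by baseline_protection (an inference from the asserted numbers, not a quote from the implementation; the initial Score=0.000 call reflects the progress bar's description before any iteration). A quick check under that assumption:

# Assumed relationship, inferred from the asserted values: score = running mean / baseline.
cap_results = [0.4, 0.5, 0.2, 0.6, 0.2]
baseline = 0.5
running_means = [sum(cap_results[:i + 1]) / (i + 1) for i in range(len(cap_results))]
scores = [mean / baseline for mean in running_means]
assert round(running_means[-1], 2) == 0.38
assert [round(score, 3) for score in scores] == [0.8, 0.9, 0.733, 0.85, 0.76]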

+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    def test__compute_estimated_cap_metric_zero_baseline(self, CAPMethodsMock):
+        """Test the ``_compute_estimated_cap_metric`` method with a zero baseline."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
+            'col2': ['A'] * 5,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
+            'col2': ['A'] * 100,
+        })
+        CAPMock = Mock()
+        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+
+        # Run
+        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=0,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2'],
+            computation_method='CAP',
+            num_rows_subsample=10,
+            num_iterations=5,
+            verbose=False,
+        )
+
+        # Assert
+        assert avg_score == 1
+        assert avg_computed_score == 0.38

+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate._compute_estimated_cap_metric'
+    )
+    def test_compute_breakdown(self, mock__compute_estimated_cap_metric):
+        """Test computing the breakdown."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['X', 'Y', 'Z', 'Y', 'X', 'X', 'Y', 'Z', 'X', 'A'],
+            'col3': ['A', 'B'] * 5,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': np.random.choice(['X', 'Y', 'Z', 'X', 'X'], size=10),
+            'col3': ['A'] * 10,
+        })
+        mock__compute_estimated_cap_metric.return_value = (0.8, 0.6)
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2', 'col3'],
+            num_discrete_bins=2,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 0.8,
+            'baseline_protection': 0.875,
+            'cap_protection': 0.6,
+        }
+        mock__compute_estimated_cap_metric.assert_called_once_with(
+            DataFrameMatcher(real_data),
+            DataFrameMatcher(synthetic_data),
+            baseline_protection=0.875,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2', 'col3'],
+            computation_method='CAP',
+            num_rows_subsample=1000,
+            num_iterations=10,
+            verbose=True,
+        )
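DataFrameMatcher (imported from tests.utils) is presumably an equality wrapper that lets assert_called_once_with compare DataFrame arguments by content, since == on two DataFrames returns an element-wise frame rather than a single boolean. A minimal sketch of such a matcher, as an assumption rather than the project's actual helper:

import pandas as pd

class DataFrameMatcherSketch:
    """Wraps a DataFrame so mock call assertions can compare frames by content."""

    def __init__(self, df):
        self.df = df

    def __eq__(self, other):
        try:
            pd.testing.assert_frame_equal(self.df, other)
            return True
        except AssertionError:
            return False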

+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate.compute_breakdown'
+    )
+    def test_compute(self, compute_breakdown_mock):
+        """Test the ``compute`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['A'] * 10,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['A'] * 10,
+        })
+        compute_breakdown_mock.return_value = {
+            'score': 0.8,
+            'baseline_protection': 0.6,
+            'cap_protection': 0.64,
+        }
+
+        # Run
+        score = DisclosureProtectionEstimate.compute(
+            real_data, synthetic_data, known_column_names=['col1'], sensitive_column_names=['col2']
+        )
+
+        # Assert
+        assert score == 0.8
