From 0bf95d9d5b36961502e462973c46f4fb6d0b9e80 Mon Sep 17 00:00:00 2001
From: Frances Hartwell
Date: Tue, 10 Dec 2024 10:33:15 -0500
Subject: [PATCH] Add `DisclosureProtectionEstimate` metric (#686)

---
 sdmetrics/single_table/__init__.py           |   6 +-
 sdmetrics/single_table/privacy/__init__.py   |   6 +-
 sdmetrics/single_table/privacy/cap.py        | 224 +++++++++++++
 .../privacy/disclosure_protection.py         | 313 +++++++++++++++++-
 .../privacy/test_disclosure_protection.py    |  57 +++-
 tests/unit/single_table/privacy/test_cap.py  |  27 ++
 .../privacy/test_disclosure_protection.py    | 300 ++++++++++++++++-
 7 files changed, 901 insertions(+), 32 deletions(-)
 create mode 100644 tests/unit/single_table/privacy/test_cap.py

diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py
index d35f51e6..6bc54960 100644
--- a/sdmetrics/single_table/__init__.py
+++ b/sdmetrics/single_table/__init__.py
@@ -67,7 +67,10 @@
     CategoricalRF,
     CategoricalSVM,
 )
-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
 from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble
 from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR
 from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
@@ -111,6 +114,7 @@
     'CategoricalZeroCAP',
     'CategoricalGeneralizedCAP',
     'DisclosureProtection',
+    'DisclosureProtectionEstimate',
     'NumericalMLP',
     'NumericalLR',
     'NumericalSVR',
diff --git a/sdmetrics/single_table/privacy/__init__.py b/sdmetrics/single_table/privacy/__init__.py
index 667e178e..ac06b53c 100644
--- a/sdmetrics/single_table/privacy/__init__.py
+++ b/sdmetrics/single_table/privacy/__init__.py
@@ -12,7 +12,10 @@
     CategoricalRF,
     CategoricalSVM,
 )
-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
 from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble
 from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR
 from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
@@ -28,6 +31,7 @@
     'CategoricalSVM',
     'CategoricalZeroCAP',
     'DisclosureProtection',
+    'DisclosureProtectionEstimate',
     'NumericalLR',
     'NumericalMLP',
     'NumericalPrivacyMetric',
diff --git a/sdmetrics/single_table/privacy/cap.py b/sdmetrics/single_table/privacy/cap.py
index 2c26794f..6d224b28 100644
--- a/sdmetrics/single_table/privacy/cap.py
+++ b/sdmetrics/single_table/privacy/cap.py
@@ -1,8 +1,16 @@
 """CAP modules and their attackers."""
 
+import warnings
+
 from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, PrivacyAttackerModel
 from sdmetrics.single_table.privacy.util import closest_neighbors, count_frequency, majority
 
+DEPRECATION_MSG = (
+    'Computing CAP metrics directly is deprecated. For improved privacy metrics, '
+    "please use the 'DisclosureProtection' and 'DisclosureProtectionEstimate' "
+    'metrics instead.'
+)
+
 
 class CAPAttacker(PrivacyAttackerModel):
     """The CAP (Correct Attribution Probability) privacy attacker.
@@ -78,6 +86,78 @@ class CategoricalCAP(CategoricalPrivacyMetric):
     MODEL = CAPAttacker
     ACCURACY_BASE = False
 
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the key column(s) for the
+        attack.
+
+        A ``sensitive_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the sensitive_fields column(s)
+        for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+        """
+        warnings.warn(DEPRECATION_MSG, DeprecationWarning)
+        return cls._compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
 
 class ZeroCAPAttacker(CAPAttacker):
     """The 0CAP privacy attacker, which operates in the same way as CAP does.
@@ -113,6 +193,78 @@ class CategoricalZeroCAP(CategoricalPrivacyMetric):
     MODEL = ZeroCAPAttacker
     ACCURACY_BASE = False
 
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the key column(s) for the
+        attack.
+
+        A ``sensitive_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the sensitive_fields column(s)
+        for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+        """
+        warnings.warn(DEPRECATION_MSG, DeprecationWarning)
+        return cls._compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
 
 class GeneralizedCAPAttacker(CAPAttacker):
     """The GeneralizedCAP privacy attacker.
@@ -169,3 +321,75 @@ class CategoricalGeneralizedCAP(CategoricalPrivacyMetric):
     name = 'Categorical GeneralizedCAP'
     MODEL = GeneralizedCAPAttacker
     ACCURACY_BASE = False
+
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the key column(s) for the
+        attack.
+
+        A ``sensitive_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the sensitive_fields column(s)
+        for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+ """ + warnings.warn(DEPRECATION_MSG, DeprecationWarning) + return cls._compute( + real_data=real_data, + synthetic_data=synthetic_data, + metadata=metadata, + key_fields=key_fields, + sensitive_fields=sensitive_fields, + model_kwargs=model_kwargs, + ) diff --git a/sdmetrics/single_table/privacy/disclosure_protection.py b/sdmetrics/single_table/privacy/disclosure_protection.py index 6149a6b5..ea81c920 100644 --- a/sdmetrics/single_table/privacy/disclosure_protection.py +++ b/sdmetrics/single_table/privacy/disclosure_protection.py @@ -1,7 +1,10 @@ """Disclosure protection metrics.""" +import warnings + import numpy as np import pandas as pd +import tqdm from sdmetrics.goal import Goal from sdmetrics.single_table.base import SingleTableMetric @@ -11,6 +14,8 @@ CategoricalZeroCAP, ) +MAX_NUM_ROWS = 50000 + CAP_METHODS = { 'CAP': CategoricalCAP, 'ZERO_CAP': CategoricalZeroCAP, @@ -95,6 +100,56 @@ def _discretize_column(cls, real_column, synthetic_column, num_bins): return real_binned.to_numpy(), synthetic_binned.to_numpy() + @classmethod + def _discretize_and_fillna( + cls, + real_data, + synthetic_data, + known_column_names, + sensitive_column_names, + continuous_column_names, + num_discrete_bins, + ): + """Helper to discretize continous columns and convert null values to categories. + + Args: + real_data (pd.DataFrame): + A pd.DataFrame with the real data. + synthetic_data (pd.DataFrame): + A pd.DataFrame with the synthetic data. + known_column_names (list[str]): + A list with the string names of the columns that an attacker may already know. + sensitive_column_names (list[str]): + A list with the string names of the columns that an attacker wants to guess + (but does not already know). + continuous_column_names (list[str]): + A list of column names that represent continuous values (as opposed to discrete + values). These columns will be discretized. Defaults to None. + num_discrete_bins (int): + Number of bins to discretize continous columns in to. Defaults to 10. + + Returns: + tuple(pd.DataFrame, pd.DataFrame): + The pre-processed real and synthetic data. + """ + real_data = real_data.copy() + synthetic_data = synthetic_data.copy() + + # Discretize continous columns + if continuous_column_names is not None: + for col_name in continuous_column_names: + real_data[col_name], synthetic_data[col_name] = cls._discretize_column( + real_data[col_name], synthetic_data[col_name], num_discrete_bins + ) + + # Convert null values to own category + null_category_map = cls._get_null_categories( + real_data, synthetic_data, known_column_names + sensitive_column_names + ) + real_data = real_data.fillna(null_category_map) + synthetic_data = synthetic_data.fillna(null_category_map) + return real_data, synthetic_data + @classmethod def _compute_baseline(cls, real_data, sensitive_column_names): unique_categories_prod = np.prod([ @@ -153,30 +208,29 @@ def compute_breakdown( continuous_column_names, num_discrete_bins, ) - computation_method = computation_method.upper() - real_data = real_data.copy() - synthetic_data = synthetic_data.copy() - # Discretize continous columns - if continuous_column_names is not None: - for col_name in continuous_column_names: - real_data[col_name], synthetic_data[col_name] = cls._discretize_column( - real_data[col_name], synthetic_data[col_name], num_discrete_bins - ) + computation_method = computation_method.upper() + if len(real_data) > MAX_NUM_ROWS or len(synthetic_data) > MAX_NUM_ROWS: + warnings.warn( + f'Data exceeds {MAX_NUM_ROWS} rows, perfomance may be slow.' 
+                'Consider using the `DisclosureProtectionEstimate` for faster computation.'
+            )
 
-        # Convert null values to own category
-        null_category_map = cls._get_null_categories(
-            real_data, synthetic_data, known_column_names + sensitive_column_names
+        real_data, synthetic_data = cls._discretize_and_fillna(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            continuous_column_names,
+            num_discrete_bins,
         )
-        real_data = real_data.fillna(null_category_map)
-        synthetic_data = synthetic_data.fillna(null_category_map)
 
         # Compute baseline
         baseline_protection = cls._compute_baseline(real_data, sensitive_column_names)
 
         # Compute CAP metric
         cap_metric = CAP_METHODS.get(computation_method)
-        cap_protection = cap_metric.compute(
+        cap_protection = cap_metric._compute(
             real_data,
             synthetic_data,
             key_fields=known_column_names,
@@ -243,3 +297,232 @@ def compute(
             num_discrete_bins,
         )
         return score_breakdown['score']
+
+
+class DisclosureProtectionEstimate(DisclosureProtection):
+    """DisclosureProtectionEstimate metric."""
+
+    @classmethod
+    def _validate_inputs(
+        cls,
+        real_data,
+        synthetic_data,
+        known_column_names,
+        sensitive_column_names,
+        computation_method,
+        continuous_column_names,
+        num_discrete_bins,
+        num_rows_subsample,
+        num_iterations,
+    ):
+        super()._validate_inputs(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            computation_method,
+            continuous_column_names,
+            num_discrete_bins,
+        )
+        if not isinstance(num_rows_subsample, int) or num_rows_subsample <= 0:
+            raise ValueError('`num_rows_subsample` must be an integer greater than zero.')
+
+        if not isinstance(num_iterations, int) or num_iterations <= 0:
+            raise ValueError('`num_iterations` must be an integer greater than zero.')
+
+    @classmethod
+    def _compute_estimated_cap_metric(
+        cls,
+        real_data,
+        synthetic_data,
+        baseline_protection,
+        known_column_names,
+        sensitive_column_names,
+        computation_method,
+        num_rows_subsample,
+        num_iterations,
+        verbose,
+    ):
+        estimation_iterator = tqdm.tqdm(range(num_iterations), disable=(not verbose))
+        if verbose:
+            description = 'Estimating Disclosure Protection (Score={score:.3f})'
+            estimation_iterator.set_description(description.format(score=0))
+
+        cap_metric = CAP_METHODS.get(computation_method)
+        estimated_score_sum = 0
+        for i in estimation_iterator:
+            real_data_samp = real_data.sample(min(num_rows_subsample, len(real_data)))
+            synth_data_samp = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
+
+            estimated_cap_protection = cap_metric._compute(
+                real_data_samp,
+                synth_data_samp,
+                key_fields=known_column_names,
+                sensitive_fields=sensitive_column_names,
+            )
+            estimated_score_sum += estimated_cap_protection
+            average_computed_score = estimated_score_sum / (i + 1.0)
+            if baseline_protection == 0:
+                average_score = 0 if average_computed_score == 0 else 1
+            else:
+                average_score = min(average_computed_score / baseline_protection, 1)
+
+            if verbose:
+                estimation_iterator.set_description(description.format(score=average_score))
+
+        return average_score, average_computed_score
+
+    @classmethod
+    def compute_breakdown(
+        cls,
+        real_data,
+        synthetic_data,
+        known_column_names,
+        sensitive_column_names,
+        computation_method='cap',
+        continuous_column_names=None,
+        num_discrete_bins=10,
+        num_rows_subsample=1000,
+        num_iterations=10,
+        verbose=True,
+    ):
+        """Compute this metric breakdown.
+
+        Args:
+            real_data (pd.DataFrame):
+                A pd.DataFrame with the real data.
+            synthetic_data (pd.DataFrame):
+                A pd.DataFrame with the synthetic data.
+            known_column_names (list[str]):
+                A list with the string names of the columns that an attacker may already know.
+            sensitive_column_names (list[str]):
+                A list with the string names of the columns that an attacker wants to guess
+                (but does not already know).
+            computation_method (str, optional):
+                The type of computation we'll use to simulate the attack. Options are:
+                    - 'cap': Use the CAP method described in the original paper.
+                    - 'generalized_cap': Use the generalized CAP method.
+                    - 'zero_cap': Use the zero cap method.
+                Defaults to 'cap'.
+            continuous_column_names (list[str], optional):
+                A list of column names that represent continuous values (as opposed to discrete
+                values). These columns will be discretized. Defaults to None.
+            num_discrete_bins (int, optional):
+                Number of bins to discretize continuous columns into. Defaults to 10.
+            num_rows_subsample (int, optional):
+                The number of rows to subsample in each of the real and synthetic datasets per
+                iteration. Defaults to 1000 rows.
+            num_iterations (int, optional):
+                The number of iterations to run, each with a different subsample. Defaults to 10.
+            verbose (bool, optional):
+                Whether to show the progress bar. Defaults to True.
+
+        Returns:
+            dict:
+                Mapping of the metric output with the keys:
+                    - 'score': The overall score for the metric.
+                    - 'cap_protection': The protection score from the selected computation method.
+                    - 'baseline_protection': The baseline protection for the columns.
+        """
+        cls._validate_inputs(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            computation_method,
+            continuous_column_names,
+            num_discrete_bins,
+            num_rows_subsample,
+            num_iterations,
+        )
+        computation_method = computation_method.upper()
+        real_data, synthetic_data = cls._discretize_and_fillna(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            continuous_column_names,
+            num_discrete_bins,
+        )
+
+        # Compute baseline
+        baseline_protection = cls._compute_baseline(real_data, sensitive_column_names)
+
+        # Compute estimated CAP metric
+        average_score, average_computed_score = cls._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=baseline_protection,
+            known_column_names=known_column_names,
+            sensitive_column_names=sensitive_column_names,
+            computation_method=computation_method,
+            num_rows_subsample=num_rows_subsample,
+            num_iterations=num_iterations,
+            verbose=verbose,
+        )
+
+        return {
+            'score': average_score,
+            'cap_protection': average_computed_score,
+            'baseline_protection': baseline_protection,
+        }
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        known_column_names,
+        sensitive_column_names,
+        computation_method='cap',
+        continuous_column_names=None,
+        num_discrete_bins=10,
+        num_rows_subsample=1000,
+        num_iterations=10,
+        verbose=True,
+    ):
+        """Compute the DisclosureProtectionEstimate metric.
+
+        Args:
+            real_data (pd.DataFrame):
+                A pd.DataFrame with the real data.
+            synthetic_data (pd.DataFrame):
+                A pd.DataFrame with the synthetic data.
+            known_column_names (list[str]):
+                A list with the string names of the columns that an attacker may already know.
+            sensitive_column_names (list[str]):
+                A list with the string names of the columns that an attacker wants to guess
+                (but does not already know).
+            computation_method (str, optional):
+                The type of computation we'll use to simulate the attack. Options are:
+                    - 'cap': Use the CAP method described in the original paper.
+                    - 'generalized_cap': Use the generalized CAP method.
+                    - 'zero_cap': Use the zero cap method.
+                Defaults to 'cap'.
+            continuous_column_names (list[str], optional):
+                A list of column names that represent continuous values (as opposed to discrete
+                values). These columns will be discretized. Defaults to None.
+            num_discrete_bins (int, optional):
+                Number of bins to discretize continuous columns into. Defaults to 10.
+            num_rows_subsample (int, optional):
+                The number of rows to subsample in each of the real and synthetic datasets per
+                iteration. Defaults to 1000 rows.
+            num_iterations (int, optional):
+                The number of iterations to run, each with a different subsample. Defaults to 10.
+            verbose (bool, optional):
+                Whether to show the progress bar. Defaults to True.
+
+        Returns:
+            float:
+                The score for the DisclosureProtectionEstimate metric.
+        """
+        score_breakdown = cls.compute_breakdown(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            computation_method,
+            continuous_column_names,
+            num_discrete_bins,
+            num_rows_subsample,
+            num_iterations,
+            verbose,
+        )
+        return score_breakdown['score']
diff --git a/tests/integration/single_table/privacy/test_disclosure_protection.py b/tests/integration/single_table/privacy/test_disclosure_protection.py
index da1af6c3..2ec14f22 100644
--- a/tests/integration/single_table/privacy/test_disclosure_protection.py
+++ b/tests/integration/single_table/privacy/test_disclosure_protection.py
@@ -2,7 +2,10 @@
 import pandas as pd
 import pytest
 
-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
 
 
 @pytest.fixture
@@ -23,7 +26,7 @@ def perfect_synthetic_data():
         'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
         'key2': range(20),
         'sensitive1': random_state.choice(['f', 'g', 'h', 'i', 'j'], 20),
-        'sensitive2': random_state.randint(5, 10, size=20),
+        'sensitive2': random_state.randint(100, 200, size=20),
     })
 
 
@@ -142,3 +145,53 @@ def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
             'cap_protection': 1.0,
             'baseline_protection': 0.98,
         }
+
+
+class TestDisclosureProtectionEstimate:
+    def test_end_to_end_perfect(self, real_data, perfect_synthetic_data):
+        """Test DisclosureProtectionEstimate metric end to end with perfect synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+            num_rows_subsample=20,
+        )
+
+        # Assert
+        assert score_breakdown == {'score': 1, 'cap_protection': 1, 'baseline_protection': 0.98}
+
+    @pytest.mark.parametrize('cap_method', ['cap', 'zero_cap', 'generalized_cap'])
+    def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
+        """Test DisclosureProtectionEstimate metric with all possible CAP methods."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            computation_method=cap_method,
+            num_discrete_bins=10,
+            num_rows_subsample=20,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1.0,
+            'cap_protection': 1.0,
+            'baseline_protection': 0.98,
+        }
diff --git a/tests/unit/single_table/privacy/test_cap.py b/tests/unit/single_table/privacy/test_cap.py
new file mode 100644
index 00000000..d83f6260
--- /dev/null
+++ b/tests/unit/single_table/privacy/test_cap.py
@@ -0,0 +1,27 @@
+import re
+
+import pandas as pd
+import pytest
+
+from sdmetrics.single_table.privacy.cap import (
+    CategoricalCAP,
+    CategoricalGeneralizedCAP,
+    CategoricalZeroCAP,
+)
+
+
+@pytest.mark.parametrize('metric', [CategoricalCAP, CategoricalZeroCAP, CategoricalGeneralizedCAP])
+def test_CAP_deprecation_message(metric):
+    """Test deprecation warning is raised when running the metric directly."""
+    # Setup
+    real_data = pd.DataFrame({'col1': range(5), 'col2': ['A', 'B', 'C', 'A', 'B']})
+    synthetic_data = pd.DataFrame({'col1': range(5), 'col2': ['C', 'A', 'A', 'B', 'C']})
+
+    # Run and Assert
+    expected_warning = re.escape(
+        'Computing CAP metrics directly is deprecated. For improved privacy metrics, '
+        "please use the 'DisclosureProtection' and 'DisclosureProtectionEstimate' "
+        'metrics instead.'
+    )
+    with pytest.warns(DeprecationWarning, match=expected_warning):
+        metric.compute(real_data, synthetic_data, key_fields=['col1'], sensitive_fields=['col2'])
diff --git a/tests/unit/single_table/privacy/test_disclosure_protection.py b/tests/unit/single_table/privacy/test_disclosure_protection.py
index b35f6147..b65cb13b 100644
--- a/tests/unit/single_table/privacy/test_disclosure_protection.py
+++ b/tests/unit/single_table/privacy/test_disclosure_protection.py
@@ -1,13 +1,17 @@
 """Test for the disclosure metrics."""
 
 import re
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, call, patch
 
 import numpy as np
 import pandas as pd
 import pytest
 
-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
+from tests.utils import DataFrameMatcher
 
 
 class TestDisclosureProtection:
@@ -151,21 +155,51 @@ def test__discretize_column_float_dtypes(self, dtype):
         expected_synthetic = np.array(['0', '0', '1', np.nan, '3', np.nan, '4'], dtype='object')
         assert list(binned_synthetic) == list(expected_synthetic)
 
-    def test__compute_baseline(self):
-        """Test computing the baseline score for random data."""
+    def test__discretize_and_fillna(self):
+        """Test helper method to discretize continuous columns and fill nan values."""
         # Setup
         real_data = pd.DataFrame({
-            'col1': ['A', 'A', 'A', 'A', 'A'],
-            'col2': ['A', 'B', 'A', 'B', 'A'],
-            'col3': range(5),
+            'known': ['A', 'A', pd.NA, 'B', 'B'],
+            'continuous': [0, 1, 3, 8, 10],
+            'continuous_nan': [0, 7, 2, np.nan, 10],
+            'extra': [None, pd.NA, 0, 10, 100],
         })
-        sensitive_column_names = ['col1', 'col2']
+        synthetic_data = pd.DataFrame({
+            'known': ['A', 'A', 'B', 'B', None],
+            'continuous': [-1, 0, 3, 5, 11],
+            'continuous_nan': [0, 1, 2, np.nan, 100],
+            'extra': [None, pd.NA, 0, 10, 100],
+        })
+        known_column_names = ['known']
+        sensitive_column_names = ['continuous', 'continuous_nan']
+        continuous_column_names = ['continuous', 'continuous_nan']
+        num_discrete_bins = 5
 
         # Run
-        baseline_score = DisclosureProtection._compute_baseline(real_data, sensitive_column_names)
+        processed_real, processed_synthetic = DisclosureProtection._discretize_and_fillna(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            continuous_column_names,
+            num_discrete_bins,
+        )
 
         # Assert
-        assert baseline_score == 0.5
+        expected_real = pd.DataFrame({
+            'known': ['A', 'A', '__NULL_VALUE__', 'B', 'B'],
+            'continuous': ['0', '0', '1', '3', '4'],
+            'continuous_nan': ['0', '3', '0', '__NULL_VALUE__', '4'],
+            'extra': real_data['extra'],
+        })
+        expected_synthetic = pd.DataFrame({
+            'known': ['A', 'A', 'B', 'B', '__NULL_VALUE__'],
+            'continuous': ['0', '0', '1', '2', '4'],
+            'continuous_nan': ['0', '0', '0', '__NULL_VALUE__', '4'],
+            'extra': synthetic_data['extra'],
+        })
+        pd.testing.assert_frame_equal(expected_real, processed_real)
+        pd.testing.assert_frame_equal(expected_synthetic, processed_synthetic)
 
     def test__compute_baseline(self):
         """Test computing the baseline score for random data."""
@@ -198,7 +232,7 @@ def test_compute_breakdown(self, CAPMethodsMock):
             'col3': range(-2, 8),
         })
         CAPMock = Mock()
-        CAPMock.compute.return_value = 0.9
+        CAPMock._compute.return_value = 0.9
         CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
         CAPMethodsMock.get.return_value = CAPMock
 
@@ -232,7 +266,7 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
             'col2': ['A'] * 10,
         })
         CAPMock = Mock()
-        CAPMock.compute.return_value = 0.5
+        CAPMock._compute.return_value = 0.5
         CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
         CAPMethodsMock.get.return_value = CAPMock
 
@@ -244,7 +278,7 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
             sensitive_column_names=['col2'],
         )
 
-        CAPMock.compute.return_value = 0
+        CAPMock._compute.return_value = 0
         score_breakdown_no_cap = DisclosureProtection.compute_breakdown(
             real_data=real_data,
             synthetic_data=synthetic_data,
@@ -260,6 +294,53 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
         }
         assert score_breakdown_no_cap == {'score': 0, 'baseline_protection': 0, 'cap_protection': 0}
 
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection._compute_baseline'
+    )
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection._discretize_and_fillna'
+    )
+    def test_compute_breakdown_warns_too_large(
+        self, mock_discretize_and_fillna, mock_compute_baseline, CAPMethodsMock
+    ):
+        """Test that ``compute_breakdown`` warns if the data is too large."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=50001),
+            'col2': range(50001),
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=50001),
+            'col2': range(50001),
+        })
+        CAPMock = Mock()
+        CAPMock._compute.return_value = 0.5
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+        mock_compute_baseline.return_value = 0.5
+        mock_discretize_and_fillna.return_value = (real_data, synthetic_data)
+
+        # Run
+        expected_warning = re.escape(
+            'Data exceeds 50000 rows, performance may be slow. '
+            'Consider using the `DisclosureProtectionEstimate` for faster computation.'
+        )
+        with pytest.warns(UserWarning, match=expected_warning):
+            score_breakdown = DisclosureProtection.compute_breakdown(
+                real_data=real_data,
+                synthetic_data=synthetic_data,
+                known_column_names=['col1'],
+                sensitive_column_names=['col2'],
+            )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1,
+            'baseline_protection': 0.5,
+            'cap_protection': 0.5,
+        }
+
     @patch(
         'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection.compute_breakdown'
     )
@@ -287,3 +368,196 @@ def test_compute(self, compute_breakdown_mock):
 
         # Assert
         assert score == 0.8
+
+
+class TestDisclosureProtectionEstimate:
+    def test__validate_inputs(self):
+        """Test input validation."""
+        # Setup
+        default_kwargs = {
+            'real_data': pd.DataFrame({'col1': range(5), 'col2': range(5)}),
+            'synthetic_data': pd.DataFrame({'col1': range(10), 'col2': range(10)}),
+            'known_column_names': ['col1'],
+            'sensitive_column_names': ['col2'],
+            'computation_method': 'cap',
+            'continuous_column_names': ['col2'],
+            'num_discrete_bins': 10,
+            'num_rows_subsample': 1000,
+            'num_iterations': 10,
+        }
+        bad_rows_subsample = 0
+        bad_num_iterations = 0
+
+        # Run and Assert
+        DisclosureProtectionEstimate._validate_inputs(**default_kwargs)
+
+        bad_rows_subsample_error = re.escape(
+            '`num_rows_subsample` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=bad_rows_subsample_error):
+            DisclosureProtectionEstimate._validate_inputs(**{
+                **default_kwargs,
+                'num_rows_subsample': bad_rows_subsample,
+            })
+
+        bad_num_iterations_error = re.escape(
+            '`num_iterations` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=bad_num_iterations_error):
+            DisclosureProtectionEstimate._validate_inputs(**{
+                **default_kwargs,
+                'num_iterations': bad_num_iterations,
+            })
+
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.tqdm')
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    def test__compute_estimated_cap_metric(self, CAPMethodsMock, mock_tqdm):
+        """Test the ``_compute_estimated_cap_metric`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
+            'col2': np.random.choice(['X', 'Y'], size=5),
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
+            'col2': np.random.choice(['X', 'Y'], size=100),
+        })
+        CAPMock = Mock()
+        CAPMock._compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+        progress_bar = MagicMock()
+        progress_bar.__iter__.return_value = range(5)
+        mock_tqdm.tqdm.return_value = progress_bar
+
+        # Run
+        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=0.5,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2'],
+            computation_method='CAP',
+            num_rows_subsample=10,
+            num_iterations=5,
+            verbose=True,
+        )
+
+        # Assert
+        assert avg_score == 0.76
+        assert avg_computed_score == 0.38
+        progress_bar.set_description.assert_has_calls([
+            call('Estimating Disclosure Protection (Score=0.000)'),
+            call('Estimating Disclosure Protection (Score=0.800)'),
+            call('Estimating Disclosure Protection (Score=0.900)'),
+            call('Estimating Disclosure Protection (Score=0.733)'),
+            call('Estimating Disclosure Protection (Score=0.850)'),
+            call('Estimating Disclosure Protection (Score=0.760)'),
+        ])
+
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    def test__compute_estimated_cap_metric_zero_baseline(self, CAPMethodsMock):
+        """Test the ``_compute_estimated_cap_metric`` method with a zero baseline."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
+            'col2': ['A'] * 5,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
+            'col2': ['A'] * 100,
+        })
+        CAPMock = Mock()
+        CAPMock._compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+
+        # Run
+        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=0,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2'],
+            computation_method='CAP',
+            num_rows_subsample=10,
+            num_iterations=5,
+            verbose=False,
+        )
+
+        # Assert
+        assert avg_score == 1
+        assert avg_computed_score == 0.38
+
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate._compute_estimated_cap_metric'
+    )
+    def test_compute_breakdown(self, mock__compute_estimated_cap_metric):
+        """Test computing the breakdown."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['X', 'Y', 'Z', 'Y', 'X', 'X', 'Y', 'Z', 'X', 'A'],
+            'col3': ['A', 'B'] * 5,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': np.random.choice(['X', 'Y', 'Z', 'X', 'X'], size=10),
+            'col3': ['A'] * 10,
+        })
+        mock__compute_estimated_cap_metric.return_value = (0.8, 0.6)
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2', 'col3'],
+            num_discrete_bins=2,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 0.8,
+            'baseline_protection': 0.875,
+            'cap_protection': 0.6,
+        }
+        mock__compute_estimated_cap_metric.assert_called_once_with(
+            DataFrameMatcher(real_data),
+            DataFrameMatcher(synthetic_data),
+            baseline_protection=0.875,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2', 'col3'],
+            computation_method='CAP',
+            num_rows_subsample=1000,
+            num_iterations=10,
+            verbose=True,
+        )
+
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate.compute_breakdown'
+    )
+    def test_compute(self, compute_breakdown_mock):
+        """Test the ``compute`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['A'] * 10,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['A'] * 10,
+        })
+        compute_breakdown_mock.return_value = {
+            'score': 0.8,
+            'baseline_protection': 0.6,
+            'cap_protection': 0.64,
+        }
+
+        # Run
+        score = DisclosureProtectionEstimate.compute(
+            real_data, synthetic_data, known_column_names=['col1'], sensitive_column_names=['col2']
+        )
+
+        # Assert
+        assert score == 0.8
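
Usage sketch (not part of the patch): a minimal example of how a caller might invoke the new metric once this change lands. The data and column names below are illustrative, not taken from the changeset.

import pandas as pd

from sdmetrics.single_table import DisclosureProtectionEstimate

# Toy example: 'zip' is a key an attacker may already know; 'age' is sensitive
# and continuous, so it is discretized into bins before the CAP attack is simulated.
real = pd.DataFrame({'zip': ['10001', '10002'] * 50, 'age': list(range(100))})
synthetic = pd.DataFrame({'zip': ['10002', '10001'] * 50, 'age': list(range(100, 200))})

score = DisclosureProtectionEstimate.compute(
    real_data=real,
    synthetic_data=synthetic,
    known_column_names=['zip'],
    sensitive_column_names=['age'],
    continuous_column_names=['age'],
    num_rows_subsample=50,
    num_iterations=5,
)

# The score is the averaged subsampled CAP protection divided by the baseline
# protection, capped at 1; values near 1 indicate protection at or above baseline.
print(score)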
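
Similarly, a short sketch of the deprecation path this patch adds to cap.py: the public compute() still works but now emits the DeprecationWarning defined in DEPRECATION_MSG, while the renamed _compute() preserves the old behavior for internal callers such as DisclosureProtection. The data below is illustrative.

import warnings

import pandas as pd

from sdmetrics.single_table.privacy.cap import CategoricalCAP

real = pd.DataFrame({'key': ['a', 'b', 'a', 'b'], 'sensitive': ['x', 'y', 'x', 'y']})
synthetic = pd.DataFrame({'key': ['a', 'b', 'b', 'a'], 'sensitive': ['y', 'x', 'x', 'y']})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # The public entry point still returns a score, but now warns.
    CategoricalCAP.compute(real, synthetic, key_fields=['key'], sensitive_fields=['sensitive'])

# Exactly the behavior pinned down by the new test_cap.py above.
assert any(issubclass(w.category, DeprecationWarning) for w in caught)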