diff --git a/sdmetrics/single_table/privacy/cap.py b/sdmetrics/single_table/privacy/cap.py
index 2c26794f..6d224b28 100644
--- a/sdmetrics/single_table/privacy/cap.py
+++ b/sdmetrics/single_table/privacy/cap.py
@@ -1,8 +1,16 @@
 """CAP modules and their attackers."""
 
+import warnings
+
 from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, PrivacyAttackerModel
 from sdmetrics.single_table.privacy.util import closest_neighbors, count_frequency, majority
 
+DEPRECATION_MSG = (
+    'Computing CAP metrics directly is deprecated. For improved privacy metrics, '
+    "please use the 'DisclosureProtection' and 'DisclosureProtectionEstimate' "
+    'metrics instead.'
+)
+
 
 class CAPAttacker(PrivacyAttackerModel):
     """The CAP (Correct Attribution Probability) privacy attacker.
@@ -78,6 +86,78 @@ class CategoricalCAP(CategoricalPrivacyMetric):
     MODEL = CAPAttacker
     ACCURACY_BASE = False
 
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column name (or list of names) must be given, either directly or
+        as a first-level entry in the ``metadata`` dict, and will be used as the key
+        column(s) for the attack.
+
+        A ``sensitive_fields`` column name (or list of names) must be given, either directly
+        or as a first-level entry in the ``metadata`` dict, and will be used as the
+        sensitive column(s) for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+        """
+        warnings.warn(DEPRECATION_MSG, DeprecationWarning)
+        return cls._compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
 
 class ZeroCAPAttacker(CAPAttacker):
     """The 0CAP privacy attacker, which operates in the same way as CAP does.
@@ -113,6 +193,78 @@ class CategoricalZeroCAP(CategoricalPrivacyMetric):
     MODEL = ZeroCAPAttacker
     ACCURACY_BASE = False
 
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column name (or list of names) must be given, either directly or
+        as a first-level entry in the ``metadata`` dict, and will be used as the key
+        column(s) for the attack.
+
+        A ``sensitive_fields`` column name (or list of names) must be given, either directly
+        or as a first-level entry in the ``metadata`` dict, and will be used as the
+        sensitive column(s) for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+        """
+        warnings.warn(DEPRECATION_MSG, DeprecationWarning)
+        return cls._compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
 
 class GeneralizedCAPAttacker(CAPAttacker):
     """The GeneralizedCAP privacy attacker.
@@ -169,3 +321,75 @@ class CategoricalGeneralizedCAP(CategoricalPrivacyMetric):
     name = 'Categorical GeneralizedCAP'
     MODEL = GeneralizedCAPAttacker
     ACCURACY_BASE = False
+
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column name (or list of names) must be given, either directly or
+        as a first-level entry in the ``metadata`` dict, and will be used as the key
+        column(s) for the attack.
+
+        A ``sensitive_fields`` column name (or list of names) must be given, either directly
+        or as a first-level entry in the ``metadata`` dict, and will be used as the
+        sensitive column(s) for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+        """
+        warnings.warn(DEPRECATION_MSG, DeprecationWarning)
+        return cls._compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
diff --git a/sdmetrics/single_table/privacy/disclosure_protection.py b/sdmetrics/single_table/privacy/disclosure_protection.py
index 8d53c7c7..ea81c920 100644
--- a/sdmetrics/single_table/privacy/disclosure_protection.py
+++ b/sdmetrics/single_table/privacy/disclosure_protection.py
@@ -1,5 +1,7 @@
 """Disclosure protection metrics."""
 
+import warnings
+
 import numpy as np
 import pandas as pd
 import tqdm
@@ -12,6 +14,8 @@
     CategoricalZeroCAP,
 )
 
+MAX_NUM_ROWS = 50000
+
 CAP_METHODS = {
     'CAP': CategoricalCAP,
     'ZERO_CAP': CategoricalZeroCAP,
@@ -204,7 +208,14 @@ def compute_breakdown(
             continuous_column_names,
             num_discrete_bins,
         )
+        computation_method = computation_method.upper()
 
+        if len(real_data) > MAX_NUM_ROWS or len(synthetic_data) > MAX_NUM_ROWS:
+            warnings.warn(
+                f'Data exceeds {MAX_NUM_ROWS} rows, performance may be slow. '
+                'Consider using the `DisclosureProtectionEstimate` for faster computation.'
+            )
+
         real_data, synthetic_data = cls._discretize_and_fillna(
             real_data,
             synthetic_data,
@@ -219,7 +230,7 @@
 
         # Compute CAP metric
        cap_metric = CAP_METHODS.get(computation_method)
-        cap_protection = cap_metric.compute(
+        cap_protection = cap_metric._compute(
             real_data,
             synthetic_data,
             key_fields=known_column_names,
@@ -343,7 +354,7 @@ def _compute_estimated_cap_metric(
             real_data_samp = real_data.sample(min(num_rows_subsample, len(real_data)))
             synth_data_samp = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
 
-            estimated_cap_protection = cap_metric.compute(
+            estimated_cap_protection = cap_metric._compute(
                 real_data_samp,
                 synth_data_samp,
                 key_fields=known_column_names,
diff --git a/tests/unit/single_table/privacy/test_cap.py b/tests/unit/single_table/privacy/test_cap.py
new file mode 100644
index 00000000..d83f6260
--- /dev/null
+++ b/tests/unit/single_table/privacy/test_cap.py
@@ -0,0 +1,27 @@
+import re
+
+import pandas as pd
+import pytest
+
+from sdmetrics.single_table.privacy.cap import (
+    CategoricalCAP,
+    CategoricalGeneralizedCAP,
+    CategoricalZeroCAP,
+)
+
+
+@pytest.mark.parametrize('metric', [CategoricalCAP, CategoricalZeroCAP, CategoricalGeneralizedCAP])
+def test_CAP_deprecation_message(metric):
+    """Test that a deprecation warning is raised when running the metric directly."""
+    # Setup
+    real_data = pd.DataFrame({'col1': range(5), 'col2': ['A', 'B', 'C', 'A', 'B']})
+    synthetic_data = pd.DataFrame({'col1': range(5), 'col2': ['C', 'A', 'A', 'B', 'C']})
+
+    # Run and Assert
+    expected_warning = re.escape(
+        'Computing CAP metrics directly is deprecated. For improved privacy metrics, '
+        "please use the 'DisclosureProtection' and 'DisclosureProtectionEstimate' "
+        'metrics instead.'
+    )
+    with pytest.warns(DeprecationWarning, match=expected_warning):
+        metric.compute(real_data, synthetic_data, key_fields=['col1'], sensitive_fields=['col2'])
diff --git a/tests/unit/single_table/privacy/test_disclosure_protection.py b/tests/unit/single_table/privacy/test_disclosure_protection.py
index 5cbf19ff..b65cb13b 100644
--- a/tests/unit/single_table/privacy/test_disclosure_protection.py
+++ b/tests/unit/single_table/privacy/test_disclosure_protection.py
@@ -232,7 +232,7 @@ def test_compute_breakdown(self, CAPMethodsMock):
             'col3': range(-2, 8),
         })
         CAPMock = Mock()
-        CAPMock.compute.return_value = 0.9
+        CAPMock._compute.return_value = 0.9
         CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
         CAPMethodsMock.get.return_value = CAPMock
 
@@ -266,7 +266,7 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
             'col2': ['A'] * 10,
         })
         CAPMock = Mock()
-        CAPMock.compute.return_value = 0.5
+        CAPMock._compute.return_value = 0.5
         CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
         CAPMethodsMock.get.return_value = CAPMock
 
@@ -278,7 +278,7 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
             sensitive_column_names=['col2'],
         )
 
-        CAPMock.compute.return_value = 0
+        CAPMock._compute.return_value = 0
         score_breakdown_no_cap = DisclosureProtection.compute_breakdown(
             real_data=real_data,
             synthetic_data=synthetic_data,
@@ -294,6 +294,53 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
         }
         assert score_breakdown_no_cap == {'score': 0, 'baseline_protection': 0, 'cap_protection': 0}
 
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection._compute_baseline'
+    )
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection._discretize_and_fillna'
+    )
+    def test_compute_breakdown_warns_too_large(
+        self, mock_discretize_and_fillna, mock_compute_baseline, CAPMethodsMock
+    ):
+        """Test that ``compute_breakdown`` warns if the data is too large."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=50001),
+            'col2': range(50001),
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=50001),
+            'col2': range(50001),
+        })
+        CAPMock = Mock()
+        CAPMock._compute.return_value = 0.5
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+        mock_compute_baseline.return_value = 0.5
+        mock_discretize_and_fillna.return_value = (real_data, synthetic_data)
+
+        # Run
+        expected_warning = re.escape(
+            'Data exceeds 50000 rows, performance may be slow. '
+            'Consider using the `DisclosureProtectionEstimate` for faster computation.'
+        )
+        with pytest.warns(UserWarning, match=expected_warning):
+            score_breakdown = DisclosureProtection.compute_breakdown(
+                real_data=real_data,
+                synthetic_data=synthetic_data,
+                known_column_names=['col1'],
+                sensitive_column_names=['col2'],
+            )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1,
+            'baseline_protection': 0.5,
+            'cap_protection': 0.5,
+        }
+
     @patch(
         'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection.compute_breakdown'
     )
@@ -376,7 +423,7 @@ def test__compute_estimated_cap_metric(self, CAPMethodsMock, mock_tqdm):
             'col2': np.random.choice(['X', 'Y'], size=100),
         })
         CAPMock = Mock()
-        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMock._compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
         CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
         CAPMethodsMock.get.return_value = CAPMock
         progress_bar = MagicMock()
@@ -421,7 +468,7 @@ def test__compute_estimated_cap_metric_zero_baseline(self, CAPMethodsMock):
             'col2': ['A'] * 100,
         })
         CAPMock = Mock()
-        CAPMock.compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMock._compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
         CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
         CAPMethodsMock.get.return_value = CAPMock
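Reviewer note: the sketch below shows the call-pattern migration this patch encourages. It is a minimal illustration with toy data, not part of the patch. The deprecated entry point and its `key_fields`/`sensitive_fields` parameters are taken from the diff above; `DisclosureProtection.compute` is assumed to accept the same `known_column_names`/`sensitive_column_names` arguments as the `compute_breakdown` method shown in the diff.

```python
# Migration sketch: deprecated CategoricalCAP.compute vs. the recommended
# DisclosureProtection.compute. Toy DataFrames for illustration only.
import pandas as pd

from sdmetrics.single_table.privacy.cap import CategoricalCAP
from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection

real_data = pd.DataFrame({'key': ['a', 'b', 'c', 'a'], 'secret': ['x', 'y', 'x', 'y']})
synthetic_data = pd.DataFrame({'key': ['b', 'c', 'a', 'a'], 'secret': ['y', 'x', 'y', 'x']})

# Old path: still functional after this patch, but it now emits
# DEPRECATION_MSG as a DeprecationWarning before delegating to _compute().
cap_score = CategoricalCAP.compute(
    real_data,
    synthetic_data,
    key_fields=['key'],
    sensitive_fields=['secret'],
)

# Recommended path: the replacement metric named in the warning. Internally
# it calls cap_metric._compute(), so it does not re-trigger the deprecation.
dp_score = DisclosureProtection.compute(
    real_data,
    synthetic_data,
    known_column_names=['key'],
    sensitive_column_names=['secret'],
)
```

Routing the internal callers through the private `_compute` wrapper is what keeps `DisclosureProtection` and `DisclosureProtectionEstimate` from warning about code they themselves recommend; only direct user calls to `compute` hit `warnings.warn`.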