From 0bf95d9d5b36961502e462973c46f4fb6d0b9e80 Mon Sep 17 00:00:00 2001
From: Frances Hartwell
Date: Tue, 10 Dec 2024 10:33:15 -0500
Subject: [PATCH] Add `DisclosureProtectionEstimate` metric (#686)

---
 sdmetrics/single_table/__init__.py           |   6 +-
 sdmetrics/single_table/privacy/__init__.py   |   6 +-
 sdmetrics/single_table/privacy/cap.py        | 224 +++++++++++++
 .../privacy/disclosure_protection.py         | 313 +++++++++++++++++-
 .../privacy/test_disclosure_protection.py    |  57 +++-
 tests/unit/single_table/privacy/test_cap.py  |  27 ++
 .../privacy/test_disclosure_protection.py    | 300 ++++++++++++++++-
 7 files changed, 901 insertions(+), 32 deletions(-)
 create mode 100644 tests/unit/single_table/privacy/test_cap.py

diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py
index d35f51e6..6bc54960 100644
--- a/sdmetrics/single_table/__init__.py
+++ b/sdmetrics/single_table/__init__.py
@@ -67,7 +67,10 @@
     CategoricalRF,
     CategoricalSVM,
 )
-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
 from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble
 from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR
 from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
@@ -111,6 +114,7 @@
     'CategoricalZeroCAP',
     'CategoricalGeneralizedCAP',
     'DisclosureProtection',
+    'DisclosureProtectionEstimate',
     'NumericalMLP',
     'NumericalLR',
     'NumericalSVR',
diff --git a/sdmetrics/single_table/privacy/__init__.py b/sdmetrics/single_table/privacy/__init__.py
index 667e178e..ac06b53c 100644
--- a/sdmetrics/single_table/privacy/__init__.py
+++ b/sdmetrics/single_table/privacy/__init__.py
@@ -12,7 +12,10 @@
     CategoricalRF,
     CategoricalSVM,
 )
-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
 from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble
 from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR
 from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
@@ -28,6 +31,7 @@
     'CategoricalSVM',
     'CategoricalZeroCAP',
     'DisclosureProtection',
+    'DisclosureProtectionEstimate',
     'NumericalLR',
     'NumericalMLP',
     'NumericalPrivacyMetric',
diff --git a/sdmetrics/single_table/privacy/cap.py b/sdmetrics/single_table/privacy/cap.py
index 2c26794f..6d224b28 100644
--- a/sdmetrics/single_table/privacy/cap.py
+++ b/sdmetrics/single_table/privacy/cap.py
@@ -1,8 +1,16 @@
 """CAP modules and their attackers."""
 
+import warnings
+
 from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, PrivacyAttackerModel
 from sdmetrics.single_table.privacy.util import closest_neighbors, count_frequency, majority
 
+DEPRECATION_MSG = (
+    'Computing CAP metrics directly is deprecated. For improved privacy metrics, '
+    "please use the 'DisclosureProtection' and 'DisclosureProtectionEstimate' "
+    'metrics instead.'
+)
+
 
 class CAPAttacker(PrivacyAttackerModel):
     """The CAP (Correct Attribution Probability) privacy attacker.
@@ -78,6 +86,78 @@ class CategoricalCAP(CategoricalPrivacyMetric):
     MODEL = CAPAttacker
     ACCURACY_BASE = False
 
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the key column(s) for the
+        attack.
+
+        A ``sensitive_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the sensitive_fields column(s)
+        for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+        """
+        warnings.warn(DEPRECATION_MSG, DeprecationWarning)
+        return cls._compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
 
 class ZeroCAPAttacker(CAPAttacker):
     """The 0CAP privacy attacker, which operates in the same way as CAP does.
@@ -113,6 +193,78 @@ class CategoricalZeroCAP(CategoricalPrivacyMetric):
     MODEL = ZeroCAPAttacker
     ACCURACY_BASE = False
 
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the key column(s) for the
+        attack.
+
+        A ``sensitive_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the sensitive_fields column(s)
+        for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+        """
+        warnings.warn(DEPRECATION_MSG, DeprecationWarning)
+        return cls._compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
 
 class GeneralizedCAPAttacker(CAPAttacker):
     """The GeneralizedCAP privacy attacker.
@@ -169,3 +321,75 @@ class CategoricalGeneralizedCAP(CategoricalPrivacyMetric):
     name = 'Categorical GeneralizedCAP'
     MODEL = GeneralizedCAPAttacker
     ACCURACY_BASE = False
+
+    @classmethod
+    def _compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        return super().compute(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            metadata=metadata,
+            key_fields=key_fields,
+            sensitive_fields=sensitive_fields,
+            model_kwargs=model_kwargs,
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        metadata=None,
+        key_fields=None,
+        sensitive_fields=None,
+        model_kwargs=None,
+    ):
+        """Compute this metric.
+
+        This fits an adversarial attacker model on the synthetic data and
+        then evaluates it by making predictions on the real data.
+
+        A ``key_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the key column(s) for the
+        attack.
+
+        A ``sensitive_fields`` column(s) name must be given, either directly or as a first level
+        entry in the ``metadata`` dict, which will be used as the sensitive_fields column(s)
+        for the attack.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict. If not passed, it is built based on the
+                real_data fields and dtypes.
+            key_fields (list(str)):
+                Name of the column(s) to use as the key attributes.
+            sensitive_fields (list(str)):
+                Name of the column(s) to use as the sensitive attributes.
+            model_kwargs (dict):
+                Keyword arguments for the attacker model. cls.MODEL_KWARGS will be used
+                if none is provided.
+
+        Returns:
+            Union[float, tuple[float]]:
+                Scores obtained by the attackers when evaluated on the real data.
+ """ + warnings.warn(DEPRECATION_MSG, DeprecationWarning) + return cls._compute( + real_data=real_data, + synthetic_data=synthetic_data, + metadata=metadata, + key_fields=key_fields, + sensitive_fields=sensitive_fields, + model_kwargs=model_kwargs, + ) diff --git a/sdmetrics/single_table/privacy/disclosure_protection.py b/sdmetrics/single_table/privacy/disclosure_protection.py index 6149a6b5..ea81c920 100644 --- a/sdmetrics/single_table/privacy/disclosure_protection.py +++ b/sdmetrics/single_table/privacy/disclosure_protection.py @@ -1,7 +1,10 @@ """Disclosure protection metrics.""" +import warnings + import numpy as np import pandas as pd +import tqdm from sdmetrics.goal import Goal from sdmetrics.single_table.base import SingleTableMetric @@ -11,6 +14,8 @@ CategoricalZeroCAP, ) +MAX_NUM_ROWS = 50000 + CAP_METHODS = { 'CAP': CategoricalCAP, 'ZERO_CAP': CategoricalZeroCAP, @@ -95,6 +100,56 @@ def _discretize_column(cls, real_column, synthetic_column, num_bins): return real_binned.to_numpy(), synthetic_binned.to_numpy() + @classmethod + def _discretize_and_fillna( + cls, + real_data, + synthetic_data, + known_column_names, + sensitive_column_names, + continuous_column_names, + num_discrete_bins, + ): + """Helper to discretize continous columns and convert null values to categories. + + Args: + real_data (pd.DataFrame): + A pd.DataFrame with the real data. + synthetic_data (pd.DataFrame): + A pd.DataFrame with the synthetic data. + known_column_names (list[str]): + A list with the string names of the columns that an attacker may already know. + sensitive_column_names (list[str]): + A list with the string names of the columns that an attacker wants to guess + (but does not already know). + continuous_column_names (list[str]): + A list of column names that represent continuous values (as opposed to discrete + values). These columns will be discretized. Defaults to None. + num_discrete_bins (int): + Number of bins to discretize continous columns in to. Defaults to 10. + + Returns: + tuple(pd.DataFrame, pd.DataFrame): + The pre-processed real and synthetic data. + """ + real_data = real_data.copy() + synthetic_data = synthetic_data.copy() + + # Discretize continous columns + if continuous_column_names is not None: + for col_name in continuous_column_names: + real_data[col_name], synthetic_data[col_name] = cls._discretize_column( + real_data[col_name], synthetic_data[col_name], num_discrete_bins + ) + + # Convert null values to own category + null_category_map = cls._get_null_categories( + real_data, synthetic_data, known_column_names + sensitive_column_names + ) + real_data = real_data.fillna(null_category_map) + synthetic_data = synthetic_data.fillna(null_category_map) + return real_data, synthetic_data + @classmethod def _compute_baseline(cls, real_data, sensitive_column_names): unique_categories_prod = np.prod([ @@ -153,30 +208,29 @@ def compute_breakdown( continuous_column_names, num_discrete_bins, ) - computation_method = computation_method.upper() - real_data = real_data.copy() - synthetic_data = synthetic_data.copy() - # Discretize continous columns - if continuous_column_names is not None: - for col_name in continuous_column_names: - real_data[col_name], synthetic_data[col_name] = cls._discretize_column( - real_data[col_name], synthetic_data[col_name], num_discrete_bins - ) + computation_method = computation_method.upper() + if len(real_data) > MAX_NUM_ROWS or len(synthetic_data) > MAX_NUM_ROWS: + warnings.warn( + f'Data exceeds {MAX_NUM_ROWS} rows, perfomance may be slow.' 
+                'Consider using the `DisclosureProtectionEstimate` for faster computation.'
+            )
 
-        # Convert null values to own category
-        null_category_map = cls._get_null_categories(
-            real_data, synthetic_data, known_column_names + sensitive_column_names
+        real_data, synthetic_data = cls._discretize_and_fillna(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            continuous_column_names,
+            num_discrete_bins,
         )
-        real_data = real_data.fillna(null_category_map)
-        synthetic_data = synthetic_data.fillna(null_category_map)
 
         # Compute baseline
         baseline_protection = cls._compute_baseline(real_data, sensitive_column_names)
 
         # Compute CAP metric
         cap_metric = CAP_METHODS.get(computation_method)
-        cap_protection = cap_metric.compute(
+        cap_protection = cap_metric._compute(
             real_data,
             synthetic_data,
             key_fields=known_column_names,
@@ -243,3 +297,232 @@ def compute(
             num_discrete_bins,
         )
         return score_breakdown['score']
+
+
+class DisclosureProtectionEstimate(DisclosureProtection):
+    """DisclosureProtectionEstimate metric."""
+
+    @classmethod
+    def _validate_inputs(
+        cls,
+        real_data,
+        synthetic_data,
+        known_column_names,
+        sensitive_column_names,
+        computation_method,
+        continuous_column_names,
+        num_discrete_bins,
+        num_rows_subsample,
+        num_iterations,
+    ):
+        super()._validate_inputs(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            computation_method,
+            continuous_column_names,
+            num_discrete_bins,
+        )
+        if not isinstance(num_rows_subsample, int) or num_rows_subsample <= 0:
+            raise ValueError('`num_rows_subsample` must be an integer greater than zero.')
+
+        if not isinstance(num_iterations, int) or num_iterations <= 0:
+            raise ValueError('`num_iterations` must be an integer greater than zero.')
+
+    @classmethod
+    def _compute_estimated_cap_metric(
+        cls,
+        real_data,
+        synthetic_data,
+        baseline_protection,
+        known_column_names,
+        sensitive_column_names,
+        computation_method,
+        num_rows_subsample,
+        num_iterations,
+        verbose,
+    ):
+        estimation_iterator = tqdm.tqdm(range(num_iterations), disable=(not verbose))
+        if verbose:
+            description = 'Estimating Disclosure Protection (Score={score:.3f})'
+            estimation_iterator.set_description(description.format(score=0))
+
+        cap_metric = CAP_METHODS.get(computation_method)
+        estimated_score_sum = 0
+        for i in estimation_iterator:
+            real_data_samp = real_data.sample(min(num_rows_subsample, len(real_data)))
+            synth_data_samp = synthetic_data.sample(min(num_rows_subsample, len(synthetic_data)))
+
+            estimated_cap_protection = cap_metric._compute(
+                real_data_samp,
+                synth_data_samp,
+                key_fields=known_column_names,
+                sensitive_fields=sensitive_column_names,
+            )
+            estimated_score_sum += estimated_cap_protection
+            average_computed_score = estimated_score_sum / (i + 1.0)
+            if baseline_protection == 0:
+                average_score = 0 if average_computed_score == 0 else 1
+            else:
+                average_score = min(average_computed_score / baseline_protection, 1)
+
+            if verbose:
+                estimation_iterator.set_description(description.format(score=average_score))
+
+        return average_score, average_computed_score
+
+    @classmethod
+    def compute_breakdown(
+        cls,
+        real_data,
+        synthetic_data,
+        known_column_names,
+        sensitive_column_names,
+        computation_method='cap',
+        continuous_column_names=None,
+        num_discrete_bins=10,
+        num_rows_subsample=1000,
+        num_iterations=10,
+        verbose=True,
+    ):
+        """Compute this metric breakdown.
+
+        Args:
+            real_data (pd.DataFrame):
+                A pd.DataFrame with the real data.
+            synthetic_data (pd.DataFrame):
+                A pd.DataFrame with the synthetic data.
+            known_column_names (list[str]):
+                A list with the string names of the columns that an attacker may already know.
+            sensitive_column_names (list[str]):
+                A list with the string names of the columns that an attacker wants to guess
+                (but does not already know).
+            computation_method (str, optional):
+                The type of computation we'll use to simulate the attack. Options are:
+                    - 'cap': Use the CAP method described in the original paper.
+                    - 'generalized_cap': Use the generalized CAP method.
+                    - 'zero_cap': Use the zero cap method.
+                Defaults to 'cap'.
+            continuous_column_names (list[str], optional):
+                A list of column names that represent continuous values (as opposed to discrete
+                values). These columns will be discretized. Defaults to None.
+            num_discrete_bins (int, optional):
+                Number of bins to discretize continuous columns into. Defaults to 10.
+            num_rows_subsample (int, optional):
+                The number of rows to subsample in each of the real and synthetic datasets per
+                iteration. Defaults to 1000 rows.
+            num_iterations (int, optional):
+                The number of iterations to run, each with a different subsample. Defaults to 10.
+            verbose (bool, optional):
+                Whether to show the progress bar. Defaults to True.
+
+        Returns:
+            dict:
+                Mapping of the metric output with the keys:
+                    - 'score': The overall score for the metric.
+                    - 'cap_protection': The protection score from the selected computation method.
+                    - 'baseline_protection': The baseline protection for the columns.
+        """
+        cls._validate_inputs(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            computation_method,
+            continuous_column_names,
+            num_discrete_bins,
+            num_rows_subsample,
+            num_iterations,
+        )
+        computation_method = computation_method.upper()
+        real_data, synthetic_data = cls._discretize_and_fillna(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            continuous_column_names,
+            num_discrete_bins,
+        )
+
+        # Compute baseline
+        baseline_protection = cls._compute_baseline(real_data, sensitive_column_names)
+
+        # Compute estimated CAP metric
+        average_score, average_computed_score = cls._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=baseline_protection,
+            known_column_names=known_column_names,
+            sensitive_column_names=sensitive_column_names,
+            computation_method=computation_method,
+            num_rows_subsample=num_rows_subsample,
+            num_iterations=num_iterations,
+            verbose=verbose,
+        )
+
+        return {
+            'score': average_score,
+            'cap_protection': average_computed_score,
+            'baseline_protection': baseline_protection,
+        }
+
+    @classmethod
+    def compute(
+        cls,
+        real_data,
+        synthetic_data,
+        known_column_names,
+        sensitive_column_names,
+        computation_method='cap',
+        continuous_column_names=None,
+        num_discrete_bins=10,
+        num_rows_subsample=1000,
+        num_iterations=10,
+        verbose=True,
+    ):
+        """Compute the DisclosureProtectionEstimate metric.
+
+        Args:
+            real_data (pd.DataFrame):
+                A pd.DataFrame with the real data.
+            synthetic_data (pd.DataFrame):
+                A pd.DataFrame with the synthetic data.
+            known_column_names (list[str]):
+                A list with the string names of the columns that an attacker may already know.
+            sensitive_column_names (list[str]):
+                A list with the string names of the columns that an attacker wants to guess
+                (but does not already know).
+            computation_method (str, optional):
+                The type of computation we'll use to simulate the attack. Options are:
+                    - 'cap': Use the CAP method described in the original paper.
+                    - 'generalized_cap': Use the generalized CAP method.
+                    - 'zero_cap': Use the zero cap method.
+                Defaults to 'cap'.
+            continuous_column_names (list[str], optional):
+                A list of column names that represent continuous values (as opposed to discrete
+                values). These columns will be discretized. Defaults to None.
+            num_discrete_bins (int, optional):
+                Number of bins to discretize continuous columns into. Defaults to 10.
+            num_rows_subsample (int, optional):
+                The number of rows to subsample in each of the real and synthetic datasets per
+                iteration. Defaults to 1000 rows.
+            num_iterations (int, optional):
+                The number of iterations to run, each with a different subsample. Defaults to 10.
+            verbose (bool, optional):
+                Whether to show the progress bar. Defaults to True.
+
+        Returns:
+            float:
+                The score for the DisclosureProtectionEstimate metric.
+        """
+        score_breakdown = cls.compute_breakdown(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            computation_method,
+            continuous_column_names,
+            num_discrete_bins,
+            num_rows_subsample,
+            num_iterations,
+            verbose,
+        )
+        return score_breakdown['score']
diff --git a/tests/integration/single_table/privacy/test_disclosure_protection.py b/tests/integration/single_table/privacy/test_disclosure_protection.py
index da1af6c3..2ec14f22 100644
--- a/tests/integration/single_table/privacy/test_disclosure_protection.py
+++ b/tests/integration/single_table/privacy/test_disclosure_protection.py
@@ -2,7 +2,10 @@
 import pandas as pd
 import pytest
 
-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
 
 
 @pytest.fixture
@@ -23,7 +26,7 @@ def perfect_synthetic_data():
         'key1': random_state.choice(['a', 'b', 'c', 'd', 'e'], 20),
         'key2': range(20),
         'sensitive1': random_state.choice(['f', 'g', 'h', 'i', 'j'], 20),
-        'sensitive2': random_state.randint(5, 10, size=20),
+        'sensitive2': random_state.randint(100, 200, size=20),
     })
 
 
@@ -142,3 +145,53 @@ def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
             'cap_protection': 1.0,
             'baseline_protection': 0.98,
         }
+
+
+class TestDisclosureProtectionEstimate:
+    def test_end_to_end_perfect(self, real_data, perfect_synthetic_data):
+        """Test DisclosureProtectionEstimate metric end to end with perfect synthetic data."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            num_discrete_bins=10,
+            num_rows_subsample=20,
+        )
+
+        # Assert
+        assert score_breakdown == {'score': 1, 'cap_protection': 1, 'baseline_protection': 0.98}
+
+    @pytest.mark.parametrize('cap_method', ['cap', 'zero_cap', 'generalized_cap'])
+    def test_all_cap_methods(self, cap_method, real_data, perfect_synthetic_data):
+        """Test DisclosureProtectionEstimate metric with all possible CAP methods."""
+        # Setup
+        sensitive_columns = ['sensitive1', 'sensitive2']
+        known_columns = ['key1', 'key2']
+        continuous_columns = ['key2', 'sensitive2']
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data,
+            perfect_synthetic_data,
+            sensitive_column_names=sensitive_columns,
+            known_column_names=known_columns,
+            continuous_column_names=continuous_columns,
+            computation_method=cap_method,
+            num_discrete_bins=10,
+            num_rows_subsample=20,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1.0,
+            'cap_protection': 1.0,
+            'baseline_protection': 0.98,
+        }
diff --git a/tests/unit/single_table/privacy/test_cap.py b/tests/unit/single_table/privacy/test_cap.py
new file mode 100644
index 00000000..d83f6260
--- /dev/null
+++ b/tests/unit/single_table/privacy/test_cap.py
@@ -0,0 +1,27 @@
+import re
+
+import pandas as pd
+import pytest
+
+from sdmetrics.single_table.privacy.cap import (
+    CategoricalCAP,
+    CategoricalGeneralizedCAP,
+    CategoricalZeroCAP,
+)
+
+
+@pytest.mark.parametrize('metric', [CategoricalCAP, CategoricalZeroCAP, CategoricalGeneralizedCAP])
+def test_CAP_deprecation_message(metric):
+    """Test deprecation warning is raised when running the metric directly."""
+    # Setup
+    real_data = pd.DataFrame({'col1': range(5), 'col2': ['A', 'B', 'C', 'A', 'B']})
+    synthetic_data = pd.DataFrame({'col1': range(5), 'col2': ['C', 'A', 'A', 'B', 'C']})
+
+    # Run and Assert
+    expected_warning = re.escape(
+        'Computing CAP metrics directly is deprecated. For improved privacy metrics, '
+        "please use the 'DisclosureProtection' and 'DisclosureProtectionEstimate' "
+        'metrics instead.'
+    )
+    with pytest.warns(DeprecationWarning, match=expected_warning):
+        metric.compute(real_data, synthetic_data, key_fields=['col1'], sensitive_fields=['col2'])
diff --git a/tests/unit/single_table/privacy/test_disclosure_protection.py b/tests/unit/single_table/privacy/test_disclosure_protection.py
index b35f6147..b65cb13b 100644
--- a/tests/unit/single_table/privacy/test_disclosure_protection.py
+++ b/tests/unit/single_table/privacy/test_disclosure_protection.py
@@ -1,13 +1,17 @@
 """Test for the disclosure metrics."""
 
 import re
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, call, patch
 
 import numpy as np
 import pandas as pd
 import pytest
 
-from sdmetrics.single_table.privacy.disclosure_protection import DisclosureProtection
+from sdmetrics.single_table.privacy.disclosure_protection import (
+    DisclosureProtection,
+    DisclosureProtectionEstimate,
+)
+from tests.utils import DataFrameMatcher
 
 
 class TestDisclosureProtection:
@@ -151,21 +155,51 @@ def test__discretize_column_float_dtypes(self, dtype):
         expected_synthetic = np.array(['0', '0', '1', np.nan, '3', np.nan, '4'], dtype='object')
         assert list(binned_synthetic) == list(expected_synthetic)
 
-    def test__compute_baseline(self):
-        """Test computing the baseline score for random data."""
+    def test__discretize_and_fillna(self):
+        """Test helper method to discretize continuous columns and fill nan values."""
         # Setup
         real_data = pd.DataFrame({
-            'col1': ['A', 'A', 'A', 'A', 'A'],
-            'col2': ['A', 'B', 'A', 'B', 'A'],
-            'col3': range(5),
+            'known': ['A', 'A', pd.NA, 'B', 'B'],
+            'continuous': [0, 1, 3, 8, 10],
+            'continuous_nan': [0, 7, 2, np.nan, 10],
+            'extra': [None, pd.NA, 0, 10, 100],
         })
-        sensitive_column_names = ['col1', 'col2']
+        synthetic_data = pd.DataFrame({
+            'known': ['A', 'A', 'B', 'B', None],
+            'continuous': [-1, 0, 3, 5, 11],
+            'continuous_nan': [0, 1, 2, np.nan, 100],
+            'extra': [None, pd.NA, 0, 10, 100],
+        })
+        known_column_names = ['known']
+        sensitive_column_names = ['continuous', 'continuous_nan']
+        continuous_column_names = ['continuous', 'continuous_nan']
+        num_discrete_bins = 5
 
         # Run
-        baseline_score = DisclosureProtection._compute_baseline(real_data, sensitive_column_names)
+        processed_real, processed_synthetic = DisclosureProtection._discretize_and_fillna(
+            real_data,
+            synthetic_data,
+            known_column_names,
+            sensitive_column_names,
+            continuous_column_names,
+            num_discrete_bins,
+        )
 
         # Assert
-        assert baseline_score == 0.5
+        expected_real = pd.DataFrame({
+            'known': ['A', 'A', '__NULL_VALUE__', 'B', 'B'],
+            'continuous': ['0', '0', '1', '3', '4'],
+            'continuous_nan': ['0', '3', '0', '__NULL_VALUE__', '4'],
+            'extra': real_data['extra'],
+        })
+        expected_synthetic = pd.DataFrame({
+            'known': ['A', 'A', 'B', 'B', '__NULL_VALUE__'],
+            'continuous': ['0', '0', '1', '2', '4'],
+            'continuous_nan': ['0', '0', '0', '__NULL_VALUE__', '4'],
+            'extra': synthetic_data['extra'],
+        })
+        pd.testing.assert_frame_equal(expected_real, processed_real)
+        pd.testing.assert_frame_equal(expected_synthetic, processed_synthetic)
 
     def test__compute_baseline(self):
         """Test computing the baseline score for random data."""
@@ -198,7 +232,7 @@ def test_compute_breakdown(self, CAPMethodsMock):
             'col3': range(-2, 8),
         })
         CAPMock = Mock()
-        CAPMock.compute.return_value = 0.9
+        CAPMock._compute.return_value = 0.9
         CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
         CAPMethodsMock.get.return_value = CAPMock
 
@@ -232,7 +266,7 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
             'col2': ['A'] * 10,
         })
         CAPMock = Mock()
-        CAPMock.compute.return_value = 0.5
+        CAPMock._compute.return_value = 0.5
         CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
         CAPMethodsMock.get.return_value = CAPMock
 
@@ -244,7 +278,7 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
             sensitive_column_names=['col2'],
         )
 
-        CAPMock.compute.return_value = 0
+        CAPMock._compute.return_value = 0
         score_breakdown_no_cap = DisclosureProtection.compute_breakdown(
             real_data=real_data,
             synthetic_data=synthetic_data,
@@ -260,6 +294,53 @@ def test_compute_breakdown_zero_baseline(self, CAPMethodsMock):
         }
         assert score_breakdown_no_cap == {'score': 0, 'baseline_protection': 0, 'cap_protection': 0}
 
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection._compute_baseline'
+    )
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection._discretize_and_fillna'
+    )
+    def test_compute_breakdown_warns_too_large(
+        self, mock_discretize_and_fillna, mock_compute_baseline, CAPMethodsMock
+    ):
+        """Test that ``compute_breakdown`` warns if the data is too large."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=50001),
+            'col2': range(50001),
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=50001),
+            'col2': range(50001),
+        })
+        CAPMock = Mock()
+        CAPMock._compute.return_value = 0.5
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+        mock_compute_baseline.return_value = 0.5
+        mock_discretize_and_fillna.return_value = (real_data, synthetic_data)
+
+        # Run
+        expected_warning = re.escape(
+            'Data exceeds 50000 rows, performance may be slow. '
+            'Consider using the `DisclosureProtectionEstimate` for faster computation.'
+        )
+        with pytest.warns(UserWarning, match=expected_warning):
+            score_breakdown = DisclosureProtection.compute_breakdown(
+                real_data=real_data,
+                synthetic_data=synthetic_data,
+                known_column_names=['col1'],
+                sensitive_column_names=['col2'],
+            )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 1,
+            'baseline_protection': 0.5,
+            'cap_protection': 0.5,
+        }
+
     @patch(
         'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtection.compute_breakdown'
     )
@@ -287,3 +368,196 @@ def test_compute(self, compute_breakdown_mock):
 
         # Assert
         assert score == 0.8
+
+
+class TestDisclosureProtectionEstimate:
+    def test__validate_inputs(self):
+        """Test input validation."""
+        # Setup
+        default_kwargs = {
+            'real_data': pd.DataFrame({'col1': range(5), 'col2': range(5)}),
+            'synthetic_data': pd.DataFrame({'col1': range(10), 'col2': range(10)}),
+            'known_column_names': ['col1'],
+            'sensitive_column_names': ['col2'],
+            'computation_method': 'cap',
+            'continuous_column_names': ['col2'],
+            'num_discrete_bins': 10,
+            'num_rows_subsample': 1000,
+            'num_iterations': 10,
+        }
+        bad_rows_subsample = 0
+        bad_num_iterations = 0
+
+        # Run and Assert
+        DisclosureProtectionEstimate._validate_inputs(**default_kwargs)
+
+        bad_rows_subsample_error = re.escape(
+            '`num_rows_subsample` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=bad_rows_subsample_error):
+            DisclosureProtectionEstimate._validate_inputs(**{
+                **default_kwargs,
+                'num_rows_subsample': bad_rows_subsample,
+            })
+
+        bad_num_iterations_error = re.escape(
+            '`num_iterations` must be an integer greater than zero.'
+        )
+        with pytest.raises(ValueError, match=bad_num_iterations_error):
+            DisclosureProtectionEstimate._validate_inputs(**{
+                **default_kwargs,
+                'num_iterations': bad_num_iterations,
+            })
+
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.tqdm')
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    def test__compute_estimated_cap_metric(self, CAPMethodsMock, mock_tqdm):
+        """Test the ``_compute_estimated_cap_metric`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
+            'col2': np.random.choice(['X', 'Y'], size=5),
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
+            'col2': np.random.choice(['X', 'Y'], size=100),
+        })
+        CAPMock = Mock()
+        CAPMock._compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+        progress_bar = MagicMock()
+        progress_bar.__iter__.return_value = range(5)
+        mock_tqdm.tqdm.return_value = progress_bar
+
+        # Run
+        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=0.5,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2'],
+            computation_method='CAP',
+            num_rows_subsample=10,
+            num_iterations=5,
+            verbose=True,
+        )
+
+        # Assert
+        assert avg_score == 0.76
+        assert avg_computed_score == 0.38
+        progress_bar.set_description.assert_has_calls([
+            call('Estimating Disclosure Protection (Score=0.000)'),
+            call('Estimating Disclosure Protection (Score=0.800)'),
+            call('Estimating Disclosure Protection (Score=0.900)'),
+            call('Estimating Disclosure Protection (Score=0.733)'),
+            call('Estimating Disclosure Protection (Score=0.850)'),
+            call('Estimating Disclosure Protection (Score=0.760)'),
+        ])
+
+    @patch('sdmetrics.single_table.privacy.disclosure_protection.CAP_METHODS')
+    def test__compute_estimated_cap_metric_zero_baseline(self, CAPMethodsMock):
+        """Test the ``_compute_estimated_cap_metric`` method with a zero baseline."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=5),
+            'col2': ['A'] * 5,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=100),
+            'col2': ['A'] * 100,
+        })
+        CAPMock = Mock()
+        CAPMock._compute.side_effect = [0.4, 0.5, 0.2, 0.6, 0.2]
+        CAPMethodsMock.keys.return_value = ['CAP', 'ZERO_CAP', 'GENERALIZED_CAP']
+        CAPMethodsMock.get.return_value = CAPMock
+
+        # Run
+        avg_score, avg_computed_score = DisclosureProtectionEstimate._compute_estimated_cap_metric(
+            real_data,
+            synthetic_data,
+            baseline_protection=0,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2'],
+            computation_method='CAP',
+            num_rows_subsample=10,
+            num_iterations=5,
+            verbose=False,
+        )
+
+        # Assert
+        assert avg_score == 1
+        assert avg_computed_score == 0.38
+
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate._compute_estimated_cap_metric'
+    )
+    def test_compute_breakdown(self, mock__compute_estimated_cap_metric):
+        """Test computing the breakdown."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['X', 'Y', 'Z', 'Y', 'X', 'X', 'Y', 'Z', 'X', 'A'],
+            'col3': ['A', 'B'] * 5,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': np.random.choice(['X', 'Y', 'Z', 'X', 'X'], size=10),
+            'col3': ['A'] * 10,
+        })
+        mock__compute_estimated_cap_metric.return_value = (0.8, 0.6)
+
+        # Run
+        score_breakdown = DisclosureProtectionEstimate.compute_breakdown(
+            real_data=real_data,
+            synthetic_data=synthetic_data,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2', 'col3'],
+            num_discrete_bins=2,
+        )
+
+        # Assert
+        assert score_breakdown == {
+            'score': 0.8,
+            'baseline_protection': 0.875,
+            'cap_protection': 0.6,
+        }
+        mock__compute_estimated_cap_metric.assert_called_once_with(
+            DataFrameMatcher(real_data),
+            DataFrameMatcher(synthetic_data),
+            baseline_protection=0.875,
+            known_column_names=['col1'],
+            sensitive_column_names=['col2', 'col3'],
+            computation_method='CAP',
+            num_rows_subsample=1000,
+            num_iterations=10,
+            verbose=True,
+        )
+
+    @patch(
+        'sdmetrics.single_table.privacy.disclosure_protection.DisclosureProtectionEstimate.compute_breakdown'
+    )
+    def test_compute(self, compute_breakdown_mock):
+        """Test the ``compute`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['A'] * 10,
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': np.random.choice(['A', 'B', 'C', 'D'], size=10),
+            'col2': ['A'] * 10,
+        })
+        compute_breakdown_mock.return_value = {
+            'score': 0.8,
+            'baseline_protection': 0.6,
+            'cap_protection': 0.64,
+        }
+
+        # Run
+        score = DisclosureProtectionEstimate.compute(
+            real_data, synthetic_data, known_column_names=['col1'], sensitive_column_names=['col2']
+        )
+
+        # Assert
+        assert score == 0.8
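
Usage sketch (not part of the patch): a minimal example of how a caller might invoke the new metric once this change lands. The data and column names below are illustrative, not taken from the changeset.

import pandas as pd

from sdmetrics.single_table import DisclosureProtectionEstimate

# Toy example: 'zip' is a key an attacker may already know; 'age' is sensitive
# and continuous, so it is discretized into bins before the CAP attack is simulated.
real = pd.DataFrame({'zip': ['10001', '10002'] * 50, 'age': list(range(100))})
synthetic = pd.DataFrame({'zip': ['10002', '10001'] * 50, 'age': list(range(100, 200))})

score = DisclosureProtectionEstimate.compute(
    real_data=real,
    synthetic_data=synthetic,
    known_column_names=['zip'],
    sensitive_column_names=['age'],
    continuous_column_names=['age'],
    num_rows_subsample=50,
    num_iterations=5,
)

# The score is the averaged subsampled CAP protection divided by the baseline
# protection, capped at 1; values near 1 indicate protection at or above baseline.
print(score)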
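
Similarly, a short sketch of the deprecation path this patch adds to cap.py: the public compute() still works but now emits the DeprecationWarning defined in DEPRECATION_MSG, while the renamed _compute() preserves the old behavior for internal callers such as DisclosureProtection. The data below is illustrative.

import warnings

import pandas as pd

from sdmetrics.single_table.privacy.cap import CategoricalCAP

real = pd.DataFrame({'key': ['a', 'b', 'a', 'b'], 'sensitive': ['x', 'y', 'x', 'y']})
synthetic = pd.DataFrame({'key': ['a', 'b', 'b', 'a'], 'sensitive': ['y', 'x', 'x', 'y']})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # The public entry point still returns a score, but now warns.
    CategoricalCAP.compute(real, synthetic, key_fields=['key'], sensitive_fields=['sensitive'])

# Exactly the behavior pinned down by the new test_cap.py above.
assert any(issubclass(w.category, DeprecationWarning) for w in caught)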