diff --git a/sdmetrics/single_column/__init__.py b/sdmetrics/single_column/__init__.py
index e12393e9..563ea574 100644
--- a/sdmetrics/single_column/__init__.py
+++ b/sdmetrics/single_column/__init__.py
@@ -6,6 +6,7 @@
 from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence
 from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage
 from sdmetrics.single_column.statistical.cstest import CSTest
+from sdmetrics.single_column.statistical.key_uniqueness import KeyUniqueness
 from sdmetrics.single_column.statistical.kscomplement import KSComplement
 from sdmetrics.single_column.statistical.missing_value_similarity import MissingValueSimilarity
 from sdmetrics.single_column.statistical.range_coverage import RangeCoverage
@@ -19,6 +20,7 @@
     'CategoryCoverage',
     'CategoryAdherence',
     'CSTest',
+    'KeyUniqueness',
     'KSComplement',
     'MissingValueSimilarity',
     'RangeCoverage',
diff --git a/sdmetrics/single_column/statistical/__init__.py b/sdmetrics/single_column/statistical/__init__.py
index 9838ac2e..252cd6ac 100644
--- a/sdmetrics/single_column/statistical/__init__.py
+++ b/sdmetrics/single_column/statistical/__init__.py
@@ -4,6 +4,7 @@
 from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence
 from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage
 from sdmetrics.single_column.statistical.cstest import CSTest
+from sdmetrics.single_column.statistical.key_uniqueness import KeyUniqueness
 from sdmetrics.single_column.statistical.kscomplement import KSComplement
 from sdmetrics.single_column.statistical.missing_value_similarity import MissingValueSimilarity
 from sdmetrics.single_column.statistical.range_coverage import RangeCoverage
@@ -15,6 +16,7 @@
     'CategoryCoverage',
     'CategoryAdherence',
     'CSTest',
+    'KeyUniqueness',
     'KSComplement',
     'MissingValueSimilarity',
     'RangeCoverage',
diff --git a/sdmetrics/single_column/statistical/key_uniqueness.py b/sdmetrics/single_column/statistical/key_uniqueness.py
new file mode 100644
index 00000000..c973cbbb
--- /dev/null
+++ b/sdmetrics/single_column/statistical/key_uniqueness.py
@@ -0,0 +1,84 @@
+"""Key Uniqueness Metric."""
+import logging
+
+import numpy as np
+
+from sdmetrics.goal import Goal
+from sdmetrics.single_column.base import SingleColumnMetric
+
+LOGGER = logging.getLogger(__name__)
+
+
+class KeyUniqueness(SingleColumnMetric):
+    """Key uniqueness metric.
+
+    The proportion of rows in the synthetic data that hold a valid key value.
+    A row is invalid when its value is missing or repeats a value seen earlier
+    in the column, so the first occurrence of each value still counts as valid.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'KeyUniqueness'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @classmethod
+    def compute_breakdown(cls, real_data, synthetic_data):
+        """Compute the score breakdown of the key uniqueness metric.
+
+        The real data does not affect the score. It is only checked so that a
+        message can be logged when it contains missing or duplicated values.
+
+        Args:
+            real_data (pandas.Series):
+                The real data.
+            synthetic_data (pandas.Series):
+                The synthetic data.
+
+        Returns:
+            dict:
+                The score breakdown of the key uniqueness metric. The score is
+                ``NaN`` when the synthetic data is empty.
+        """
+        has_duplicates = real_data.duplicated().any()
+        has_nans = real_data.isna().any()
+        if has_duplicates or has_nans:
+            LOGGER.info('The real data contains NA or duplicate values.')
+
+        # The proportion is undefined for an empty column. Return NaN explicitly
+        # instead of relying on a 0 / 0 division, which emits a RuntimeWarning.
+        if len(synthetic_data) == 0:
+            return {'score': np.nan}
+
+        # ``duplicated`` flags every repetition after the first occurrence.
+        nans_or_duplicates_synthetic = synthetic_data.duplicated() | synthetic_data.isna()
+        score = 1 - nans_or_duplicates_synthetic.sum() / len(synthetic_data)
+
+        return {'score': score}
+
+    @classmethod
+    def compute(cls, real_data, synthetic_data):
+        """Compute the key uniqueness metric.
+
+        Args:
+            real_data (pandas.Series):
+                The real data.
+            synthetic_data (pandas.Series):
+                The synthetic data.
+
+        Returns:
+            float:
+                The proportion of rows in the synthetic data that hold a valid key
+                value, or ``NaN`` when the synthetic data is empty.
+        """
+        return cls.compute_breakdown(real_data, synthetic_data)['score']
diff --git a/tests/unit/single_column/statistical/test_key_uniqueness.py b/tests/unit/single_column/statistical/test_key_uniqueness.py
new file mode 100644
index 00000000..0909b2b0
--- /dev/null
+++ b/tests/unit/single_column/statistical/test_key_uniqueness.py
@@ -0,0 +1,68 @@
+from unittest.mock import patch
+
+import numpy as np
+import pandas as pd
+
+from sdmetrics.single_column.statistical import KeyUniqueness
+
+
+class TestKeyUniqueness:
+
+    def test_compute_breakdown(self):
+        """Test the ``compute_breakdown`` method."""
+        # Setup
+        real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+        synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None])
+
+        metric = KeyUniqueness()
+
+        # Run
+        result = metric.compute_breakdown(real_data, synthetic_data)
+
+        # Assert
+        assert result == {'score': 0.5}
+
+    def test_compute_breakdown_empty_synthetic_data(self):
+        """Test the ``compute_breakdown`` method with empty synthetic data."""
+        # Setup
+        real_data = pd.Series([1, 2, 3])
+        synthetic_data = pd.Series([], dtype='float64')
+        metric = KeyUniqueness()
+
+        # Run
+        result = metric.compute_breakdown(real_data, synthetic_data)
+
+        # Assert
+        assert list(result) == ['score']
+        assert np.isnan(result['score'])
+
+    @patch('sdmetrics.single_column.statistical.key_uniqueness.LOGGER')
+    def test_compute_breakdown_with_duplicates_in_real_data(self, logger_mock):
+        """Test the ``compute_breakdown`` method with duplicates in the real data."""
+        # Setup
+        real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2)
+        synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None])
+        metric = KeyUniqueness()
+
+        # Run
+        metric.compute_breakdown(real_data, synthetic_data)
+
+        # Assert
+        expected_message = 'The real data contains NA or duplicate values.'
+        logger_mock.info.assert_called_once_with(expected_message)
+
+    @patch('sdmetrics.single_column.statistical.key_uniqueness.KeyUniqueness.compute_breakdown')
+    def test_compute(self, compute_breakdown_mock):
+        """Test the ``compute`` method."""
+        # Setup
+        real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+        synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None])
+        metric = KeyUniqueness()
+        compute_breakdown_mock.return_value = {'score': 0.6}
+
+        # Run
+        result = metric.compute(real_data, synthetic_data)
+
+        # Assert
+        compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data)
+        assert result == 0.6