Skip to content

Commit

Permalink
Add KeyUniqueness metric (#474)
Browse files: browse the repository at this point in the history
  • Loading branch information
R-Palazzo committed Oct 26, 2023
1 parent a388ddb commit ddd8f22
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 0 deletions.
2 changes: 2 additions & 0 deletions sdmetrics/single_column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence
from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage
from sdmetrics.single_column.statistical.cstest import CSTest
from sdmetrics.single_column.statistical.key_uniqueness import KeyUniqueness
from sdmetrics.single_column.statistical.kscomplement import KSComplement
from sdmetrics.single_column.statistical.missing_value_similarity import MissingValueSimilarity
from sdmetrics.single_column.statistical.range_coverage import RangeCoverage
Expand All @@ -19,6 +20,7 @@
'CategoryCoverage',
'CategoryAdherence',
'CSTest',
'KeyUniqueness',
'KSComplement',
'MissingValueSimilarity',
'RangeCoverage',
Expand Down
2 changes: 2 additions & 0 deletions sdmetrics/single_column/statistical/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence
from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage
from sdmetrics.single_column.statistical.cstest import CSTest
from sdmetrics.single_column.statistical.key_uniqueness import KeyUniqueness
from sdmetrics.single_column.statistical.kscomplement import KSComplement
from sdmetrics.single_column.statistical.missing_value_similarity import MissingValueSimilarity
from sdmetrics.single_column.statistical.range_coverage import RangeCoverage
Expand All @@ -15,6 +16,7 @@
'CategoryCoverage',
'CategoryAdherence',
'CSTest',
'KeyUniqueness',
'KSComplement',
'MissingValueSimilarity',
'RangeCoverage',
Expand Down
69 changes: 69 additions & 0 deletions sdmetrics/single_column/statistical/key_uniqueness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Key Uniqueness Metric."""
import logging

from sdmetrics.goal import Goal
from sdmetrics.single_column.base import SingleColumnMetric

# Module-level logger named after this module's import path (``__name__``).
LOGGER = logging.getLogger(__name__)


class KeyUniqueness(SingleColumnMetric):
    """Key uniqueness metric.

    The proportion of data points in the synthetic data that are unique.
    A synthetic value counts against the score when it is missing
    (``isna()``) or when it repeats a value seen earlier in the column
    (``duplicated()``); the first occurrence of a repeated value still
    counts as unique.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'KeyUniqueness'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0

    @classmethod
    def compute_breakdown(cls, real_data, synthetic_data):
        """Compute the score breakdown of the key uniqueness metric.

        The real data does not influence the score; it is only inspected so
        that an informational message can be logged when it contains missing
        or duplicated values itself.

        Args:
            real_data (pandas.Series):
                The real data.
            synthetic_data (pandas.Series):
                The synthetic data.

        Returns:
            dict:
                The score breakdown of the key uniqueness metric, as
                ``{'score': value}``. The score is NaN when the synthetic
                data is empty, because the proportion is undefined.
        """
        has_duplicates = real_data.duplicated().any()
        has_nans = real_data.isna().any()
        if has_duplicates or has_nans:
            LOGGER.info('The real data contains NA or duplicate values.')

        # Use an explicit length check: the truth value of a Series is
        # ambiguous, and an empty column would otherwise divide 0 by 0.
        total_rows = len(synthetic_data)
        if total_rows == 0:
            return {'score': float('nan')}

        nans_or_duplicates_synthetic = synthetic_data.duplicated() | synthetic_data.isna()
        score = 1 - nans_or_duplicates_synthetic.sum() / total_rows

        return {'score': score}

    @classmethod
    def compute(cls, real_data, synthetic_data):
        """Compute the key uniqueness metric.

        Args:
            real_data (pandas.Series):
                The real data.
            synthetic_data (pandas.Series):
                The synthetic data.

        Returns:
            float:
                The proportion of data points in the synthetic data that are unique.
        """
        return cls.compute_breakdown(real_data, synthetic_data)['score']
54 changes: 54 additions & 0 deletions tests/unit/single_column/statistical/test_key_uniqueness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from unittest.mock import patch

import numpy as np
import pandas as pd

from sdmetrics.single_column.statistical import KeyUniqueness


class TestKeyUniqueness:
    """Unit tests for the ``KeyUniqueness`` single-column metric."""

    def test_compute_breakdown(self):
        """``compute_breakdown`` penalizes missing and repeated synthetic values."""
        # Setup
        real = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        synthetic = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None])
        instance = KeyUniqueness()

        # Run
        breakdown = instance.compute_breakdown(real, synthetic)

        # Assert
        expected_breakdown = {'score': 0.5}
        assert breakdown == expected_breakdown

    def test_compute_breakdown_with_duplicates_in_real_data(self):
        """``compute_breakdown`` logs an info message when real data has duplicates."""
        # Setup
        real = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2)
        synthetic = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None])
        instance = KeyUniqueness()
        logger_target = 'sdmetrics.single_column.statistical.key_uniqueness.LOGGER'

        # Run
        with patch(logger_target) as logger_mock:
            instance.compute_breakdown(real, synthetic)

        # Assert
        logger_mock.info.assert_called_once_with(
            'The real data contains NA or duplicate values.'
        )

    def test_compute(self):
        """``compute`` returns the ``score`` entry produced by ``compute_breakdown``."""
        # Setup
        real = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        synthetic = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None])
        instance = KeyUniqueness()
        breakdown_target = (
            'sdmetrics.single_column.statistical.key_uniqueness.KeyUniqueness.compute_breakdown'
        )

        # Run
        with patch(breakdown_target) as compute_breakdown_mock:
            compute_breakdown_mock.return_value = {'score': 0.6}
            score = instance.compute(real, synthetic)

        # Assert
        compute_breakdown_mock.assert_called_once_with(real, synthetic)
        assert score == 0.6

0 comments on commit ddd8f22

Please sign in to comment.