Skip to content

Commit

Permalink
def + test
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo committed Oct 20, 2023
1 parent 99cb1e4 commit e72e903
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 0 deletions.
2 changes: 2 additions & 0 deletions sdmetrics/single_column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from sdmetrics.single_column import base
from sdmetrics.single_column.base import SingleColumnMetric
from sdmetrics.single_column.statistical.boundary_adherence import BoundaryAdherence
from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence
from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage
from sdmetrics.single_column.statistical.cstest import CSTest
from sdmetrics.single_column.statistical.kscomplement import KSComplement
Expand All @@ -16,6 +17,7 @@
'SingleColumnMetric',
'BoundaryAdherence',
'CategoryCoverage',
'CategoryAdherence',
'CSTest',
'KSComplement',
'MissingValueSimilarity',
Expand Down
2 changes: 2 additions & 0 deletions sdmetrics/single_column/statistical/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Univariate goodness-of-fit tests."""

from sdmetrics.single_column.statistical.boundary_adherence import BoundaryAdherence
from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence
from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage
from sdmetrics.single_column.statistical.cstest import CSTest
from sdmetrics.single_column.statistical.kscomplement import KSComplement
Expand All @@ -12,6 +13,7 @@
__all__ = [
'BoundaryAdherence',
'CategoryCoverage',
'CategoryAdherence',
'CSTest',
'KSComplement',
'MissingValueSimilarity',
Expand Down
78 changes: 78 additions & 0 deletions sdmetrics/single_column/statistical/category_adherence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Category Adherence Metric."""

import numpy as np

from sdmetrics.goal import Goal
from sdmetrics.single_column.base import SingleColumnMetric


class CategoryAdherence(SingleColumnMetric):
    """Category adherence metric.

    Measure the fraction of synthetic values that are also present in the real data.
    Missing-value markers are converted to ``NaN`` in both columns before the comparison,
    so a missing synthetic value is compared against the missing values of the real data.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'CategoryAdherence'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0

    @classmethod
    def compute_breakdown(cls, real_data, synthetic_data):
        """Compute the score breakdown of the category adherence metric.

        Args:
            real_data (pandas.Series):
                The real data.
            synthetic_data (pandas.Series):
                The synthetic data.

        Returns:
            dict:
                A dictionary with a single ``'score'`` key holding the fraction of
                synthetic values found among the real values.
        """
        # Use one missing-value marker (``NaN``) on both sides so that ``None`` and
        # ``NaN`` are not treated as two different categories.
        known_categories = real_data.fillna(np.nan)
        candidates = synthetic_data.fillna(np.nan)

        # Boolean mask: True where the synthetic value exists in the real column.
        is_known = candidates.isin(known_categories)

        return {'score': is_known.mean()}

    @classmethod
    def compute(cls, real_data, synthetic_data):
        """Compute the category adherence of two columns.

        Args:
            real_data (pandas.Series):
                The real data.
            synthetic_data (pandas.Series):
                The synthetic data.

        Returns:
            float:
                The category adherence score, i.e. the ``'score'`` entry of
                ``compute_breakdown``.
        """
        breakdown = cls.compute_breakdown(real_data, synthetic_data)
        return breakdown['score']

    @classmethod
    def normalize(cls, raw_score):
        """Normalize the ``raw_score`` by delegating to the parent class.

        Args:
            raw_score (float):
                The value of the metric from ``compute``.

        Returns:
            float:
                The value returned by ``SingleColumnMetric.normalize`` for ``raw_score``.
        """
        return super().normalize(raw_score)
68 changes: 68 additions & 0 deletions tests/unit/single_column/statistical/test_category_adherence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from unittest.mock import patch

import numpy as np
import pandas as pd

from sdmetrics.single_column.statistical import CategoryAdherence


class TestCategoryAdherence:
    """Unit tests for the ``CategoryAdherence`` metric."""

    def test_compute_breakdown(self):
        """``compute_breakdown`` scores the share of synthetic values seen in the real data."""
        # Setup
        real = pd.Series(['A', 'B', 'C', 'B', 'A'])
        synthetic = pd.Series(['A', 'B', 'C', 'D', 'E'])
        metric = CategoryAdherence()

        # Run
        breakdown = metric.compute_breakdown(real, synthetic)

        # Assert: 'A', 'B' and 'C' are known categories, 'D' and 'E' are not.
        assert breakdown == {'score': 0.6}

    def test_compute_breakdown_with_nans(self):
        """``compute_breakdown`` handles ``None`` and ``NaN`` values in both columns."""
        # Setup
        real = pd.Series(['A', 'B', 'C', 'B', 'A', None])
        synthetic = pd.Series(['A', 'B', np.nan, 'C', np.nan, 'B', 'A', None, 'D', 'C'])
        metric = CategoryAdherence()

        # Run
        breakdown = metric.compute_breakdown(real, synthetic)

        # Assert: only 'D' is absent from the real data.
        assert breakdown == {'score': 0.9}

    @patch('sdmetrics.single_column.statistical.category_adherence.'
           'CategoryAdherence.compute_breakdown')
    def test_compute(self, compute_breakdown_mock):
        """``compute`` returns the ``'score'`` entry produced by ``compute_breakdown``."""
        # Setup
        compute_breakdown_mock.return_value = {'score': 0.6}
        real = pd.Series(['A', 'B', 'C', 'B', 'A'])
        synthetic = pd.Series(['A', 'B', 'C', 'D', 'E'])
        metric = CategoryAdherence()

        # Run
        score = metric.compute(real, synthetic)

        # Assert
        compute_breakdown_mock.assert_called_once_with(real, synthetic)
        assert score == 0.6

    @patch('sdmetrics.single_column.statistical.category_adherence.SingleColumnMetric.normalize')
    def test_normalize(self, normalize_mock):
        """``normalize`` forwards the raw score to ``SingleColumnMetric.normalize``."""
        # Setup
        raw_score = 0.9
        metric = CategoryAdherence()

        # Run
        normalized = metric.normalize(raw_score)

        # Assert
        normalize_mock.assert_called_once_with(raw_score)
        assert normalized == normalize_mock.return_value

0 comments on commit e72e903

Please sign in to comment.