Skip to content

Commit

Permalink
add CardinalityBoundaryAdherence metric
Browse files Browse the repository at this point in the history
  • Loading branch information
frances-h committed Oct 26, 2023
1 parent a8e2d0f commit 1160220
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 0 deletions.
3 changes: 3 additions & 0 deletions sdmetrics/column_pairs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
"""Metrics to compare column pairs."""

from sdmetrics.column_pairs.base import ColumnPairsMetric
from sdmetrics.column_pairs.statistical.cardinality_boundary_adherence import (
CardinalityBoundaryAdherence)
from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity
from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
from sdmetrics.column_pairs.statistical.kl_divergence import (
ContinuousKLDivergence, DiscreteKLDivergence)

__all__ = [
'CardinalityBoundaryAdherence',
'ColumnPairsMetric',
'ContingencySimilarity',
'ContinuousKLDivergence',
Expand Down
3 changes: 3 additions & 0 deletions sdmetrics/column_pairs/statistical/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
"""Statistical Metrics to compare column pairs."""

from sdmetrics.column_pairs.statistical.cardinality_boundary_adherence import (
CardinalityBoundaryAdherence)
from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity
from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
from sdmetrics.column_pairs.statistical.kl_divergence import (
ContinuousKLDivergence, DiscreteKLDivergence)

__all__ = [
'CardinalityBoundaryAdherence',
'ContingencySimilarity',
'ContinuousKLDivergence',
'CorrelationSimilarity',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""ColumnPair metric based on cardinality boundary adherence."""

import pandas as pd

from sdmetrics.column_pairs.base import ColumnPairsMetric
from sdmetrics.goal import Goal


class CardinalityBoundaryAdherence(ColumnPairsMetric):
    """Cardinality Boundary Adherence metric.

    Computes the percentage of synthetic parents whose cardinality
    falls within the min/max range of cardinality in the real data.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    # These have to be assignments. A bare annotation such as ``name: '...'``
    # only adds an entry to ``__annotations__`` and never creates the attribute.
    name = 'CardinalityBoundaryAdherence'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0

    @staticmethod
    def _get_cardinality(primary_keys, foreign_keys):
        """Count how many foreign key rows reference each primary key.

        Args:
            primary_keys (pd.Series):
                The primary key values of the parent table.
            foreign_keys (pd.Series):
                The foreign key values of the child table.

        Returns:
            pd.Series:
                One entry per primary key holding its number of children. Primary
                keys that never appear in ``foreign_keys`` get a cardinality of 0.
        """
        cardinality = pd.DataFrame(index=primary_keys.copy())
        # Assignment aligns the counts on the primary key index; parents without
        # any child are missing from ``value_counts`` and therefore become NaN.
        cardinality['cardinality'] = foreign_keys.value_counts()
        return cardinality['cardinality'].fillna(0)

    @staticmethod
    def compute_breakdown(real_data, synthetic_data):
        """Calculate the percentage of synthetic parents with cardinality in the correct range.

        Args:
            real_data (tuple(pd.Series, pd.Series)):
                A tuple with the real primary key Series as the first element and real
                foreign keys Series as the second element.
            synthetic_data (tuple(pd.Series, pd.Series)):
                A tuple with the synthetic primary key as the first element and synthetic
                foreign keys as the second element.

        Returns:
            dict:
                A dict with a single ``'score'`` entry holding the fraction of synthetic
                parents whose cardinality lies within the real min/max range (inclusive).
                The score is NaN when there are no synthetic parents.
        """
        real_cardinality = CardinalityBoundaryAdherence._get_cardinality(
            real_data[0], real_data[1])
        synthetic_cardinality = CardinalityBoundaryAdherence._get_cardinality(
            synthetic_data[0], synthetic_data[1])

        total_parents = len(synthetic_cardinality)
        if total_parents == 0:
            # Without synthetic parents the ratio is undefined; report NaN
            # instead of failing with a ZeroDivisionError.
            return {'score': float('nan')}

        min_cardinality = real_cardinality.min()
        max_cardinality = real_cardinality.max()

        # ``between`` is inclusive on both ends by default.
        in_range = synthetic_cardinality.between(min_cardinality, max_cardinality)
        score = float(in_range.sum() / total_parents)

        return {'score': score}

    @classmethod
    def compute(cls, real_data, synthetic_data):
        """Calculate the percentage of synthetic parents with cardinality in the correct range.

        Args:
            real_data (tuple(pd.Series, pd.Series)):
                A tuple with the real primary key Series as the first element and real
                foreign keys Series as the second element.
            synthetic_data (tuple(pd.Series, pd.Series)):
                A tuple with the synthetic primary key as the first element and synthetic
                foreign keys as the second element.

        Returns:
            float:
                The ``'score'`` value produced by ``compute_breakdown``.
        """
        return cls.compute_breakdown(real_data, synthetic_data)['score']
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

import pandas as pd

from sdmetrics.column_pairs.statistical import CardinalityBoundaryAdherence


class TestCardinalityBoundaryAdherence:
    """Unit tests for the ``CardinalityBoundaryAdherence`` metric."""

    def test_compute_breakdown(self):
        """Check that ``compute_breakdown`` returns the score wrapped in a dict."""
        # Setup
        # Real cardinalities range from 1 to 2 children per parent.
        real_data = (
            pd.Series([1, 2, 3, 4, 5]),
            pd.Series([1, 1, 2, 3, 4, 5, 5]),
        )
        # Synthetic parents 1 (no children) and 2 (three children) are out of range.
        synthetic_data = (
            pd.Series([1, 2, 3, 4, 5]),
            pd.Series([2, 2, 2, 3, 4, 5]),
        )
        instance = CardinalityBoundaryAdherence()

        # Run
        breakdown = instance.compute_breakdown(real_data, synthetic_data)

        # Assert
        expected = {'score': 0.6}
        assert breakdown == expected

    def test_compute(self):
        """Check that ``compute`` returns the bare score value."""
        # Setup
        # Real cardinalities range from 1 to 2 children per parent.
        real_data = (
            pd.Series([1, 2, 3, 4, 5]),
            pd.Series([1, 1, 2, 3, 4, 5, 5]),
        )
        # Synthetic parents 1 (no children) and 2 (three children) are out of range.
        synthetic_data = (
            pd.Series([1, 2, 3, 4, 5]),
            pd.Series([2, 2, 2, 3, 4, 5]),
        )
        instance = CardinalityBoundaryAdherence()

        # Run
        score = instance.compute(real_data, synthetic_data)

        # Assert
        assert score == 0.6

0 comments on commit 1160220

Please sign in to comment.