From 1160220850ef3bac27e61787e2ef74c882fed441 Mon Sep 17 00:00:00 2001
From: Frances Hartwell
Date: Thu, 26 Oct 2023 10:57:27 -0400
Subject: [PATCH] add CardinalityBoundaryAdherence metric

---
 sdmetrics/column_pairs/__init__.py         |  3 +
 .../column_pairs/statistical/__init__.py   |  3 +
 .../cardinality_boundary_adherence.py      | 80 +++++++++++++++++++
 .../test_cardinality_boundary_adherence.py | 43 ++++++++++
 4 files changed, 129 insertions(+)
 create mode 100644 sdmetrics/column_pairs/statistical/cardinality_boundary_adherence.py
 create mode 100644 tests/unit/column_pairs/statistical/test_cardinality_boundary_adherence.py

diff --git a/sdmetrics/column_pairs/__init__.py b/sdmetrics/column_pairs/__init__.py
index efa83ac8..f7bab951 100644
--- a/sdmetrics/column_pairs/__init__.py
+++ b/sdmetrics/column_pairs/__init__.py
@@ -1,12 +1,15 @@
 """Metrics to compare column pairs."""
 
 from sdmetrics.column_pairs.base import ColumnPairsMetric
+from sdmetrics.column_pairs.statistical.cardinality_boundary_adherence import (
+    CardinalityBoundaryAdherence)
 from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity
 from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
 from sdmetrics.column_pairs.statistical.kl_divergence import (
     ContinuousKLDivergence, DiscreteKLDivergence)
 
 __all__ = [
+    'CardinalityBoundaryAdherence',
     'ColumnPairsMetric',
     'ContingencySimilarity',
     'ContinuousKLDivergence',
diff --git a/sdmetrics/column_pairs/statistical/__init__.py b/sdmetrics/column_pairs/statistical/__init__.py
index c9175c95..81b20b21 100644
--- a/sdmetrics/column_pairs/statistical/__init__.py
+++ b/sdmetrics/column_pairs/statistical/__init__.py
@@ -1,11 +1,14 @@
 """Statistical Metrics to compare column pairs."""
 
+from sdmetrics.column_pairs.statistical.cardinality_boundary_adherence import (
+    CardinalityBoundaryAdherence)
 from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity
 from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
 from sdmetrics.column_pairs.statistical.kl_divergence import (
     ContinuousKLDivergence, DiscreteKLDivergence)
 
 __all__ = [
+    'CardinalityBoundaryAdherence',
     'ContingencySimilarity',
     'ContinuousKLDivergence',
     'CorrelationSimilarity',
diff --git a/sdmetrics/column_pairs/statistical/cardinality_boundary_adherence.py b/sdmetrics/column_pairs/statistical/cardinality_boundary_adherence.py
new file mode 100644
index 00000000..4129a760
--- /dev/null
+++ b/sdmetrics/column_pairs/statistical/cardinality_boundary_adherence.py
@@ -0,0 +1,80 @@
+"""CardinalityBoundaryAdherence module."""
+
+import pandas as pd
+
+from sdmetrics.column_pairs.base import ColumnPairsMetric
+from sdmetrics.goal import Goal
+
+
+class CardinalityBoundaryAdherence(ColumnPairsMetric):
+    """Cardinality Boundary Adherence metric.
+
+    Computes the percentage of synthetic parents whose cardinality
+    falls within the min/max range of cardinality in the real data.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'CardinalityBoundaryAdherence'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @staticmethod
+    def compute_breakdown(real_data, synthetic_data):
+        """Calculate the percentage of synthetic parents with cardinality in the correct range.
+
+        Args:
+            real_data (tuple(pd.Series, pd.Series)):
+                A tuple with the real primary key Series as the first element and real
+                foreign keys Series as the second element.
+            synthetic_data (tuple(pd.Series, pd.Series)):
+                A tuple with the synthetic primary key as the first element and synthetic
+                foreign keys as the second element.
+
+        Returns:
+            dict:
+                Metric output, with the score stored under the 'score' key.
+        """
+        real_cardinality = pd.DataFrame(index=real_data[0].copy())
+        real_cardinality['cardinality'] = real_data[1].value_counts()
+        real_cardinality = real_cardinality.fillna(0)
+        synthetic_cardinality = pd.DataFrame(index=synthetic_data[0].copy())
+        synthetic_cardinality['cardinality'] = synthetic_data[1].value_counts()
+        synthetic_cardinality = synthetic_cardinality.fillna(0)
+
+        min_cardinality = real_cardinality['cardinality'].min()
+        max_cardinality = real_cardinality['cardinality'].max()
+
+        valid_cardinality = sum(
+            synthetic_cardinality['cardinality'].between(
+                min_cardinality, max_cardinality))
+        score = valid_cardinality / len(synthetic_cardinality)
+
+        return {'score': score}
+
+    @classmethod
+    def compute(cls, real_data, synthetic_data):
+        """Calculate the percentage of synthetic parents with cardinality in the correct range.
+
+        Args:
+            real_data (tuple(pd.Series, pd.Series)):
+                A tuple with the real primary key Series as the first element and real
+                foreign keys Series as the second element.
+            synthetic_data (tuple(pd.Series, pd.Series)):
+                A tuple with the synthetic primary key as the first element and synthetic
+                foreign keys as the second element.
+
+        Returns:
+            float:
+                Metric output.
+        """
+        return cls.compute_breakdown(real_data, synthetic_data)['score']
diff --git a/tests/unit/column_pairs/statistical/test_cardinality_boundary_adherence.py b/tests/unit/column_pairs/statistical/test_cardinality_boundary_adherence.py
new file mode 100644
index 00000000..d4a98173
--- /dev/null
+++ b/tests/unit/column_pairs/statistical/test_cardinality_boundary_adherence.py
@@ -0,0 +1,43 @@
+
+import pandas as pd
+
+from sdmetrics.column_pairs.statistical import CardinalityBoundaryAdherence
+
+
+class TestCardinalityBoundaryAdherence:
+
+    def test_compute_breakdown(self):
+        """Test the ``compute_breakdown`` method."""
+        # Setup
+        real_parent_keys = pd.Series([1, 2, 3, 4, 5])
+        real_foreign_keys = pd.Series([1, 1, 2, 3, 4, 5, 5])
+        real_data = (real_parent_keys, real_foreign_keys)
+        synthetic_parent_keys = pd.Series([1, 2, 3, 4, 5])
+        synthetic_foreign_keys = pd.Series([2, 2, 2, 3, 4, 5])
+        synthetic_data = (synthetic_parent_keys, synthetic_foreign_keys)
+
+        metric = CardinalityBoundaryAdherence()
+
+        # Run
+        result = metric.compute_breakdown(real_data, synthetic_data)
+
+        # Assert
+        assert result == {'score': 0.6}
+
+    def test_compute(self):
+        """Test the ``compute`` method."""
+        # Setup
+        real_parent_keys = pd.Series([1, 2, 3, 4, 5])
+        real_foreign_keys = pd.Series([1, 1, 2, 3, 4, 5, 5])
+        real_data = (real_parent_keys, real_foreign_keys)
+        synthetic_parent_keys = pd.Series([1, 2, 3, 4, 5])
+        synthetic_foreign_keys = pd.Series([2, 2, 2, 3, 4, 5])
+        synthetic_data = (synthetic_parent_keys, synthetic_foreign_keys)
+
+        metric = CardinalityBoundaryAdherence()
+
+        # Run
+        result = metric.compute(real_data, synthetic_data)
+
+        # Assert
+        assert result == 0.6
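Usage note (illustrative, not part of the patch): the sketch below shows how the new metric is exercised end to end, reusing the same toy primary/foreign key Series as the unit tests above. Only the (primary_key, foreign_key) tuple convention and the CardinalityBoundaryAdherence API come from the patch itself; the data and expected score simply mirror test_compute.

    import pandas as pd

    from sdmetrics.column_pairs.statistical import CardinalityBoundaryAdherence

    # Real parents 1-5 each appear between 1 and 2 times as a foreign key,
    # so the cardinality range learned from the real data is [1, 2].
    real_data = (
        pd.Series([1, 2, 3, 4, 5]),        # real primary keys
        pd.Series([1, 1, 2, 3, 4, 5, 5]),  # real foreign keys
    )
    # Synthetic parent 1 has 0 children and parent 2 has 3, both outside
    # [1, 2]; parents 3, 4 and 5 adhere, giving 3 / 5 = 0.6.
    synthetic_data = (
        pd.Series([1, 2, 3, 4, 5]),        # synthetic primary keys
        pd.Series([2, 2, 2, 3, 4, 5]),     # synthetic foreign keys
    )

    metric = CardinalityBoundaryAdherence()
    print(metric.compute_breakdown(real_data, synthetic_data))  # {'score': 0.6}
    print(metric.compute(real_data, synthetic_data))            # 0.6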