From 0245e95f5189b7d7f9228daaa0d3c9913937471a Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Thu, 26 Oct 2023 08:13:55 -0600 Subject: [PATCH] Add `ReferentialIntegrity` metric (#480) --- sdmetrics/column_pairs/__init__.py | 2 + .../column_pairs/statistical/__init__.py | 2 + .../statistical/referential_integrity.py | 70 ++++++++++++++++++ .../statistical/test_referential_integrity.py | 72 +++++++++++++++++++ 4 files changed, 146 insertions(+) create mode 100644 sdmetrics/column_pairs/statistical/referential_integrity.py create mode 100644 tests/unit/column_pairs/statistical/test_referential_integrity.py diff --git a/sdmetrics/column_pairs/__init__.py b/sdmetrics/column_pairs/__init__.py index efa83ac8..a3792f2c 100644 --- a/sdmetrics/column_pairs/__init__.py +++ b/sdmetrics/column_pairs/__init__.py @@ -5,6 +5,7 @@ from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity from sdmetrics.column_pairs.statistical.kl_divergence import ( ContinuousKLDivergence, DiscreteKLDivergence) +from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity __all__ = [ 'ColumnPairsMetric', @@ -12,4 +13,5 @@ 'ContinuousKLDivergence', 'CorrelationSimilarity', 'DiscreteKLDivergence', + 'ReferentialIntegrity', ] diff --git a/sdmetrics/column_pairs/statistical/__init__.py b/sdmetrics/column_pairs/statistical/__init__.py index c9175c95..d13b7872 100644 --- a/sdmetrics/column_pairs/statistical/__init__.py +++ b/sdmetrics/column_pairs/statistical/__init__.py @@ -4,10 +4,12 @@ from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity from sdmetrics.column_pairs.statistical.kl_divergence import ( ContinuousKLDivergence, DiscreteKLDivergence) +from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity __all__ = [ 'ContingencySimilarity', 'ContinuousKLDivergence', 'CorrelationSimilarity', 'DiscreteKLDivergence', + 
'ReferentialIntegrity', ] diff --git a/sdmetrics/column_pairs/statistical/referential_integrity.py b/sdmetrics/column_pairs/statistical/referential_integrity.py new file mode 100644 index 00000000..096bc239 --- /dev/null +++ b/sdmetrics/column_pairs/statistical/referential_integrity.py @@ -0,0 +1,70 @@ +"""Referential Integrity Metric.""" +import logging + +from sdmetrics.column_pairs.base import ColumnPairsMetric +from sdmetrics.goal import Goal + +LOGGER = logging.getLogger(__name__) + + +class ReferentialIntegrity(ColumnPairsMetric): + """Referential Integrity metric. + + Compute the fraction of foreign key values that reference a value in the primary key column + in the synthetic data. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'ReferentialIntegrity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @classmethod + def compute_breakdown(cls, real_data, synthetic_data): + """Compute the score breakdown of the referential integrity metric. + + Args: + real_data (tuple of 2 pandas.Series): + (primary_key, foreign_key) columns from the real data. + synthetic_data (tuple of 2 pandas.Series): + (primary_key, foreign_key) columns from the synthetic data. + + Returns: + dict: + The score breakdown of the referential integrity metric. + """ + missing_parents = not real_data[1].isin(real_data[0]).all() + if missing_parents: + LOGGER.info( + "The real data has foreign keys that don't reference any primary key." + ) + + score = synthetic_data[1].isin(synthetic_data[0]).mean() + + return {'score': score} + + @classmethod + def compute(cls, real_data, synthetic_data): + """Compute the referential integrity of two columns. 
+ + Args: + real_data (tuple of 2 pandas.Series): + (primary_key, foreign_key) columns from the real data. + synthetic_data (tuple of 2 pandas.Series): + (primary_key, foreign_key) columns from the synthetic data. + + Returns: + float: + The referential integrity of the two columns. + """ + return cls.compute_breakdown(real_data, synthetic_data)['score'] diff --git a/tests/unit/column_pairs/statistical/test_referential_integrity.py b/tests/unit/column_pairs/statistical/test_referential_integrity.py new file mode 100644 index 00000000..f6fffb0a --- /dev/null +++ b/tests/unit/column_pairs/statistical/test_referential_integrity.py @@ -0,0 +1,72 @@ +from unittest.mock import patch + +import pandas as pd + +from sdmetrics.column_pairs.statistical import ReferentialIntegrity + + +class TestReferentialIntegrity: + + def test_compute_breakdown(self): + """Test the ``compute_breakdown`` method.""" + # Setup + real_data = pd.DataFrame({ + 'primary_key': [1, 2, 3, 4, 5], + 'foreign_key': [1, 2, 3, 2, 1] + }) + synthetic_data = pd.DataFrame({ + 'primary_key': [1, 2, 3, 4, 5], + 'foreign_key': [1, 6, 3, 4, 5] + }) + + metric = ReferentialIntegrity() + tuple_real = (real_data['primary_key'], real_data['foreign_key']) + tuple_synthetic = (synthetic_data['primary_key'], synthetic_data['foreign_key']) + + # Run + result = metric.compute_breakdown(tuple_real, tuple_synthetic) + + # Assert + assert result == {'score': 0.8} + + @patch('sdmetrics.column_pairs.statistical.referential_integrity.LOGGER') + def test_compute_breakdown_with_missing_relations_real_data(self, logger_mock): + """Test the ``compute_breakdown`` when there are missing relationships in the real data.""" + # Setup + real_data = pd.DataFrame({ + 'primary_key': [1, 2, 3, 4, 5], + 'foreign_key': [1, 2, 6, 2, 1] + }) + synthetic_data = pd.DataFrame({ + 'primary_key': [1, 2, 3, 4, 5], + 'foreign_key': [1, 6, 3, 4, 5] + }) + + metric = ReferentialIntegrity() + tuple_real = (real_data['primary_key'], real_data['foreign_key']) + 
tuple_synthetic = (synthetic_data['primary_key'], synthetic_data['foreign_key']) + + # Run + result = metric.compute_breakdown(tuple_real, tuple_synthetic) + + # Assert + expected_message = "The real data has foreign keys that don't reference any primary key." + assert result == {'score': 0.8} + logger_mock.info.assert_called_once_with(expected_message) + + @patch('sdmetrics.column_pairs.statistical.referential_integrity.' + 'ReferentialIntegrity.compute_breakdown') + def test_compute(self, compute_breakdown_mock): + """Test the ``compute`` method.""" + # Setup + real_data = pd.Series(['A', 'B', 'C', 'B', 'A']) + synthetic_data = pd.Series(['A', 'B', 'C', 'D', 'E']) + metric = ReferentialIntegrity() + compute_breakdown_mock.return_value = {'score': 0.6} + + # Run + result = metric.compute(real_data, synthetic_data) + + # Assert + compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data) + assert result == 0.6