Add ReferentialIntegrity metric (#480)

sdv-dev · Oct 26, 2023 · 0245e95 · 0245e95
1 parent 99cb1e4
commit 0245e95
Show file tree

Hide file tree

Showing 4 changed files with 146 additions and 0 deletions.
diff --git a/sdmetrics/column_pairs/__init__.py b/sdmetrics/column_pairs/__init__.py
@@ -5,11 +5,13 @@
 from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
 from sdmetrics.column_pairs.statistical.kl_divergence import (
     ContinuousKLDivergence, DiscreteKLDivergence)
+from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity
 
 __all__ = [
     'ColumnPairsMetric',
     'ContingencySimilarity',
     'ContinuousKLDivergence',
     'CorrelationSimilarity',
     'DiscreteKLDivergence',
+    'ReferentialIntegrity',
 ]
diff --git a/sdmetrics/column_pairs/statistical/__init__.py b/sdmetrics/column_pairs/statistical/__init__.py
@@ -4,10 +4,12 @@
 from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
 from sdmetrics.column_pairs.statistical.kl_divergence import (
     ContinuousKLDivergence, DiscreteKLDivergence)
+from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity
 
 __all__ = [
     'ContingencySimilarity',
     'ContinuousKLDivergence',
     'CorrelationSimilarity',
     'DiscreteKLDivergence',
+    'ReferentialIntegrity',
 ]
diff --git a/sdmetrics/column_pairs/statistical/referential_integrity.py b/sdmetrics/column_pairs/statistical/referential_integrity.py
@@ -0,0 +1,70 @@
+"""Referential Integrity Metric."""
+import logging
+
+from sdmetrics.column_pairs.base import ColumnPairsMetric
+from sdmetrics.goal import Goal
+
+LOGGER = logging.getLogger(__name__)
+
+
+class ReferentialIntegrity(ColumnPairsMetric):
+    """Referential Integrity metric.
+
+    Compute the fraction of foreign key values that reference a value in the primary key column
+    in the synthetic data.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'ReferentialIntegrity'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @classmethod
+    def compute_breakdown(cls, real_data, synthetic_data):
+        """Compute the score breakdown of the referential integrity metric.
+
+        Args:
+            real_data (tuple of 2 pandas.Series):
+                (primary_key, foreign_key) columns from the real data.
+            synthetic_data (tuple of 2 pandas.Series):
+                (primary_key, foreign_key) columns from the synthetic data.
+
+        Returns:
+            dict:
+                The score breakdown of the referential integrity metric.
+        """
+        missing_parents = not real_data[1].isin(real_data[0]).all()
+        if missing_parents:
+            LOGGER.info(
+                "The real data has foreign keys that don't reference any primary key."
+            )
+
+        score = synthetic_data[1].isin(synthetic_data[0]).mean()
+
+        return {'score': score}
+
+    @classmethod
+    def compute(cls, real_data, synthetic_data):
+        """Compute the referential integrity of two columns.
+
+        Args:
+            real_data (tuple of 2 pandas.Series):
+                (primary_key, foreign_key) columns from the real data.
+            synthetic_data (tuple of 2 pandas.Series):
+                (primary_key, foreign_key) columns from the synthetic data.
+
+        Returns:
+            float:
+                The referential integrity score of the two columns.
+        """
+        return cls.compute_breakdown(real_data, synthetic_data)['score']
diff --git a/tests/unit/column_pairs/statistical/test_referential_integrity.py b/tests/unit/column_pairs/statistical/test_referential_integrity.py
@@ -0,0 +1,72 @@
+from unittest.mock import patch
+
+import pandas as pd
+
+from sdmetrics.column_pairs.statistical import ReferentialIntegrity
+
+
+class TestReferentialIntegrity:
+
+    def test_compute_breakdown(self):
+        """Test the ``compute_breakdown`` method."""
+        # Setup
+        real_data = pd.DataFrame({
+            'primary_key': [1, 2, 3, 4, 5],
+            'foreign_key': [1, 2, 3, 2, 1]
+        })
+        synthetic_data = pd.DataFrame({
+            'primary_key': [1, 2, 3, 4, 5],
+            'foreign_key': [1, 6, 3, 4, 5]
+        })
+
+        metric = ReferentialIntegrity()
+        tuple_real = (real_data['primary_key'], real_data['foreign_key'])
+        tuple_synthetic = (synthetic_data['primary_key'], synthetic_data['foreign_key'])
+
+        # Run
+        result = metric.compute_breakdown(tuple_real, tuple_synthetic)
+
+        # Assert
+        assert result == {'score': 0.8}
+
+    @patch('sdmetrics.column_pairs.statistical.referential_integrity.LOGGER')
+    def test_compute_breakdown_with_missing_relations_real_data(self, logger_mock):
+        """Test ``compute_breakdown`` when there are missing relationships in the real data."""
+        # Setup
+        real_data = pd.DataFrame({
+            'primary_key': [1, 2, 3, 4, 5],
+            'foreign_key': [1, 2, 6, 2, 1]
+        })
+        synthetic_data = pd.DataFrame({
+            'primary_key': [1, 2, 3, 4, 5],
+            'foreign_key': [1, 6, 3, 4, 5]
+        })
+
+        metric = ReferentialIntegrity()
+        tuple_real = (real_data['primary_key'], real_data['foreign_key'])
+        tuple_synthetic = (synthetic_data['primary_key'], synthetic_data['foreign_key'])
+
+        # Run
+        result = metric.compute_breakdown(tuple_real, tuple_synthetic)
+
+        # Assert
+        expected_message = "The real data has foreign keys that don't reference any primary key."
+        assert result == {'score': 0.8}
+        logger_mock.info.assert_called_once_with(expected_message)
+
+    @patch('sdmetrics.column_pairs.statistical.referential_integrity.'
+           'ReferentialIntegrity.compute_breakdown')
+    def test_compute(self, compute_breakdown_mock):
+        """Test the ``compute`` method."""
+        # Setup
+        real_data = pd.Series(['A', 'B', 'C', 'B', 'A'])
+        synthetic_data = pd.Series(['A', 'B', 'C', 'D', 'E'])
+        metric = ReferentialIntegrity()
+        compute_breakdown_mock.return_value = {'score': 0.6}
+
+        # Run
+        result = metric.compute(real_data, synthetic_data)
+
+        # Assert
+        compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data)
+        assert result == 0.6