From 7d2f508f9dad7f42594af26f18f4ab337e03004b Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Mon, 24 Jun 2024 09:51:32 +0100 Subject: [PATCH] Update Referential Integrity metric to support NaNs in child column (#598) --- .../statistical/test_referential_integrity.py | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/tests/unit/column_pairs/statistical/test_referential_integrity.py b/tests/unit/column_pairs/statistical/test_referential_integrity.py index 6214542c..a68cc77d 100644 --- a/tests/unit/column_pairs/statistical/test_referential_integrity.py +++ b/tests/unit/column_pairs/statistical/test_referential_integrity.py @@ -68,32 +68,56 @@ def test_compute(self, compute_breakdown_mock): assert result == 0.6 def test_compute_with_nan_foreign_keys_real_data(self): - """Test the ``compute`` method with NaN foreign keys inside the real data.""" + """Test the ``compute`` method with NaN foreign keys inside the real data. + + Here, the score should be 1.0, whether or not the synthetic data have NaN values, + as the real data have null foreign keys. 
+ """ # Setup parent_keys = pd.Series(['a', 'b', 'c']) - foreign_keys = pd.Series(['a', 'a', 'b', 'c', np.nan]) + real_fk = pd.Series(['a', 'a', 'b', 'c', np.nan]) + synthetic_fk = pd.Series(['a', 'a', 'b', 'c', 'a']) + synthetic_fk_with_nan = pd.Series(['a', 'a', 'b', 'c', np.nan]) metric = ReferentialIntegrity() # Run result = metric.compute( - real_data=(parent_keys, foreign_keys), synthetic_data=(parent_keys, foreign_keys) + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synthetic_fk) + ) + result_with_nan = metric.compute( + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synthetic_fk_with_nan) ) # Assert assert result == 1.0 + assert result_with_nan == 1.0 def test_compute_with_nan_foreign_keys_only_synthetic_data(self): - """Test the ``compute`` method with NaN foreign keys inside the synthetic data.""" + """Test the ``compute`` method with NaN foreign keys inside the synthetic data. + + Here, the real data have no null foreign keys, so the score should decrease as + the number of NaN values in the synthetic data increases. 
+ """ # Setup parent_keys = pd.Series(['a', 'b', 'c']) - foreign_keys = pd.Series(['a', 'a', 'b', 'c', 'a']) - synth_foreign_keys = pd.Series(['a', 'a', 'b', 'c', np.nan]) + real_fk = pd.Series(['a', 'a', 'b', 'c', 'a']) + synth_fk_0_nan = pd.Series(['a', 'a', 'b', 'c']) + synth_fk_1_nan = pd.Series(['a', 'a', 'b', 'c', np.nan]) + synth_fk_2_nan = pd.Series(['a', 'a', 'b', 'c', np.nan, np.nan]) metric = ReferentialIntegrity() # Run - result = metric.compute( - real_data=(parent_keys, foreign_keys), synthetic_data=(parent_keys, synth_foreign_keys) + result_0 = metric.compute( + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_0_nan) + ) + result_1 = metric.compute( + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_1_nan) + ) + result_2 = metric.compute( + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_2_nan) ) # Assert - assert result == 0.8 + assert result_0 == 1.0 + assert result_1 == 0.8 + assert result_2 == 2 / 3