Skip to content

Commit

Permalink
Update Referential Integrity metric to support NaNs in child column (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo authored Jun 24, 2024
1 parent 2463b24 commit 7d2f508
Showing 1 changed file with 33 additions and 9 deletions.
42 changes: 33 additions & 9 deletions tests/unit/column_pairs/statistical/test_referential_integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,32 +68,56 @@ def test_compute(self, compute_breakdown_mock):
assert result == 0.6

def test_compute_with_nan_foreign_keys_real_data(self):
"""Test the ``compute`` method with NaN foreign keys inside the real data."""
"""Test the ``compute`` method with NaN foreign keys inside the real data.
Here, the score should be 1.0, whether or not the synthetic data have NaN
values, as the real data have null foreign keys.
"""
# Setup
parent_keys = pd.Series(['a', 'b', 'c'])
foreign_keys = pd.Series(['a', 'a', 'b', 'c', np.nan])
real_fk = pd.Series(['a', 'a', 'b', 'c', np.nan])
synthetic_fk = pd.Series(['a', 'a', 'b', 'c', 'a'])
synthetic_fk_with_nan = pd.Series(['a', 'a', 'b', 'c', np.nan])
metric = ReferentialIntegrity()

# Run
result = metric.compute(
real_data=(parent_keys, foreign_keys), synthetic_data=(parent_keys, foreign_keys)
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synthetic_fk)
)
result_with_nan = metric.compute(
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synthetic_fk_with_nan)
)

# Assert
assert result == 1.0
assert result_with_nan == 1.0

def test_compute_with_nan_foreign_keys_only_synthetic_data(self):
"""Test the ``compute`` method with NaN foreign keys inside the synthetic data."""
"""Test the ``compute`` method with NaN foreign keys inside the synthetic data.
Here, the real data have no null foreign keys, so the score should decrease as
the number of NaN values in the synthetic data increases.
"""
# Setup
parent_keys = pd.Series(['a', 'b', 'c'])
foreign_keys = pd.Series(['a', 'a', 'b', 'c', 'a'])
synth_foreign_keys = pd.Series(['a', 'a', 'b', 'c', np.nan])
real_fk = pd.Series(['a', 'a', 'b', 'c', 'a'])
synth_fk_0_nan = pd.Series(['a', 'a', 'b', 'c'])
synth_fk_1_nan = pd.Series(['a', 'a', 'b', 'c', np.nan])
synth_fk_2_nan = pd.Series(['a', 'a', 'b', 'c', np.nan, np.nan])
metric = ReferentialIntegrity()

# Run
result = metric.compute(
real_data=(parent_keys, foreign_keys), synthetic_data=(parent_keys, synth_foreign_keys)
result_0 = metric.compute(
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_0_nan)
)
result_1 = metric.compute(
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_1_nan)
)
result_2 = metric.compute(
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_2_nan)
)

# Assert
assert result == 0.8
assert result_0 == 1.0
assert result_1 == 0.8
assert result_2 == 2 / 3

0 comments on commit 7d2f508

Please sign in to comment.