Skip to content

Commit

Permalink
Merge branch 'issue_581_get_column_plot' into issue_581_get_cardinali…
Browse files Browse the repository at this point in the history
…ty_plot
  • Loading branch information
lajohn4747 committed Jun 25, 2024
2 parents a93c05f + 990eb74 commit 8f5a9fb
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 12 deletions.
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ dependencies = [
"scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'",
"scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'",
"scikit-learn>=1.3.1;python_version>='3.12'",
"scipy>=1.7.3;python_version<'3.10'",
"scipy>=1.9.2;python_version>='3.10' and python_version<'3.12'",
"scipy>=1.12.0;python_version>='3.12'",
"scipy>=1.7.3,<1.14.0;python_version<'3.10'",
"scipy>=1.9.2,<1.14.0;python_version>='3.10' and python_version<'3.12'",
"scipy>=1.12.0,<1.14.0;python_version>='3.12'",
'copulas>=0.11.0',
'tqdm>=4.29',
'plotly>=5.19.0',
Expand Down
42 changes: 33 additions & 9 deletions tests/unit/column_pairs/statistical/test_referential_integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,32 +68,56 @@ def test_compute(self, compute_breakdown_mock):
assert result == 0.6

def test_compute_with_nan_foreign_keys_real_data(self):
"""Test the ``compute`` method with NaN foreign keys inside the real data."""
"""Test the ``compute`` method with NaN foreign keys inside the real data.
Here, the score should be 1.0, whether or not the synthetic data have NaN
values, as the real data have null foreign keys.
"""
# Setup
parent_keys = pd.Series(['a', 'b', 'c'])
foreign_keys = pd.Series(['a', 'a', 'b', 'c', np.nan])
real_fk = pd.Series(['a', 'a', 'b', 'c', np.nan])
synthetic_fk = pd.Series(['a', 'a', 'b', 'c', 'a'])
synthetic_fk_with_nan = pd.Series(['a', 'a', 'b', 'c', np.nan])
metric = ReferentialIntegrity()

# Run
result = metric.compute(
real_data=(parent_keys, foreign_keys), synthetic_data=(parent_keys, foreign_keys)
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synthetic_fk)
)
result_with_nan = metric.compute(
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synthetic_fk_with_nan)
)

# Assert
assert result == 1.0
assert result_with_nan == 1.0

def test_compute_with_nan_foreign_keys_only_synthetic_data(self):
"""Test the ``compute`` method with NaN foreign keys inside the synthetic data."""
"""Test the ``compute`` method with NaN foreign keys inside the synthetic data.
Here, the real data have no null foreign keys, so the score should decrease as
the number of NaN values in the synthetic data increases.
"""
# Setup
parent_keys = pd.Series(['a', 'b', 'c'])
foreign_keys = pd.Series(['a', 'a', 'b', 'c', 'a'])
synth_foreign_keys = pd.Series(['a', 'a', 'b', 'c', np.nan])
real_fk = pd.Series(['a', 'a', 'b', 'c', 'a'])
synth_fk_0_nan = pd.Series(['a', 'a', 'b', 'c'])
synth_fk_1_nan = pd.Series(['a', 'a', 'b', 'c', np.nan])
synth_fk_2_nan = pd.Series(['a', 'a', 'b', 'c', np.nan, np.nan])
metric = ReferentialIntegrity()

# Run
result = metric.compute(
real_data=(parent_keys, foreign_keys), synthetic_data=(parent_keys, synth_foreign_keys)
result_0 = metric.compute(
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_0_nan)
)
result_1 = metric.compute(
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_1_nan)
)
result_2 = metric.compute(
real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_2_nan)
)

# Assert
assert result == 0.8
assert result_0 == 1.0
assert result_1 == 0.8
assert result_2 == 2 / 3

0 comments on commit 8f5a9fb

Please sign in to comment.