diff --git a/pyproject.toml b/pyproject.toml index f95571d9..f0e87e4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,9 @@ dependencies = [ "scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'", "scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'", "scikit-learn>=1.3.1;python_version>='3.12'", - "scipy>=1.7.3;python_version<'3.10'", - "scipy>=1.9.2;python_version>='3.10' and python_version<'3.12'", - "scipy>=1.12.0;python_version>='3.12'", + "scipy>=1.7.3,<1.14.0;python_version<'3.10'", + "scipy>=1.9.2,<1.14.0;python_version>='3.10' and python_version<'3.12'", + "scipy>=1.12.0,<1.14.0;python_version>='3.12'", 'copulas>=0.11.0', 'tqdm>=4.29', 'plotly>=5.19.0', diff --git a/tests/unit/column_pairs/statistical/test_referential_integrity.py b/tests/unit/column_pairs/statistical/test_referential_integrity.py index 6214542c..a68cc77d 100644 --- a/tests/unit/column_pairs/statistical/test_referential_integrity.py +++ b/tests/unit/column_pairs/statistical/test_referential_integrity.py @@ -68,32 +68,56 @@ def test_compute(self, compute_breakdown_mock): assert result == 0.6 def test_compute_with_nan_foreign_keys_real_data(self): - """Test the ``compute`` method with NaN foreign keys inside the real data.""" + """Test the ``compute`` method with NaN foreign keys inside the real data. + + Here, the score should be 1.0, whether or not the synthetic data have NaN + values, as the real data have null foreign keys.
+ """ # Setup parent_keys = pd.Series(['a', 'b', 'c']) - foreign_keys = pd.Series(['a', 'a', 'b', 'c', np.nan]) + real_fk = pd.Series(['a', 'a', 'b', 'c', np.nan]) + synthetic_fk = pd.Series(['a', 'a', 'b', 'c', 'a']) + synthetic_fk_with_nan = pd.Series(['a', 'a', 'b', 'c', np.nan]) metric = ReferentialIntegrity() # Run result = metric.compute( - real_data=(parent_keys, foreign_keys), synthetic_data=(parent_keys, foreign_keys) + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synthetic_fk) + ) + result_with_nan = metric.compute( + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synthetic_fk_with_nan) ) # Assert assert result == 1.0 + assert result_with_nan == 1.0 def test_compute_with_nan_foreign_keys_only_synthetic_data(self): - """Test the ``compute`` method with NaN foreign keys inside the synthetic data.""" + """Test the ``compute`` method with NaN foreign keys inside the synthetic data. + + Here, the real data have no null foreign keys, so the score should decrease as + the number of NaN values in the synthetic data increases. 
+ """ # Setup parent_keys = pd.Series(['a', 'b', 'c']) - foreign_keys = pd.Series(['a', 'a', 'b', 'c', 'a']) - synth_foreign_keys = pd.Series(['a', 'a', 'b', 'c', np.nan]) + real_fk = pd.Series(['a', 'a', 'b', 'c', 'a']) + synth_fk_0_nan = pd.Series(['a', 'a', 'b', 'c']) + synth_fk_1_nan = pd.Series(['a', 'a', 'b', 'c', np.nan]) + synth_fk_2_nan = pd.Series(['a', 'a', 'b', 'c', np.nan, np.nan]) metric = ReferentialIntegrity() # Run - result = metric.compute( - real_data=(parent_keys, foreign_keys), synthetic_data=(parent_keys, synth_foreign_keys) + result_0 = metric.compute( + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_0_nan) + ) + result_1 = metric.compute( + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_1_nan) + ) + result_2 = metric.compute( + real_data=(parent_keys, real_fk), synthetic_data=(parent_keys, synth_fk_2_nan) ) # Assert - assert result == 0.8 + assert result_0 == 1.0 + assert result_1 == 0.8 + assert result_2 == 2 / 3