diff --git a/sdmetrics/single_column/statistical/boundary_adherence.py b/sdmetrics/single_column/statistical/boundary_adherence.py index 2646b24e..3dc74f0b 100644 --- a/sdmetrics/single_column/statistical/boundary_adherence.py +++ b/sdmetrics/single_column/statistical/boundary_adherence.py @@ -43,8 +43,11 @@ def compute(cls, real_data, synthetic_data): float: The boundary adherence of the two columns. """ - real_data = pd.Series(real_data).dropna() - synthetic_data = pd.Series(synthetic_data).dropna() + real_data = pd.Series(real_data) + synthetic_data = pd.Series(synthetic_data) + if any(pd.isna(real_data)): + real_data = real_data.dropna() + synthetic_data = synthetic_data.dropna() if is_datetime(real_data): real_data = pd.to_numeric(real_data) diff --git a/tests/unit/single_column/statistical/test_boundary_adherence.py b/tests/unit/single_column/statistical/test_boundary_adherence.py index 4ef240bf..828a7730 100644 --- a/tests/unit/single_column/statistical/test_boundary_adherence.py +++ b/tests/unit/single_column/statistical/test_boundary_adherence.py @@ -1,5 +1,7 @@ +from datetime import datetime from unittest.mock import patch +import numpy as np import pandas as pd from sdmetrics.single_column.statistical import BoundaryAdherence @@ -31,6 +33,65 @@ def test_compute(self): # Assert assert result == 0.75 + def test_compute_nans(self): + """Test the ``compute`` method with nan values. + + Expect that the nan values in synthetic data are considered as + out of bounds if the real data does not also contain nan values. 
+ """ + # Setup + real_data = pd.Series([1.0, 2.4, 2.6, 0.8]) # 0.8 -> 2.6 + real_data_nans = pd.Series([1.0, 2.4, 2.6, 0.8, np.nan]) + synthetic_data = pd.Series([0.9, 1.8, 2.1, 5.0, np.nan]) + + metric = BoundaryAdherence() + + # Run + result = metric.compute(real_data, synthetic_data) + result_ignore_nans = metric.compute(real_data_nans, synthetic_data) + + # Assert + assert result == 0.6 + assert result_ignore_nans == 0.75 + + def test_compute_datetime_nans(self): + """Test the ``compute`` method with datetime data containing NaT values. + + Expect that the NaT values in synthetic data are considered as + out of bounds if the real data does not also contain NaT values. + """ + # Setup + real_data = pd.Series([ + datetime(2020, 10, 1), + datetime(2021, 1, 2), + datetime(2021, 9, 12), + datetime(2022, 10, 1), + + ], dtype='datetime64[ns]') # 2020-10-01 -> 2022-10-01 + real_data_nans = pd.Series([ + datetime(2020, 10, 1), + datetime(2021, 1, 2), + datetime(2021, 9, 12), + datetime(2022, 10, 1), + pd.NaT + ], dtype='datetime64[ns]') + synthetic_data = pd.Series([ + datetime(2020, 11, 1), + datetime(2021, 1, 2), + datetime(2021, 2, 9), + pd.NaT, + ], dtype='datetime64[ns]') + + metric = BoundaryAdherence() + + # Run + result = metric.compute(real_data, synthetic_data) + result_ignore_nans = metric.compute(real_data_nans, synthetic_data) + + # Assert + assert result == 0.75 + assert result_ignore_nans == 1 + @patch('sdmetrics.single_column.statistical.boundary_adherence.SingleColumnMetric.normalize') def test_normalize(self, normalize_mock): """Test the ``normalize`` method.