Skip to content

Commit

Permalink
fix boundary_adherence nan handling
Browse files Browse the repository at this point in the history
  • Loading branch information
frances-h committed Oct 24, 2023
1 parent 99cb1e4 commit 5a5fef4
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 2 deletions.
5 changes: 3 additions & 2 deletions sdmetrics/single_column/statistical/boundary_adherence.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,9 @@ def compute(cls, real_data, synthetic_data):
float:
The boundary adherence of the two columns.
"""
real_data = pd.Series(real_data).dropna()
synthetic_data = pd.Series(synthetic_data).dropna()
if any(pd.isna(real_data)):
real_data = pd.Series(real_data).dropna()
synthetic_data = pd.Series(synthetic_data).dropna()

if is_datetime(real_data):
real_data = pd.to_numeric(real_data)
Expand Down
61 changes: 61 additions & 0 deletions tests/unit/single_column/statistical/test_boundary_adherence.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from datetime import datetime
from unittest.mock import patch

import numpy as np
import pandas as pd

from sdmetrics.single_column.statistical import BoundaryAdherence
Expand Down Expand Up @@ -31,6 +33,65 @@ def test_compute(self):
# Assert
assert result == 0.75

def test_compute_nans(self):
    """Test the ``compute`` method when the columns contain NaN values.

    Two scenarios are exercised against the same synthetic column:

    * The real column has no NaN, so the NaN in the synthetic column is
      expected to count as out of bounds.
    * The real column also contains a NaN, so the score is expected to
      match the one obtained when the synthetic NaN row is ignored.
    """
    # Setup
    observed = [1.0, 2.4, 2.6, 0.8]  # observed bounds: 0.8 -> 2.6
    real_data = pd.Series(observed)
    real_data_nans = pd.Series([*observed, np.nan])
    synthetic_data = pd.Series([0.9, 1.8, 2.1, 5.0, np.nan])

    # Run
    metric = BoundaryAdherence()
    result = metric.compute(real_data, synthetic_data)
    result_ignore_nans = metric.compute(real_data_nans, synthetic_data)

    # Assert
    # 3 of the 5 synthetic rows are in bounds (5.0 and NaN are not).
    assert result == 0.6
    # 3 of the 4 non-NaN synthetic rows are in bounds (only 5.0 is not).
    assert result_ignore_nans == 0.75

def test_compute_datetime_nans(self):
    """Test the ``compute`` method when datetime columns contain ``NaT``.

    Two scenarios are exercised against the same synthetic column:

    * The real column has no ``NaT``, so the ``NaT`` in the synthetic
      column is expected to count as out of bounds.
    * The real column also contains a ``NaT``, so the score is expected
      to match the one obtained when the synthetic ``NaT`` row is ignored.
    """
    # Setup
    # Observed bounds: 2020-10-01 -> 2022-10-01
    real_dates = [
        datetime(2020, 10, 1),
        datetime(2021, 1, 2),
        datetime(2021, 9, 12),
        datetime(2022, 10, 1),
    ]
    real_data = pd.Series(real_dates, dtype='datetime64[ns]')
    real_data_nans = pd.Series([*real_dates, pd.NaT], dtype='datetime64[ns]')
    synthetic_data = pd.Series(
        [
            datetime(2020, 11, 1),
            datetime(2021, 1, 2),
            datetime(2021, 2, 9),
            pd.NaT,
        ],
        dtype='datetime64[ns]',
    )

    # Run
    metric = BoundaryAdherence()
    result = metric.compute(real_data, synthetic_data)
    result_ignore_nans = metric.compute(real_data_nans, synthetic_data)

    # Assert
    # 3 of the 4 synthetic rows are in bounds (the NaT row is not).
    assert result == 0.75
    # All 3 non-NaT synthetic rows are in bounds.
    assert result_ignore_nans == 1

@patch('sdmetrics.single_column.statistical.boundary_adherence.SingleColumnMetric.normalize')
def test_normalize(self, normalize_mock):
"""Test the ``normalize`` method.
Expand Down

0 comments on commit 5a5fef4

Please sign in to comment.