Skip to content

Commit

Permalink
Merge branch 'main' into issue-652-kscomplement
Browse files Browse the repository at this point in the history
  • Loading branch information
fealho authored Nov 25, 2024
2 parents 3700419 + d2cb918 commit e7d49ea
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 46 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,12 @@ jobs:
python -m pip install invoke .[test]
- name: Run integration tests
run: invoke integration

- if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.12
name: Upload integration codecov report
uses: codecov/codecov-action@v4
with:
flags: integration
file: ${{ github.workspace }}/integration_cov.xml
fail_ci_if_error: true
token: ${{ secrets.CODECOV_TOKEN }}
10 changes: 8 additions & 2 deletions .github/workflows/unit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ jobs:
python -m pip install invoke .[test]
- name: Run unit tests
run: invoke unit
- if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.8
name: Upload codecov report

- if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.12
name: Upload unit codecov report
uses: codecov/codecov-action@v4
with:
flags: unit
file: ${{ github.workspace }}/unit_cov.xml
fail_ci_if_error: true
token: ${{ secrets.CODECOV_TOKEN }}
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ coverage.xml
*.cover
.hypothesis/
.pytest_cache/
*_cov.xml

# Translations
*.mo
Expand Down Expand Up @@ -108,4 +109,3 @@ ENV/

# OS Files
.DS_Store

2 changes: 1 addition & 1 deletion latest_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ pandas==2.2.3
plotly==5.24.1
scikit-learn==1.5.2
scipy==1.13.1
tqdm==4.67.0
tqdm==4.67.1
99 changes: 59 additions & 40 deletions sdmetrics/column_pairs/statistical/inter_row_msas.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,61 @@ class InterRowMSAS:
max_value = 1.0

@staticmethod
def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False):
def _validate_inputs(real_data, synthetic_data, n_rows_diff, apply_log):
    """Validate the arguments received by ``compute``.

    Args:
        real_data (tuple[pandas.Series, pandas.Series]):
            Two-element tuple of pandas Series for the real data.
        synthetic_data (tuple[pandas.Series, pandas.Series]):
            Two-element tuple of pandas Series for the synthetic data.
        n_rows_diff (int):
            Row lag used for the differences. Must be an integer >= 1.
        apply_log (bool):
            Whether the log is applied to the values. Must be a boolean.

    Raises:
        ValueError:
            If either data argument is not a tuple of two pandas Series, if
            ``n_rows_diff`` is not an integer greater than zero, or if
            ``apply_log`` is not a boolean.
    """
    for data in (real_data, synthetic_data):
        is_pair = isinstance(data, tuple) and len(data) == 2
        if not is_pair or not all(isinstance(item, pd.Series) for item in data):
            raise ValueError('The data must be a tuple of two pandas series.')

    # ``bool`` is a subclass of ``int``: without the explicit check ``True``
    # would be silently accepted and treated as ``n_rows_diff=1``.
    is_integer = isinstance(n_rows_diff, int) and not isinstance(n_rows_diff, bool)
    if not is_integer or n_rows_diff < 1:
        raise ValueError("'n_rows_diff' must be an integer greater than zero.")

    if not isinstance(apply_log, bool):
        raise ValueError("'apply_log' must be a boolean.")

@staticmethod
def _apply_log(real_values, synthetic_values, apply_log):
    """Optionally apply the natural logarithm to both value series.

    Args:
        real_values (pandas.Series):
            Values of the real data.
        synthetic_values (pandas.Series):
            Values of the synthetic data.
        apply_log (bool):
            If ``False`` the inputs are returned untouched.

    Returns:
        tuple[pandas.Series, pandas.Series]:
            The (possibly log-transformed) real and synthetic values.

    Warns:
        UserWarning:
            If ``apply_log`` is ``True`` and any value is non-positive, since
            the log of such values is not a finite number.
    """
    if not apply_log:
        return real_values, synthetic_values

    # Count non-positive entries with a single vectorized comparison instead
    # of iterating over every element in Python.
    num_invalid = int((pd.concat((real_values, synthetic_values)) <= 0).sum())
    if num_invalid:
        warnings.warn(
            f'There are {num_invalid} non-positive values in your data, which cannot be '
            "used with log. Consider changing 'apply_log' to False for a better result."
        )

    # The user has already been told about the invalid values above, so the
    # numpy runtime warnings raised by the log itself are silenced.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='.*encountered in log')
        real_values = np.log(real_values)
        synthetic_values = np.log(synthetic_values)

    return real_values, synthetic_values

@staticmethod
def _calculate_differences(keys, values, n_rows_diff, data_name):
    """Compute the mean lagged difference of every sequence.

    The values are grouped by ``keys``. For each group the result is the mean
    of ``value[i + n_rows_diff] - value[i]``; groups with ``n_rows_diff`` rows
    or fewer yield ``NaN``.

    Args:
        keys (pandas.Series):
            Sequence key of every row.
        values (pandas.Series):
            Value of every row.
        n_rows_diff (int):
            Number of rows separating the two values that are subtracted.
        data_name (str):
            Name of the dataset, used only in the warning message.

    Returns:
        pandas.Series:
            One mean difference per sequence key.

    Warns:
        UserWarning:
            If at least one sequence has ``n_rows_diff`` rows or fewer.
    """
    sequences = values.groupby(keys)
    lengths = sequences.size()

    short_count = len(lengths[lengths <= n_rows_diff])
    if short_count > 0:
        warnings.warn(
            f"n_rows_diff '{n_rows_diff}' is greater than the "
            f'size of {short_count} sequence keys in {data_name}.'
        )

    def mean_lagged_difference(sequence):
        # Sequences that are too short have no pair of rows to subtract.
        if len(sequence) > n_rows_diff:
            array = sequence.to_numpy()
            lagged = array[n_rows_diff:] - array[:-n_rows_diff]
            return np.mean(lagged)
        return np.nan

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='invalid value encountered in.*')
        return sequences.apply(mean_lagged_difference)

@classmethod
def compute(cls, real_data, synthetic_data, n_rows_diff=1, apply_log=False):
"""Compute this metric.
This metric compares the inter-row differences of sequences in the real data
Expand Down Expand Up @@ -58,48 +112,13 @@ def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False):
float:
The similarity score between the real and synthetic data distributions.
"""
for data in [real_data, synthetic_data]:
if (
not isinstance(data, tuple)
or len(data) != 2
or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series)))
):
raise ValueError('The data must be a tuple of two pandas series.')

if not isinstance(n_rows_diff, int) or n_rows_diff < 1:
raise ValueError("'n_rows_diff' must be an integer greater than zero.")

if not isinstance(apply_log, bool):
raise ValueError("'apply_log' must be a boolean.")

cls._validate_inputs(real_data, synthetic_data, n_rows_diff, apply_log)
real_keys, real_values = real_data
synthetic_keys, synthetic_values = synthetic_data
real_values, synthetic_values = cls._apply_log(real_values, synthetic_values, apply_log)

if apply_log:
real_values = np.log(real_values)
synthetic_values = np.log(synthetic_values)

def calculate_differences(keys, values, n_rows_diff, data_name):
group_sizes = values.groupby(keys).size()
num_invalid_groups = group_sizes[group_sizes <= n_rows_diff].count()
if num_invalid_groups > 0:
warnings.warn(
f"n_rows_diff '{n_rows_diff}' is greater than the "
f'size of {num_invalid_groups} sequence keys in {data_name}.'
)

differences = values.groupby(keys).apply(
lambda group: np.mean(
group.to_numpy()[n_rows_diff:] - group.to_numpy()[:-n_rows_diff]
)
if len(group) > n_rows_diff
else np.nan
)

return pd.Series(differences)

real_diff = calculate_differences(real_keys, real_values, n_rows_diff, 'real_data')
synthetic_diff = calculate_differences(
real_diff = cls._calculate_differences(real_keys, real_values, n_rows_diff, 'real_data')
synthetic_diff = cls._calculate_differences(
synthetic_keys, synthetic_values, n_rows_diff, 'synthetic_data'
)

Expand Down
4 changes: 2 additions & 2 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ def check_dependencies(c):

@task
def unit(c):
c.run('python -m pytest ./tests/unit --cov=sdmetrics --cov-report=xml')
c.run('python -m pytest ./tests/unit --cov=sdmetrics --cov-report=xml:./unit_cov.xml')


@task
def integration(c):
c.run('python -m pytest ./tests/integration --reruns 5 --disable-warnings')
c.run('python -m pytest ./tests/integration --reruns 5 --disable-warnings --cov=sdmetrics --cov-report=xml:./integration_cov.xml')


def _get_minimum_versions(dependencies, python_version):
Expand Down
25 changes: 25 additions & 0 deletions tests/unit/column_pairs/statistical/test_inter_row_msas.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,31 @@ def test_compute_with_log(self):
# Assert
assert score == 1

def test_compute_with_log_warning(self):
    """Test that one warning is raised when ``apply_log`` meets non-positive values."""
    # Setup
    sequence_keys = ['id1', 'id1', 'id1', 'id2', 'id2', 'id2']
    real_data = (pd.Series(sequence_keys), pd.Series([1, 1.4, 4, -1, 16, -10]))
    synthetic_data = (pd.Series(sequence_keys), pd.Series([1, 2, -4, 8, 16, 30]))
    expected_message = (
        'There are 3 non-positive values in your data, which cannot be used with log. '
        "Consider changing 'apply_log' to False for a better result."
    )

    # Run
    with pytest.warns(UserWarning) as captured:
        score = InterRowMSAS.compute(
            real_data=real_data,
            synthetic_data=synthetic_data,
            apply_log=True,
        )

    # Assert
    raised_messages = [str(warning.message) for warning in captured]
    assert len(captured) == 1
    assert raised_messages[0] == expected_message
    assert score == 0

def test_compute_different_n_rows_diff(self):
"""Test it with different n_rows_diff."""
# Setup
Expand Down

0 comments on commit e7d49ea

Please sign in to comment.