Commit 1568b68

make release-tag: Merge branch 'main' into stable

amontanez24 committed Nov 15, 2024
2 parents 9d75e23 + c829829 commit 1568b68
Showing 34 changed files with 683 additions and 40 deletions.
1 change: 1 addition & 0 deletions .github/workflows/dependency_checker.yml
@@ -16,6 +16,7 @@ jobs:
        run: |
          python -m pip install .[dev]
          make check-deps OUTPUT_FILEPATH=latest_requirements.txt
+         make fix-lint
      - name: Create pull request
        id: cpr
        uses: peter-evans/create-pull-request@v4
16 changes: 16 additions & 0 deletions HISTORY.md
@@ -1,5 +1,21 @@
# History

## v0.17.0 - 2024-11-14

This release adds a number of Multi-Sequence Aggregate Similarity (MSAS) metrics! (A usage sketch for them follows this file's diff.)

### Bugs Fixed

* Relocate timeseries metrics modules - Issue [#661](https://github.com/sdv-dev/SDMetrics/issues/661) by @fealho
* Fix `SequenceLengthSimilarity` docstrings - Issue [#660](https://github.com/sdv-dev/SDMetrics/issues/660) by @fealho
* When running Quality Report, ContingencySimilarity produces a RuntimeWarning (`The values in the array are unorderable.`) - Issue [#656](https://github.com/sdv-dev/SDMetrics/issues/656) by @R-Palazzo

### New Features

* Add metric for inter-row MSAS - Issue [#640](https://github.com/sdv-dev/SDMetrics/issues/640) by @fealho
* Add metric for general MSAS statistics - Issue [#639](https://github.com/sdv-dev/SDMetrics/issues/639) by @fealho
* Add metric for sequence length similarity - Issue [#638](https://github.com/sdv-dev/SDMetrics/issues/638) by @fealho

## v0.16.0 - 2024-09-25

This release improves the performance of the `contingency_similarity` metric. It also factors dtypes into the score of the `TableStructure` metric.
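For orientation, here is a minimal, hedged usage sketch of the three new metrics, with illustrative data invented for this example. The `compute` signatures for `InterRowMSAS` and `StatisticMSAS` appear later in this diff; `SequenceLengthSimilarity.compute` is assumed to take two series of sequence keys, since its module is not shown in this excerpt.

import pandas as pd

from sdmetrics.column_pairs import InterRowMSAS, StatisticMSAS
from sdmetrics.single_column import SequenceLengthSimilarity

# Hypothetical multi-sequence data: a sequence key column and a numeric column.
real_keys = pd.Series(['a', 'a', 'a', 'b', 'b'])
real_values = pd.Series([1.0, 2.0, 3.0, 10.0, 12.0])
synthetic_keys = pd.Series(['a', 'a', 'b', 'b', 'b'])
synthetic_values = pd.Series([1.0, 2.2, 10.0, 11.5, 13.0])

# Similarity of consecutive-row differences, aggregated per sequence.
InterRowMSAS.compute((real_keys, real_values), (synthetic_keys, synthetic_values))

# Similarity of a per-sequence statistic (here, the mean).
StatisticMSAS.compute((real_keys, real_values), (synthetic_keys, synthetic_values), statistic='mean')

# Similarity of the sequence-length distributions (assumed signature).
SequenceLengthSimilarity.compute(real_keys, synthetic_keys)

All three scores land in [0.0, 1.0], with 1.0 meaning the per-sequence distributions match.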
2 changes: 1 addition & 1 deletion conda/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = '0.16.0' %}
+{% set version = '0.17.0.dev1' %}

package:
name: "{{ name|lower }}"
2 changes: 1 addition & 1 deletion latest_requirements.txt
@@ -4,4 +4,4 @@ pandas==2.2.3
plotly==5.24.1
scikit-learn==1.5.2
scipy==1.13.1
-tqdm==4.66.5
+tqdm==4.67.0
11 changes: 6 additions & 5 deletions pyproject.toml
@@ -134,7 +134,7 @@ convention = 'google'
add-ignore = ['D107', 'D407', 'D417']

[tool.bumpversion]
-current_version = "0.16.0"
+current_version = "0.17.0.dev1"
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
serialize = [
    '{major}.{minor}.{patch}.{release}{candidate}',
@@ -186,7 +186,7 @@ exclude = [
".git",
"__pycache__",
".ipynb_checkpoints",
".ipynb",
"*.ipynb",
"tasks.py",
]

@@ -204,10 +204,11 @@ select = [
    # print statements
    "T201",
    # pandas-vet
-    "PD"
+    "PD",
+    # numpy 2.0
+    "NPY201"
]
ignore = [
-    "E501",
    # pydocstyle
    "D107", # Missing docstring in __init__
    "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
@@ -229,7 +230,7 @@ lines-between-types = 0
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"]
"errors.py" = ["D105"]
"tests/**.py" = ["D", "W505"]
"tests/**.py" = ["D"]

[tool.ruff.lint.pydocstyle]
convention = "google"
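For context on the new `NPY201` selection: this ruff rule flags NumPy 1.x constructs that NumPy 2.0 removed or renamed, warning the codebase ahead of an upgrade. A hypothetical example of code it would flag (not from this repository):

import numpy as np

x = np.float_(3.14)  # NPY201: `np.float_` was removed in NumPy 2.0; use `np.float64`
mask = np.in1d([1, 2], [2, 3])  # NPY201: `np.in1d` is deprecated; use `np.isin`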
2 changes: 1 addition & 1 deletion sdmetrics/__init__.py
@@ -4,7 +4,7 @@

__author__ = 'MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
-__version__ = '0.16.0'
+__version__ = '0.17.0.dev1'

import sys
import warnings as python_warnings
3 changes: 1 addition & 2 deletions sdmetrics/base.py
@@ -99,8 +99,7 @@ def normalize(cls, raw_score):

        if score is None or score < 0 or score > 1:
            raise AssertionError(
-                f'This should be unreachable. The score {score} should be'
-                f'a value between 0 and 1.'
+                f'This should be unreachable. The score {score} should be a value between 0 and 1.'
            )

        if cls.goal == Goal.MINIMIZE:
4 changes: 4 additions & 0 deletions sdmetrics/column_pairs/__init__.py
@@ -11,6 +11,8 @@
    DiscreteKLDivergence,
)
from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity
+from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS
+from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS

__all__ = [
    'CardinalityBoundaryAdherence',
@@ -20,4 +22,6 @@
    'CorrelationSimilarity',
    'DiscreteKLDivergence',
    'ReferentialIntegrity',
+    'InterRowMSAS',
+    'StatisticMSAS',
]
4 changes: 4 additions & 0 deletions sdmetrics/column_pairs/statistical/__init__.py
@@ -10,6 +10,8 @@
    DiscreteKLDivergence,
)
from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity
+from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS
+from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS

__all__ = [
    'CardinalityBoundaryAdherence',
@@ -18,4 +20,6 @@
    'CorrelationSimilarity',
    'DiscreteKLDivergence',
    'ReferentialIntegrity',
+    'InterRowMSAS',
+    'StatisticMSAS',
]
2 changes: 1 addition & 1 deletion sdmetrics/column_pairs/statistical/contingency_similarity.py
@@ -44,7 +44,7 @@ def compute(cls, real_data, synthetic_data):
        contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
            synthetic
        )
-        combined_index = contingency_real.index.union(contingency_synthetic.index)
+        combined_index = contingency_real.index.union(contingency_synthetic.index, sort=False)
        contingency_synthetic = contingency_synthetic.reindex(combined_index, fill_value=0)
        contingency_real = contingency_real.reindex(combined_index, fill_value=0)
        diff = abs(contingency_real - contingency_synthetic).fillna(0)
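The `sort=False` above is the fix for issue #656: unioning the two contingency indexes previously tried to sort them, and sorting unorderable mixed values (e.g., strings alongside the NaN that `dropna=False` keeps) emitted the RuntimeWarning quoted in HISTORY.md. A minimal illustration with hypothetical values:

import numpy as np
import pandas as pd

real_index = pd.Index(['a', 'b', np.nan])
synthetic_index = pd.Index(['b', 'c'])

# With sort=False the union keeps first-seen order and never compares values,
# so no "The values in the array are unorderable" RuntimeWarning is raised.
combined = real_index.union(synthetic_index, sort=False)
print(list(combined))  # ['a', 'b', nan, 'c']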
106 changes: 106 additions & 0 deletions sdmetrics/column_pairs/statistical/inter_row_msas.py
@@ -0,0 +1,106 @@
"""InterRowMSAS module."""

import warnings

import numpy as np
import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class InterRowMSAS:
"""Inter-Row Multi-Sequence Aggregate Similarity (MSAS) metric.
Attributes:
name (str):
Name to use when reports about this metric are printed.
goal (sdmetrics.goal.Goal):
The goal of this metric.
min_value (Union[float, tuple[float]]):
Minimum value or values that this metric can take.
max_value (Union[float, tuple[float]]):
Maximum value or values that this metric can take.
"""

name = 'Inter-Row Multi-Sequence Aggregate Similarity'
goal = Goal.MAXIMIZE
min_value = 0.0
max_value = 1.0

@staticmethod
def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False):
"""Compute this metric.
This metric compares the inter-row differences of sequences in the real data
vs. the synthetic data.
It works as follows:
- Calculate the difference between row r and row r+x for each row in the real data
- Take the average over each sequence to form a distribution D_r
- Do the same for the synthetic data to form a new distribution D_s
- Apply the KSComplement metric to compare the similarities of (D_r, D_s)
- Return this score
Args:
real_data (tuple[pd.Series, pd.Series]):
A tuple of 2 pandas.Series objects. The first represents the sequence key
of the real data and the second represents a continuous column of data.
synthetic_data (tuple[pd.Series, pd.Series]):
A tuple of 2 pandas.Series objects. The first represents the sequence key
of the synthetic data and the second represents a continuous column of data.
n_rows_diff (int):
An integer representing the number of rows to consider when taking the difference.
apply_log (bool):
Whether to apply a natural log before taking the difference.
Returns:
float:
The similarity score between the real and synthetic data distributions.
"""
for data in [real_data, synthetic_data]:
if (
not isinstance(data, tuple)
or len(data) != 2
or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series)))
):
raise ValueError('The data must be a tuple of two pandas series.')

if not isinstance(n_rows_diff, int) or n_rows_diff < 1:
raise ValueError("'n_rows_diff' must be an integer greater than zero.")

if not isinstance(apply_log, bool):
raise ValueError("'apply_log' must be a boolean.")

real_keys, real_values = real_data
synthetic_keys, synthetic_values = synthetic_data

if apply_log:
real_values = np.log(real_values)
synthetic_values = np.log(synthetic_values)

def calculate_differences(keys, values, n_rows_diff, data_name):
group_sizes = values.groupby(keys).size()
num_invalid_groups = group_sizes[group_sizes <= n_rows_diff].count()
if num_invalid_groups > 0:
warnings.warn(
f"n_rows_diff '{n_rows_diff}' is greater than the "
f'size of {num_invalid_groups} sequence keys in {data_name}.'
)

differences = values.groupby(keys).apply(
lambda group: np.mean(
group.to_numpy()[n_rows_diff:] - group.to_numpy()[:-n_rows_diff]
)
if len(group) > n_rows_diff
else np.nan
)

return pd.Series(differences)

real_diff = calculate_differences(real_keys, real_values, n_rows_diff, 'real_data')
synthetic_diff = calculate_differences(
synthetic_keys, synthetic_values, n_rows_diff, 'synthetic_data'
)

return KSComplement.compute(real_diff, synthetic_diff)
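To make the aggregation concrete, here is a hedged walk-through with hypothetical values, mirroring the groupby/apply in `calculate_differences` for `n_rows_diff=1`:

import numpy as np
import pandas as pd

keys = pd.Series(['s1'] * 4 + ['s2'] * 4)
values = pd.Series([1.0, 2.0, 4.0, 7.0, 2.0, 2.5, 3.0, 3.5])

# Each sequence contributes the mean of its consecutive deltas:
#   s1: mean([1.0, 2.0, 3.0]) = 2.0
#   s2: mean([0.5, 0.5, 0.5]) = 0.5
per_sequence = values.groupby(keys).apply(
    lambda group: np.mean(group.to_numpy()[1:] - group.to_numpy()[:-1])
)
print(per_sequence.tolist())  # [2.0, 0.5] -- the distribution handed to KSComplement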
96 changes: 96 additions & 0 deletions sdmetrics/column_pairs/statistical/statistic_msas.py
@@ -0,0 +1,96 @@
"""StatisticMSAS module."""

import numpy as np
import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class StatisticMSAS:
"""Statistic Multi-Sequence Aggregate Similarity (MSAS) metric.
Attributes:
name (str):
Name to use when reports about this metric are printed.
goal (sdmetrics.goal.Goal):
The goal of this metric.
min_value (Union[float, tuple[float]]):
Minimum value or values that this metric can take.
max_value (Union[float, tuple[float]]):
Maximum value or values that this metric can take.
"""

name = 'Statistic Multi-Sequence Aggregate Similarity'
goal = Goal.MAXIMIZE
min_value = 0.0
max_value = 1.0

@staticmethod
def compute(real_data, synthetic_data, statistic='mean'):
"""Compute this metric.
This metric compares the distribution of a given statistic across sequences
in the real data vs. the synthetic data.
It works as follows:
- Calculate the specified statistic for each sequence in the real data
- Form a distribution D_r from these statistics
- Do the same for the synthetic data to form a new distribution D_s
- Apply the KSComplement metric to compare the similarities of (D_r, D_s)
- Return this score
Args:
real_data (tuple[pd.Series, pd.Series]):
A tuple of 2 pandas.Series objects. The first represents the sequence key
of the real data and the second represents a continuous column of data.
synthetic_data (tuple[pd.Series, pd.Series]):
A tuple of 2 pandas.Series objects. The first represents the sequence key
of the synthetic data and the second represents a continuous column of data.
statistic (str):
A string representing the statistic function to use when computing MSAS.
Available options are:
- 'mean': The arithmetic mean of the sequence
- 'median': The median value of the sequence
- 'std': The standard deviation of the sequence
- 'min': The minimum value in the sequence
- 'max': The maximum value in the sequence
Returns:
float:
The similarity score between the real and synthetic data distributions.
"""
statistic_functions = {
'mean': np.mean,
'median': np.median,
'std': np.std,
'min': np.min,
'max': np.max,
}
if statistic not in statistic_functions:
raise ValueError(
f'Invalid statistic: {statistic}.'
f' Choose from [{", ".join(statistic_functions.keys())}].'
)

for data in [real_data, synthetic_data]:
if (
not isinstance(data, tuple)
or len(data) != 2
or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series)))
):
raise ValueError('The data must be a tuple of two pandas series.')

real_keys, real_values = real_data
synthetic_keys, synthetic_values = synthetic_data
stat_func = statistic_functions[statistic]

def calculate_statistics(keys, values):
df = pd.DataFrame({'keys': keys, 'values': values})
return df.groupby('keys')['values'].agg(stat_func)

real_stats = calculate_statistics(real_keys, real_values)
synthetic_stats = calculate_statistics(synthetic_keys, synthetic_values)

return KSComplement.compute(real_stats, synthetic_stats)
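A short hedged usage sketch with hypothetical data; each sequence collapses to a single statistic, and `KSComplement` compares the two resulting distributions:

import pandas as pd

from sdmetrics.column_pairs import StatisticMSAS

real_keys = pd.Series(['s1'] * 3 + ['s2'] * 3)
real_values = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
synthetic_keys = pd.Series(['s1'] * 3 + ['s2'] * 3)
synthetic_values = pd.Series([1.1, 2.0, 2.9, 4.2, 5.0, 5.8])

# Per-sequence medians: real -> [2.0, 5.0], synthetic -> [2.0, 5.0]
score = StatisticMSAS.compute(
    real_data=(real_keys, real_values),
    synthetic_data=(synthetic_keys, synthetic_values),
    statistic='median',
)
print(score)  # 1.0 here, since the per-sequence median distributions coincide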
5 changes: 3 additions & 2 deletions sdmetrics/reports/base_report.py
@@ -50,7 +50,7 @@ def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata):
            error_message = (
                'The metadata does not match the data. The following columns are missing'
                ' in the real/synthetic data or in the metadata: '
-                f"{', '.join(sorted(missing_columns))}"
+                f'{", ".join(sorted(missing_columns))}'
            )
            raise ValueError(error_message)

@@ -145,7 +145,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
        if not isinstance(metadata, dict):
            raise TypeError(
                f"Expected a dictionary but received a '{type(metadata).__name__}' instead."
-                " For SDV metadata objects, please use the 'to_dict' function to convert it to a dictionary."
+                " For SDV metadata objects, please use the 'to_dict' function to convert it"
+                ' to a dictionary.'
            )

        self._validate(real_data, synthetic_data, metadata)
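For callers, the clarified message means SDV metadata objects must be converted before generating a report; a hedged sketch (the report class and variables are illustrative):

# Assuming an SDV metadata object with a to_dict() method, e.g. SingleTableMetadata;
# BaseReport.generate expects a plain dict.
report.generate(real_data, synthetic_data, metadata.to_dict(), verbose=True)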
2 changes: 1 addition & 1 deletion sdmetrics/reports/single_table/plot_utils.py
@@ -313,7 +313,7 @@ def get_column_pairs_plot(score_breakdowns, average_score=None):
            xaxis='x',
            yaxis='y',
            hovertemplate=(
-                '<b>Column Pair</b><br>(%{x},%{y})<br><br>Similarity: ' '%{z}<extra></extra>'
+                '<b>Column Pair</b><br>(%{x},%{y})<br><br>Similarity: %{z}<extra></extra>'
            ),
        ),
        1,
4 changes: 3 additions & 1 deletion sdmetrics/reports/utils.py
@@ -222,7 +222,9 @@ def _validate_categorical_values(real_data, synthetic_data, metadata, table=None
            The name of the current table, if one exists
    """
    if table:
-        warning_format = 'Unexpected values ({values}) in column "{column}" ' f'and table "{table}"'
+        warning_format = (
+            f'Unexpected values ({{values}}) in column "{{column}}" and table "{table}"'
+        )
    else:
        warning_format = 'Unexpected values ({values}) in column "{column}"'

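The rewritten string interpolates only `table` at f-string time; the doubled braces leave `{values}` and `{column}` as literal placeholders for a later `str.format` call. A minimal illustration with hypothetical inputs:

table = 'users'
warning_format = f'Unexpected values ({{values}}) in column "{{column}}" and table "{table}"'
print(warning_format.format(values=[1, 2], column='age'))
# Unexpected values ([1, 2]) in column "age" and table "users"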
2 changes: 2 additions & 0 deletions sdmetrics/single_column/__init__.py
@@ -12,6 +12,7 @@
from sdmetrics.single_column.statistical.range_coverage import RangeCoverage
from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity
from sdmetrics.single_column.statistical.tv_complement import TVComplement
+from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity

__all__ = [
    'base',
@@ -26,4 +27,5 @@
    'RangeCoverage',
    'StatisticSimilarity',
    'TVComplement',
+    'SequenceLengthSimilarity',
]