Skip to content

Commit

Permalink
Add InterRowMSAS, StatisticMSAS and SequenceLengthSimilarity me…
Browse files Browse the repository at this point in the history
…trics (#662)
  • Loading branch information
fealho authored Nov 14, 2024
1 parent 838e81d commit d5ccb75
Show file tree
Hide file tree
Showing 11 changed files with 617 additions and 1 deletion.
4 changes: 4 additions & 0 deletions sdmetrics/column_pairs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
DiscreteKLDivergence,
)
from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity
from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS
from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS

__all__ = [
'CardinalityBoundaryAdherence',
Expand All @@ -20,4 +22,6 @@
'CorrelationSimilarity',
'DiscreteKLDivergence',
'ReferentialIntegrity',
'InterRowMSAS',
'StatisticMSAS',
]
4 changes: 4 additions & 0 deletions sdmetrics/column_pairs/statistical/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
DiscreteKLDivergence,
)
from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity
from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS
from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS

__all__ = [
'CardinalityBoundaryAdherence',
Expand All @@ -18,4 +20,6 @@
'CorrelationSimilarity',
'DiscreteKLDivergence',
'ReferentialIntegrity',
'InterRowMSAS',
'StatisticMSAS',
]
106 changes: 106 additions & 0 deletions sdmetrics/column_pairs/statistical/inter_row_msas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""InterRowMSAS module."""

import warnings

import numpy as np
import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class InterRowMSAS:
    """Inter-Row Multi-Sequence Aggregate Similarity (MSAS) metric.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'Inter-Row Multi-Sequence Aggregate Similarity'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0

    @staticmethod
    def _validate_inputs(real_data, synthetic_data, n_rows_diff, apply_log):
        """Raise ``ValueError`` if any argument of ``compute`` is malformed."""
        for data in (real_data, synthetic_data):
            is_series_pair = (
                isinstance(data, tuple)
                and len(data) == 2
                and all(isinstance(item, pd.Series) for item in data)
            )
            if not is_series_pair:
                raise ValueError('The data must be a tuple of two pandas series.')

        # NumPy integers (e.g. values read back from an array) are accepted too.
        if not isinstance(n_rows_diff, (int, np.integer)) or n_rows_diff < 1:
            raise ValueError("'n_rows_diff' must be an integer greater than zero.")

        if not isinstance(apply_log, bool):
            raise ValueError("'apply_log' must be a boolean.")

    @staticmethod
    def _safe_log(values, data_name):
        """Return the natural log of ``values``, masking non-positive entries as NaN."""
        num_invalid = int((values <= 0).sum())
        if num_invalid > 0:
            warnings.warn(
                f'There are {num_invalid} non-positive values in {data_name}, which cannot '
                'be used with log. They are treated as missing values. Consider setting '
                "'apply_log' to False."
            )

        # Masking first keeps -inf (log of 0) out of the per-sequence means and
        # avoids numpy's generic RuntimeWarning in favour of the message above.
        return np.log(values.where(values > 0))

    @staticmethod
    def _mean_differences(keys, values, n_rows_diff, data_name):
        """Return the mean ``n_rows_diff``-lagged difference of every sequence."""
        grouped = values.groupby(keys)

        # A sequence needs at least ``n_rows_diff + 1`` rows to yield one difference.
        group_sizes = grouped.size()
        num_invalid_groups = group_sizes[group_sizes <= n_rows_diff].count()
        if num_invalid_groups > 0:
            warnings.warn(
                f"n_rows_diff '{n_rows_diff}' is greater than the "
                f'size of {num_invalid_groups} sequence keys in {data_name}.'
            )

        def mean_difference(group):
            # Sequences that are too short contribute NaN.
            if len(group) <= n_rows_diff:
                return np.nan

            sequence = group.to_numpy()
            return np.mean(sequence[n_rows_diff:] - sequence[:-n_rows_diff])

        return pd.Series(grouped.apply(mean_difference))

    @staticmethod
    def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False):
        """Compute this metric.

        This metric compares the inter-row differences of sequences in the real data
        vs. the synthetic data.

        It works as follows:
            - Calculate the difference between row r and row r+x for each row in the real data
            - Take the average over each sequence to form a distribution D_r
            - Do the same for the synthetic data to form a new distribution D_s
            - Apply the KSComplement metric to compare the similarities of (D_r, D_s)
            - Return this score

        Sequences with ``n_rows_diff`` rows or fewer cannot produce a difference; they
        contribute NaN and a warning reports how many such sequences were found.
        When ``apply_log`` is True, non-positive values have no logarithm; they are
        treated as missing values and a warning reports how many were found.

        Args:
            real_data (tuple[pd.Series, pd.Series]):
                A tuple of 2 pandas.Series objects. The first represents the sequence key
                of the real data and the second represents a continuous column of data.
            synthetic_data (tuple[pd.Series, pd.Series]):
                A tuple of 2 pandas.Series objects. The first represents the sequence key
                of the synthetic data and the second represents a continuous column of data.
            n_rows_diff (int):
                An integer representing the number of rows to consider when taking the
                difference. Must be greater than zero.
            apply_log (bool):
                Whether to apply a natural log before taking the difference.

        Returns:
            float:
                The similarity score between the real and synthetic data distributions.

        Raises:
            ValueError:
                If either data argument is not a tuple of two pandas.Series, if
                ``n_rows_diff`` is not an integer greater than zero, or if ``apply_log``
                is not a boolean.
        """
        InterRowMSAS._validate_inputs(real_data, synthetic_data, n_rows_diff, apply_log)

        real_keys, real_values = real_data
        synthetic_keys, synthetic_values = synthetic_data

        if apply_log:
            real_values = InterRowMSAS._safe_log(real_values, 'real_data')
            synthetic_values = InterRowMSAS._safe_log(synthetic_values, 'synthetic_data')

        real_diff = InterRowMSAS._mean_differences(
            real_keys, real_values, n_rows_diff, 'real_data'
        )
        synthetic_diff = InterRowMSAS._mean_differences(
            synthetic_keys, synthetic_values, n_rows_diff, 'synthetic_data'
        )

        return KSComplement.compute(real_diff, synthetic_diff)
96 changes: 96 additions & 0 deletions sdmetrics/column_pairs/statistical/statistic_msas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""StatisticMSAS module."""

import numpy as np
import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class StatisticMSAS:
    """Statistic Multi-Sequence Aggregate Similarity (MSAS) metric.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'Statistic Multi-Sequence Aggregate Similarity'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0

    @staticmethod
    def compute(real_data, synthetic_data, statistic='mean'):
        """Compute this metric.

        This metric compares the distribution of a given statistic across sequences
        in the real data vs. the synthetic data.

        It works as follows:
            - Calculate the specified statistic for each sequence in the real data
            - Form a distribution D_r from these statistics
            - Do the same for the synthetic data to form a new distribution D_s
            - Apply the KSComplement metric to compare the similarities of (D_r, D_s)
            - Return this score

        Args:
            real_data (tuple[pd.Series, pd.Series]):
                A tuple of 2 pandas.Series objects. The first represents the sequence key
                of the real data and the second represents a continuous column of data.
            synthetic_data (tuple[pd.Series, pd.Series]):
                A tuple of 2 pandas.Series objects. The first represents the sequence key
                of the synthetic data and the second represents a continuous column of data.
            statistic (str):
                A string representing the statistic function to use when computing MSAS.
                The statistics are computed with the pandas group-by reductions of the
                same name. Available options are:
                    - 'mean': The arithmetic mean of the sequence
                    - 'median': The median value of the sequence
                    - 'std': The standard deviation of the sequence
                    - 'min': The minimum value in the sequence
                    - 'max': The maximum value in the sequence

        Returns:
            float:
                The similarity score between the real and synthetic data distributions.

        Raises:
            ValueError:
                If ``statistic`` is not one of the available options, or if either data
                argument is not a tuple of two pandas.Series.
        """
        supported_statistics = ('mean', 'median', 'std', 'min', 'max')
        if statistic not in supported_statistics:
            raise ValueError(
                f'Invalid statistic: {statistic}.'
                f' Choose from [{", ".join(supported_statistics)}].'
            )

        for data in (real_data, synthetic_data):
            is_series_pair = (
                isinstance(data, tuple)
                and len(data) == 2
                and all(isinstance(item, pd.Series) for item in data)
            )
            if not is_series_pair:
                raise ValueError('The data must be a tuple of two pandas series.')

        real_keys, real_values = real_data
        synthetic_keys, synthetic_values = synthetic_data

        def calculate_statistics(keys, values):
            df = pd.DataFrame({'keys': keys, 'values': values})
            # Aggregate by name rather than by NumPy callable: pandas remaps callables
            # such as np.std to its own reductions and warns that this remapping will
            # be removed, which would change the results. The name keeps them stable.
            return df.groupby('keys')['values'].agg(statistic)

        real_stats = calculate_statistics(real_keys, real_values)
        synthetic_stats = calculate_statistics(synthetic_keys, synthetic_values)

        return KSComplement.compute(real_stats, synthetic_stats)
2 changes: 2 additions & 0 deletions sdmetrics/single_column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from sdmetrics.single_column.statistical.range_coverage import RangeCoverage
from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity
from sdmetrics.single_column.statistical.tv_complement import TVComplement
from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity

__all__ = [
'base',
Expand All @@ -26,4 +27,5 @@
'RangeCoverage',
'StatisticSimilarity',
'TVComplement',
'SequenceLengthSimilarity',
]
2 changes: 2 additions & 0 deletions sdmetrics/single_column/statistical/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sdmetrics.single_column.statistical.range_coverage import RangeCoverage
from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity
from sdmetrics.single_column.statistical.tv_complement import TVComplement
from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity

__all__ = [
'BoundaryAdherence',
Expand All @@ -22,4 +23,5 @@
'RangeCoverage',
'StatisticSimilarity',
'TVComplement',
'SequenceLengthSimilarity',
]
9 changes: 8 additions & 1 deletion sdmetrics/single_column/statistical/kscomplement.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Kolmogorov-Smirnov test based Metric."""

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

Expand Down Expand Up @@ -56,7 +57,13 @@ def compute(real_data, synthetic_data):
real_data = pd.to_numeric(real_data)
synthetic_data = pd.to_numeric(synthetic_data)

statistic, _ = ks_2samp(real_data, synthetic_data)
try:
statistic, _ = ks_2samp(real_data, synthetic_data)
except ValueError as e:
if str(e) == 'Data passed to ks_2samp must not be empty':
return np.nan
else:
raise ValueError(e)

return 1 - statistic

Expand Down
53 changes: 53 additions & 0 deletions sdmetrics/single_column/statistical/sequence_length_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""SequenceLengthSimilarity module."""

import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class SequenceLengthSimilarity:
    """Sequence Length Similarity metric.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'Sequence Length Similarity'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0

    @staticmethod
    def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float:
        """Compute this metric.

        A sequence's length is the number of rows that share its sequence key: if
        id_09231 appears 150 times in the sequence key column, that sequence has
        length 150. The metric compares how those lengths are distributed in the
        real data vs. the synthetic data.

        It works as follows:
            - Count the occurrences of each sequence key in the real data
            - Count the occurrences of each sequence key in the synthetic data
            - Apply the KSComplement metric to the two sets of counts
            - Return this score

        Args:
            real_data (pd.Series):
                The sequence key values from the real dataset.
            synthetic_data (pd.Series):
                The sequence key values from the synthetic dataset.

        Returns:
            float:
                The score.
        """
        # One entry per distinct key; the value is that sequence's length.
        real_lengths = real_data.value_counts()
        synthetic_lengths = synthetic_data.value_counts()

        return KSComplement.compute(real_lengths, synthetic_lengths)
Loading

0 comments on commit d5ccb75

Please sign in to comment.