Commit 1568b68

make release-tag: Merge branch 'main' into stable

amontanez24 committed Nov 15, 2024
2 parents 9d75e23 + c829829 commit 1568b68
Showing 34 changed files with 683 additions and 40 deletions.
1 change: 1 addition & 0 deletions .github/workflows/dependency_checker.yml
@@ -16,6 +16,7 @@ jobs:
        run: |
          python -m pip install .[dev]
          make check-deps OUTPUT_FILEPATH=latest_requirements.txt
+         make fix-lint
      - name: Create pull request
        id: cpr
        uses: peter-evans/create-pull-request@v4
16 changes: 16 additions & 0 deletions HISTORY.md
@@ -1,5 +1,21 @@
# History

## v0.17.0 - 2024-11-14

This release adds a number of Multi-Sequence Aggregate Similarity (MSAS) metrics! (A usage sketch for them follows this file's diff.)

### Bugs Fixed

* Relocate timeseries metrics modules - Issue [#661](https://github.com/sdv-dev/SDMetrics/issues/661) by @fealho
* Fix `SequenceLengthSimilarity` docstrings - Issue [#660](https://github.com/sdv-dev/SDMetrics/issues/660) by @fealho
* When running Quality Report, ContingencySimilarity produces a RuntimeWarning (`The values in the array are unorderable.`) - Issue [#656](https://github.com/sdv-dev/SDMetrics/issues/656) by @R-Palazzo

### New Features

* Add metric for inter-row MSAS - Issue [#640](https://github.com/sdv-dev/SDMetrics/issues/640) by @fealho
* Add metric for general MSAS statistics - Issue [#639](https://github.com/sdv-dev/SDMetrics/issues/639) by @fealho
* Add metric for sequence length similarity - Issue [#638](https://github.com/sdv-dev/SDMetrics/issues/638) by @fealho

## v0.16.0 - 2024-09-25

This release improves the performance of the `contingency_similarity` metric. It also factors dtypes into the score of the `TableStructure` metric.
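For orientation, here is a minimal, hedged usage sketch of the three new metrics, with illustrative data invented for this example. The `compute` signatures for `InterRowMSAS` and `StatisticMSAS` appear later in this diff; `SequenceLengthSimilarity.compute` is assumed to take two series of sequence keys, since its module is not shown in this excerpt.

import pandas as pd

from sdmetrics.column_pairs import InterRowMSAS, StatisticMSAS
from sdmetrics.single_column import SequenceLengthSimilarity

# Hypothetical multi-sequence data: a sequence key column and a numeric column.
real_keys = pd.Series(['a', 'a', 'a', 'b', 'b'])
real_values = pd.Series([1.0, 2.0, 3.0, 10.0, 12.0])
synthetic_keys = pd.Series(['a', 'a', 'b', 'b', 'b'])
synthetic_values = pd.Series([1.0, 2.2, 10.0, 11.5, 13.0])

# Similarity of consecutive-row differences, aggregated per sequence.
InterRowMSAS.compute((real_keys, real_values), (synthetic_keys, synthetic_values))

# Similarity of a per-sequence statistic (here, the mean).
StatisticMSAS.compute((real_keys, real_values), (synthetic_keys, synthetic_values), statistic='mean')

# Similarity of the sequence-length distributions (assumed signature).
SequenceLengthSimilarity.compute(real_keys, synthetic_keys)

All three scores land in [0.0, 1.0], with 1.0 meaning the per-sequence distributions match.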
2 changes: 1 addition & 1 deletion conda/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = '0.16.0' %}
+{% set version = '0.17.0.dev1' %}

package:
name: "{{ name|lower }}"
2 changes: 1 addition & 1 deletion latest_requirements.txt
@@ -4,4 +4,4 @@ pandas==2.2.3
plotly==5.24.1
scikit-learn==1.5.2
scipy==1.13.1
-tqdm==4.66.5
+tqdm==4.67.0
11 changes: 6 additions & 5 deletions pyproject.toml
@@ -134,7 +134,7 @@ convention = 'google'
add-ignore = ['D107', 'D407', 'D417']

[tool.bumpversion]
-current_version = "0.16.0"
+current_version = "0.17.0.dev1"
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
serialize = [
    '{major}.{minor}.{patch}.{release}{candidate}',
@@ -186,7 +186,7 @@ exclude = [
".git",
"__pycache__",
".ipynb_checkpoints",
".ipynb",
"*.ipynb",
"tasks.py",
]

@@ -204,10 +204,11 @@ select = [
    # print statements
    "T201",
    # pandas-vet
-    "PD"
+    "PD",
+    # numpy 2.0
+    "NPY201"
]
ignore = [
-    "E501",
    # pydocstyle
    "D107", # Missing docstring in __init__
    "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
@@ -229,7 +230,7 @@ lines-between-types = 0
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"]
"errors.py" = ["D105"]
"tests/**.py" = ["D", "W505"]
"tests/**.py" = ["D"]

[tool.ruff.lint.pydocstyle]
convention = "google"
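For context on the new `NPY201` selection: this ruff rule flags NumPy 1.x constructs that NumPy 2.0 removed or renamed, warning the codebase ahead of an upgrade. A hypothetical example of code it would flag (not from this repository):

import numpy as np

x = np.float_(3.14)  # NPY201: `np.float_` was removed in NumPy 2.0; use `np.float64`
mask = np.in1d([1, 2], [2, 3])  # NPY201: `np.in1d` is deprecated; use `np.isin`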
2 changes: 1 addition & 1 deletion sdmetrics/__init__.py
@@ -4,7 +4,7 @@

__author__ = 'MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
-__version__ = '0.16.0'
+__version__ = '0.17.0.dev1'

import sys
import warnings as python_warnings
3 changes: 1 addition & 2 deletions sdmetrics/base.py
@@ -99,8 +99,7 @@ def normalize(cls, raw_score):

        if score is None or score < 0 or score > 1:
            raise AssertionError(
-                f'This should be unreachable. The score {score} should be'
-                f'a value between 0 and 1.'
+                f'This should be unreachable. The score {score} should be a value between 0 and 1.'
            )

        if cls.goal == Goal.MINIMIZE:
4 changes: 4 additions & 0 deletions sdmetrics/column_pairs/__init__.py
@@ -11,6 +11,8 @@
    DiscreteKLDivergence,
)
from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity
+from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS
+from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS

__all__ = [
    'CardinalityBoundaryAdherence',
@@ -20,4 +22,6 @@
    'CorrelationSimilarity',
    'DiscreteKLDivergence',
    'ReferentialIntegrity',
+    'InterRowMSAS',
+    'StatisticMSAS',
]
4 changes: 4 additions & 0 deletions sdmetrics/column_pairs/statistical/__init__.py
@@ -10,6 +10,8 @@
    DiscreteKLDivergence,
)
from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity
+from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS
+from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS

__all__ = [
    'CardinalityBoundaryAdherence',
@@ -18,4 +20,6 @@
    'CorrelationSimilarity',
    'DiscreteKLDivergence',
    'ReferentialIntegrity',
+    'InterRowMSAS',
+    'StatisticMSAS',
]
2 changes: 1 addition & 1 deletion sdmetrics/column_pairs/statistical/contingency_similarity.py
@@ -44,7 +44,7 @@ def compute(cls, real_data, synthetic_data):
        contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
            synthetic
        )
-        combined_index = contingency_real.index.union(contingency_synthetic.index)
+        combined_index = contingency_real.index.union(contingency_synthetic.index, sort=False)
        contingency_synthetic = contingency_synthetic.reindex(combined_index, fill_value=0)
        contingency_real = contingency_real.reindex(combined_index, fill_value=0)
        diff = abs(contingency_real - contingency_synthetic).fillna(0)
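The `sort=False` above is the fix for issue #656: unioning the two contingency indexes previously tried to sort them, and sorting unorderable mixed values (e.g., strings alongside the NaN that `dropna=False` keeps) emitted the RuntimeWarning quoted in HISTORY.md. A minimal illustration with hypothetical values:

import numpy as np
import pandas as pd

real_index = pd.Index(['a', 'b', np.nan])
synthetic_index = pd.Index(['b', 'c'])

# With sort=False the union keeps first-seen order and never compares values,
# so no "The values in the array are unorderable" RuntimeWarning is raised.
combined = real_index.union(synthetic_index, sort=False)
print(list(combined))  # ['a', 'b', nan, 'c']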
106 changes: 106 additions & 0 deletions sdmetrics/column_pairs/statistical/inter_row_msas.py
@@ -0,0 +1,106 @@
"""InterRowMSAS module."""

import warnings

import numpy as np
import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class InterRowMSAS:
"""Inter-Row Multi-Sequence Aggregate Similarity (MSAS) metric.
Attributes:
name (str):
Name to use when reports about this metric are printed.
goal (sdmetrics.goal.Goal):
The goal of this metric.
min_value (Union[float, tuple[float]]):
Minimum value or values that this metric can take.
max_value (Union[float, tuple[float]]):
Maximum value or values that this metric can take.
"""

name = 'Inter-Row Multi-Sequence Aggregate Similarity'
goal = Goal.MAXIMIZE
min_value = 0.0
max_value = 1.0

@staticmethod
def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False):
"""Compute this metric.
This metric compares the inter-row differences of sequences in the real data
vs. the synthetic data.
It works as follows:
- Calculate the difference between row r and row r+x for each row in the real data
- Take the average over each sequence to form a distribution D_r
- Do the same for the synthetic data to form a new distribution D_s
- Apply the KSComplement metric to compare the similarities of (D_r, D_s)
- Return this score
Args:
real_data (tuple[pd.Series, pd.Series]):
A tuple of 2 pandas.Series objects. The first represents the sequence key
of the real data and the second represents a continuous column of data.
synthetic_data (tuple[pd.Series, pd.Series]):
A tuple of 2 pandas.Series objects. The first represents the sequence key
of the synthetic data and the second represents a continuous column of data.
n_rows_diff (int):
An integer representing the number of rows to consider when taking the difference.
apply_log (bool):
Whether to apply a natural log before taking the difference.
Returns:
float:
The similarity score between the real and synthetic data distributions.
"""
for data in [real_data, synthetic_data]:
if (
not isinstance(data, tuple)
or len(data) != 2
or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series)))
):
raise ValueError('The data must be a tuple of two pandas series.')

if not isinstance(n_rows_diff, int) or n_rows_diff < 1:
raise ValueError("'n_rows_diff' must be an integer greater than zero.")

if not isinstance(apply_log, bool):
raise ValueError("'apply_log' must be a boolean.")

real_keys, real_values = real_data
synthetic_keys, synthetic_values = synthetic_data

if apply_log:
real_values = np.log(real_values)
synthetic_values = np.log(synthetic_values)

def calculate_differences(keys, values, n_rows_diff, data_name):
group_sizes = values.groupby(keys).size()
num_invalid_groups = group_sizes[group_sizes <= n_rows_diff].count()
if num_invalid_groups > 0:
warnings.warn(
f"n_rows_diff '{n_rows_diff}' is greater than the "
f'size of {num_invalid_groups} sequence keys in {data_name}.'
)

differences = values.groupby(keys).apply(
lambda group: np.mean(
group.to_numpy()[n_rows_diff:] - group.to_numpy()[:-n_rows_diff]
)
if len(group) > n_rows_diff
else np.nan
)

return pd.Series(differences)

real_diff = calculate_differences(real_keys, real_values, n_rows_diff, 'real_data')
synthetic_diff = calculate_differences(
synthetic_keys, synthetic_values, n_rows_diff, 'synthetic_data'
)

return KSComplement.compute(real_diff, synthetic_diff)
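To make the aggregation concrete, here is a hedged walk-through with hypothetical values, mirroring the groupby/apply in `calculate_differences` for `n_rows_diff=1`:

import numpy as np
import pandas as pd

keys = pd.Series(['s1'] * 4 + ['s2'] * 4)
values = pd.Series([1.0, 2.0, 4.0, 7.0, 2.0, 2.5, 3.0, 3.5])

# Each sequence contributes the mean of its consecutive deltas:
#   s1: mean([1.0, 2.0, 3.0]) = 2.0
#   s2: mean([0.5, 0.5, 0.5]) = 0.5
per_sequence = values.groupby(keys).apply(
    lambda group: np.mean(group.to_numpy()[1:] - group.to_numpy()[:-1])
)
print(per_sequence.tolist())  # [2.0, 0.5] -- the distribution handed to KSComplement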
96 changes: 96 additions & 0 deletions sdmetrics/column_pairs/statistical/statistic_msas.py
@@ -0,0 +1,96 @@
"""StatisticMSAS module."""

import numpy as np
import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement


class StatisticMSAS:
"""Statistic Multi-Sequence Aggregate Similarity (MSAS) metric.
Attributes:
name (str):
Name to use when reports about this metric are printed.
goal (sdmetrics.goal.Goal):
The goal of this metric.
min_value (Union[float, tuple[float]]):
Minimum value or values that this metric can take.
max_value (Union[float, tuple[float]]):
Maximum value or values that this metric can take.
"""

name = 'Statistic Multi-Sequence Aggregate Similarity'
goal = Goal.MAXIMIZE
min_value = 0.0
max_value = 1.0

@staticmethod
def compute(real_data, synthetic_data, statistic='mean'):
"""Compute this metric.
This metric compares the distribution of a given statistic across sequences
in the real data vs. the synthetic data.
It works as follows:
- Calculate the specified statistic for each sequence in the real data
- Form a distribution D_r from these statistics
- Do the same for the synthetic data to form a new distribution D_s
- Apply the KSComplement metric to compare the similarities of (D_r, D_s)
- Return this score
Args:
real_data (tuple[pd.Series, pd.Series]):
A tuple of 2 pandas.Series objects. The first represents the sequence key
of the real data and the second represents a continuous column of data.
synthetic_data (tuple[pd.Series, pd.Series]):
A tuple of 2 pandas.Series objects. The first represents the sequence key
of the synthetic data and the second represents a continuous column of data.
statistic (str):
A string representing the statistic function to use when computing MSAS.
Available options are:
- 'mean': The arithmetic mean of the sequence
- 'median': The median value of the sequence
- 'std': The standard deviation of the sequence
- 'min': The minimum value in the sequence
- 'max': The maximum value in the sequence
Returns:
float:
The similarity score between the real and synthetic data distributions.
"""
statistic_functions = {
'mean': np.mean,
'median': np.median,
'std': np.std,
'min': np.min,
'max': np.max,
}
if statistic not in statistic_functions:
raise ValueError(
f'Invalid statistic: {statistic}.'
f' Choose from [{", ".join(statistic_functions.keys())}].'
)

for data in [real_data, synthetic_data]:
if (
not isinstance(data, tuple)
or len(data) != 2
or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series)))
):
raise ValueError('The data must be a tuple of two pandas series.')

real_keys, real_values = real_data
synthetic_keys, synthetic_values = synthetic_data
stat_func = statistic_functions[statistic]

def calculate_statistics(keys, values):
df = pd.DataFrame({'keys': keys, 'values': values})
return df.groupby('keys')['values'].agg(stat_func)

real_stats = calculate_statistics(real_keys, real_values)
synthetic_stats = calculate_statistics(synthetic_keys, synthetic_values)

return KSComplement.compute(real_stats, synthetic_stats)
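A short hedged usage sketch with hypothetical data; each sequence collapses to a single statistic, and `KSComplement` compares the two resulting distributions:

import pandas as pd

from sdmetrics.column_pairs import StatisticMSAS

real_keys = pd.Series(['s1'] * 3 + ['s2'] * 3)
real_values = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
synthetic_keys = pd.Series(['s1'] * 3 + ['s2'] * 3)
synthetic_values = pd.Series([1.1, 2.0, 2.9, 4.2, 5.0, 5.8])

# Per-sequence medians: real -> [2.0, 5.0], synthetic -> [2.0, 5.0]
score = StatisticMSAS.compute(
    real_data=(real_keys, real_values),
    synthetic_data=(synthetic_keys, synthetic_values),
    statistic='median',
)
print(score)  # 1.0 here, since the per-sequence median distributions coincide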
5 changes: 3 additions & 2 deletions sdmetrics/reports/base_report.py
@@ -50,7 +50,7 @@ def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata):
            error_message = (
                'The metadata does not match the data. The following columns are missing'
                ' in the real/synthetic data or in the metadata: '
-                f"{', '.join(sorted(missing_columns))}"
+                f'{", ".join(sorted(missing_columns))}'
            )
            raise ValueError(error_message)

@@ -145,7 +145,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
        if not isinstance(metadata, dict):
            raise TypeError(
                f"Expected a dictionary but received a '{type(metadata).__name__}' instead."
-                " For SDV metadata objects, please use the 'to_dict' function to convert it to a dictionary."
+                " For SDV metadata objects, please use the 'to_dict' function to convert it"
+                ' to a dictionary.'
            )

        self._validate(real_data, synthetic_data, metadata)
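For callers, the clarified message means SDV metadata objects must be converted before generating a report; a hedged sketch (the report class and variables are illustrative):

# Assuming an SDV metadata object with a to_dict() method, e.g. SingleTableMetadata;
# BaseReport.generate expects a plain dict.
report.generate(real_data, synthetic_data, metadata.to_dict(), verbose=True)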
2 changes: 1 addition & 1 deletion sdmetrics/reports/single_table/plot_utils.py
@@ -313,7 +313,7 @@ def get_column_pairs_plot(score_breakdowns, average_score=None):
            xaxis='x',
            yaxis='y',
            hovertemplate=(
-                '<b>Column Pair</b><br>(%{x},%{y})<br><br>Similarity: ' '%{z}<extra></extra>'
+                '<b>Column Pair</b><br>(%{x},%{y})<br><br>Similarity: %{z}<extra></extra>'
            ),
        ),
        1,
4 changes: 3 additions & 1 deletion sdmetrics/reports/utils.py
@@ -222,7 +222,9 @@ def _validate_categorical_values(real_data, synthetic_data, metadata, table=None
            The name of the current table, if one exists
    """
    if table:
-        warning_format = 'Unexpected values ({values}) in column "{column}" ' f'and table "{table}"'
+        warning_format = (
+            f'Unexpected values ({{values}}) in column "{{column}}" and table "{table}"'
+        )
    else:
        warning_format = 'Unexpected values ({values}) in column "{column}"'

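The rewritten string interpolates only `table` at f-string time; the doubled braces leave `{values}` and `{column}` as literal placeholders for a later `str.format` call. A minimal illustration with hypothetical inputs:

table = 'users'
warning_format = f'Unexpected values ({{values}}) in column "{{column}}" and table "{table}"'
print(warning_format.format(values=[1, 2], column='age'))
# Unexpected values ([1, 2]) in column "age" and table "users"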
2 changes: 2 additions & 0 deletions sdmetrics/single_column/__init__.py
@@ -12,6 +12,7 @@
from sdmetrics.single_column.statistical.range_coverage import RangeCoverage
from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity
from sdmetrics.single_column.statistical.tv_complement import TVComplement
+from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity

__all__ = [
    'base',
@@ -26,4 +27,5 @@
    'RangeCoverage',
    'StatisticSimilarity',
    'TVComplement',
+    'SequenceLengthSimilarity',
]