From e171245c74cc87a3777a20b86f7b06ceef302069 Mon Sep 17 00:00:00 2001
From: Andrew Montanez
Date: Wed, 25 Sep 2024 14:45:16 -0400
Subject: [PATCH 01/11] =?UTF-8?q?Bump=20version:=200.16.0=20=E2=86=92=200.?=
 =?UTF-8?q?16.1.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 conda/meta.yaml       | 2 +-
 pyproject.toml        | 2 +-
 sdmetrics/__init__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index 0611c98b..8b65e9e1 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = '0.16.0' %}
+{% set version = '0.16.1.dev0' %}
 
 package:
   name: "{{ name|lower }}"
diff --git a/pyproject.toml b/pyproject.toml
index 7962b60c..23b42eb6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -134,7 +134,7 @@ convention = 'google'
 add-ignore = ['D107', 'D407', 'D417']
 
 [tool.bumpversion]
-current_version = "0.16.0"
+current_version = "0.16.1.dev0"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',
diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py
index 92fe24e4..8f1972a6 100644
--- a/sdmetrics/__init__.py
+++ b/sdmetrics/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.16.0'
+__version__ = '0.16.1.dev0'
 
 import sys
 import warnings as python_warnings

From b0d4fb9ae44ceaff7545352a9cec7157a9d210eb Mon Sep 17 00:00:00 2001
From: SDV Team <98988753+sdv-team@users.noreply.github.com>
Date: Wed, 25 Sep 2024 17:17:26 -0400
Subject: [PATCH 02/11] Latest Code Analysis (#636)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 static_code_analysis.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/static_code_analysis.txt b/static_code_analysis.txt
index b9dd8d90..702d9ce5 100644
--- a/static_code_analysis.txt
+++ b/static_code_analysis.txt
@@ -1,4 +1,4 @@
-Run started:2024-08-13 22:18:16.859345
+Run started:2024-09-25 19:32:12.685068
 
 Test results:
 >> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
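
For context on the B101 findings listed in this file: `assert` statements are
stripped when CPython byte-compiles with optimizations enabled, which is why
bandit flags them as unsuitable for runtime validation. A minimal sketch
(hypothetical code, not from this repository) of the failure mode:

    def hamming_distance(target, test):
        # This check disappears under `python -O` (when __debug__ is False),
        # so it cannot be relied on to validate inputs in production.
        assert len(target) == len(test), 'Tuples must have the same length!'
        return sum(a != b for a, b in zip(target, test))

    print(hamming_distance('abc', 'abd'))  # 1 -- run with -O and the guard is gone
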
@@ -117,7 +117,7 @@ Test results: -------------------------------------------------- Code scanned: - Total lines of code: 9249 + Total lines of code: 9225 Total lines skipped (#nosec): 0 Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0 From f58f4437663fd721f91947ac2a46005ef7f90494 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Mon, 28 Oct 2024 10:07:26 -0400 Subject: [PATCH 03/11] Fix lint + Pin ruff version (#646) --- pyproject.toml | 11 ++++++----- sdmetrics/base.py | 3 +-- sdmetrics/reports/base_report.py | 5 +++-- sdmetrics/reports/single_table/plot_utils.py | 2 +- sdmetrics/reports/utils.py | 4 +++- sdmetrics/single_table/privacy/base.py | 2 +- sdmetrics/single_table/privacy/util.py | 2 +- sdmetrics/visualization.py | 7 ++++--- .../statistical/test_cardinality_shape_similarity.py | 3 +-- .../reports/multi_table/_properties/test_structure.py | 2 +- .../single_table/_properties/test_column_shapes.py | 3 +-- .../single_table/_properties/test_structure.py | 6 +++--- .../single_table/_properties/test_synthesis.py | 8 ++------ .../statistical/test_category_adherence.py | 3 +-- 14 files changed, 29 insertions(+), 32 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 23b42eb6..d64d78f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dev = [ 'watchdog>=1.0.1,<5', # style check - 'ruff>=0.3.2,<1', + 'ruff>=0.3.2,<0.7.2', # distribute on PyPI 'twine>=1.10.0,<6', @@ -186,7 +186,7 @@ exclude = [ ".git", "__pycache__", ".ipynb_checkpoints", - ".ipynb", + "*.ipynb", "tasks.py", ] @@ -204,10 +204,11 @@ select = [ # print statements "T201", # pandas-vet - "PD" + "PD", + # numpy 2.0 + "NPY201" ] ignore = [ - "E501", # pydocstyle "D107", # Missing docstring in __init__ "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 @@ -229,7 +230,7 @@ lines-between-types = 0 [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] "errors.py" = ["D105"] -"tests/**.py" = ["D", "W505"] +"tests/**.py" = ["D"] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/sdmetrics/base.py b/sdmetrics/base.py index c8ac9033..650a76c0 100644 --- a/sdmetrics/base.py +++ b/sdmetrics/base.py @@ -99,8 +99,7 @@ def normalize(cls, raw_score): if score is None or score < 0 or score > 1: raise AssertionError( - f'This should be unreachable. The score {score} should be' - f'a value between 0 and 1.' + f'This should be unreachable. The score {score} should bea value between 0 and 1.' ) if cls.goal == Goal.MINIMIZE: diff --git a/sdmetrics/reports/base_report.py b/sdmetrics/reports/base_report.py index a179baed..614b8083 100644 --- a/sdmetrics/reports/base_report.py +++ b/sdmetrics/reports/base_report.py @@ -50,7 +50,7 @@ def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata): error_message = ( 'The metadata does not match the data. The following columns are missing' ' in the real/synthetic data or in the metadata: ' - f"{', '.join(sorted(missing_columns))}" + f'{", ".join(sorted(missing_columns))}' ) raise ValueError(error_message) @@ -145,7 +145,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True): if not isinstance(metadata, dict): raise TypeError( f"Expected a dictionary but received a '{type(metadata).__name__}' instead." - " For SDV metadata objects, please use the 'to_dict' function to convert it to a dictionary." 
+ " For SDV metadata objects, please use the 'to_dict' function to convert it" + ' to a dictionary.' ) self._validate(real_data, synthetic_data, metadata) diff --git a/sdmetrics/reports/single_table/plot_utils.py b/sdmetrics/reports/single_table/plot_utils.py index 202cf060..995ccb85 100644 --- a/sdmetrics/reports/single_table/plot_utils.py +++ b/sdmetrics/reports/single_table/plot_utils.py @@ -313,7 +313,7 @@ def get_column_pairs_plot(score_breakdowns, average_score=None): xaxis='x', yaxis='y', hovertemplate=( - 'Column Pair
(%{x},%{y})

Similarity: ' '%{z}' + 'Column Pair
(%{x},%{y})

Similarity: %{z}' ), ), 1, diff --git a/sdmetrics/reports/utils.py b/sdmetrics/reports/utils.py index 66d80239..c019e0ab 100644 --- a/sdmetrics/reports/utils.py +++ b/sdmetrics/reports/utils.py @@ -222,7 +222,9 @@ def _validate_categorical_values(real_data, synthetic_data, metadata, table=None The name of the current table, if one exists """ if table: - warning_format = 'Unexpected values ({values}) in column "{column}" ' f'and table "{table}"' + warning_format = ( + f'Unexpected values ({{values}}) in column "{{column}}" and table "{table}"' + ) else: warning_format = 'Unexpected values ({values}) in column "{column}"' diff --git a/sdmetrics/single_table/privacy/base.py b/sdmetrics/single_table/privacy/base.py index e67327a9..08a8d600 100644 --- a/sdmetrics/single_table/privacy/base.py +++ b/sdmetrics/single_table/privacy/base.py @@ -366,5 +366,5 @@ def score(self, key_data, sensitive_data): The sensitive data. """ raise NotImplementedError( - 'Posterior probability based scoring not supported' 'for this attacker!' + 'Posterior probability based scoring not supportedfor this attacker!' ) diff --git a/sdmetrics/single_table/privacy/util.py b/sdmetrics/single_table/privacy/util.py index 9873c42b..918bd969 100644 --- a/sdmetrics/single_table/privacy/util.py +++ b/sdmetrics/single_table/privacy/util.py @@ -70,7 +70,7 @@ def hamming_distance(target, test): """ dist = 0 assert len(target) == len(test), ( - 'Tuples must have the same length in the' 'calculation of hamming distance!' + 'Tuples must have the same length in thecalculation of hamming distance!' ) for target_entry, test_entry in zip(target, test): diff --git a/sdmetrics/visualization.py b/sdmetrics/visualization.py index 52377991..24427b5a 100644 --- a/sdmetrics/visualization.py +++ b/sdmetrics/visualization.py @@ -32,7 +32,7 @@ def wrapper(*args, **kwargs): ipython_interpreter = str(get_ipython()) if 'ZMQInteractiveShell' in ipython_interpreter and 'iframe' in renderers: # This means we are using jupyter notebook - pio.renderers.default = 'iframe' + pio.renderers.default = 'vscode' except Exception: pass @@ -554,8 +554,9 @@ def get_column_plot(real_data, synthetic_data, column_name, plot_type=None): plot_type = 'bar' elif plot_type == 'distplot' and column_is_constant: raise ValueError( - f"Plot type 'distplot' cannot be created because column '{column_name}' has a constant value inside" - " the real or synthetic data. To render a visualization, please update the plot_type to 'bar'." + f"Plot type 'distplot' cannot be created because column '{column_name}'" + ' has a constant value inside the real or synthetic data. To render a' + " visualization, please update the plot_type to 'bar'." ) fig = _generate_column_plot(real_column, synthetic_column, plot_type) diff --git a/tests/unit/multi_table/statistical/test_cardinality_shape_similarity.py b/tests/unit/multi_table/statistical/test_cardinality_shape_similarity.py index 9fc41112..9234577f 100644 --- a/tests/unit/multi_table/statistical/test_cardinality_shape_similarity.py +++ b/tests/unit/multi_table/statistical/test_cardinality_shape_similarity.py @@ -197,8 +197,7 @@ def test_compute_breakdown_no_relationships(self): assert result == expected_metric_breakdown @patch( - 'sdmetrics.multi_table.statistical.cardinality_shape_similarity.MultiTableMetric.' - 'normalize' + 'sdmetrics.multi_table.statistical.cardinality_shape_similarity.MultiTableMetric.normalize' ) def test_normalize(self, normalize_mock): """Test the ``normalize`` method. 
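
Many of the string changes in PATCH 03 above (`base.py`, `privacy/base.py`,
`privacy/util.py`) deal with Python's implicit string concatenation: adjacent
literals are joined with no separator, so a missing trailing space vanishes
silently, and the automated fix preserves the joined text exactly as written.
A minimal illustration of the pitfall (hypothetical snippet, mirroring the
`privacy/base.py` hunk above):

    # Two adjacent literals concatenate with nothing inserted between them ...
    message = 'Posterior probability based scoring not supported' 'for this attacker!'
    print(message)  # -> '... not supportedfor this attacker!'

    # ... so a joined single literal must spell out the separating space itself.
    fixed = 'Posterior probability based scoring not supported for this attacker!'
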
diff --git a/tests/unit/reports/multi_table/_properties/test_structure.py b/tests/unit/reports/multi_table/_properties/test_structure.py index d8b1dc2e..81a14635 100644 --- a/tests/unit/reports/multi_table/_properties/test_structure.py +++ b/tests/unit/reports/multi_table/_properties/test_structure.py @@ -52,7 +52,7 @@ def test_get_visualization(mock_px): 'x': 'Table', 'y': 'Score', 'title': ( - 'Data Diagnostic: Structure (Average ' f'Score={mock__compute_average.return_value})' + f'Data Diagnostic: Structure (Average Score={mock__compute_average.return_value})' ), 'category_orders': {'group': mock_df['Table'].tolist()}, 'color': 'Metric', diff --git a/tests/unit/reports/single_table/_properties/test_column_shapes.py b/tests/unit/reports/single_table/_properties/test_column_shapes.py index 2aea852a..f9c688cf 100644 --- a/tests/unit/reports/single_table/_properties/test_column_shapes.py +++ b/tests/unit/reports/single_table/_properties/test_column_shapes.py @@ -147,8 +147,7 @@ def test_get_visualization(self, mock_px): 'x': 'Column', 'y': 'Score', 'title': ( - 'Data Quality: Column Shapes (Average ' - f'Score={mock__compute_average.return_value})' + f'Data Quality: Column Shapes (Average Score={mock__compute_average.return_value})' ), 'category_orders': {'group': mock_df['Column'].tolist()}, 'color': 'Metric', diff --git a/tests/unit/reports/single_table/_properties/test_structure.py b/tests/unit/reports/single_table/_properties/test_structure.py index cd82a7ed..32869132 100644 --- a/tests/unit/reports/single_table/_properties/test_structure.py +++ b/tests/unit/reports/single_table/_properties/test_structure.py @@ -9,7 +9,7 @@ class TestStructure: - @patch('sdmetrics.reports.single_table._properties.structure.' 'TableStructure.compute') + @patch('sdmetrics.reports.single_table._properties.structure.TableStructure.compute') def test__generate_details(self, table_format_mock): """Test the ``_generate_details`` method.""" # Setup @@ -49,7 +49,7 @@ def test__generate_details(self, table_format_mock): ) pd.testing.assert_frame_equal(result, expected_details) - @patch('sdmetrics.reports.single_table._properties.structure.' 'TableStructure.compute') + @patch('sdmetrics.reports.single_table._properties.structure.TableStructure.compute') def test__generate_details_with_id_column(self, table_format_mock): """Test the ``_generate_details`` method.""" # Setup @@ -96,7 +96,7 @@ def test_get_visualization(self): # Run and Assert expected_message = ( - 'The single table Structure property does not have a' ' supported visualization.' + 'The single table Structure property does not have a supported visualization.' ) with pytest.raises(VisualizationUnavailableError, match=expected_message): structure_property.get_visualization() diff --git a/tests/unit/reports/single_table/_properties/test_synthesis.py b/tests/unit/reports/single_table/_properties/test_synthesis.py index 7b8da813..68a4eb7c 100644 --- a/tests/unit/reports/single_table/_properties/test_synthesis.py +++ b/tests/unit/reports/single_table/_properties/test_synthesis.py @@ -7,9 +7,7 @@ class TestSynthesis: - @patch( - 'sdmetrics.reports.single_table._properties.synthesis.' 'NewRowSynthesis.compute_breakdown' - ) + @patch('sdmetrics.reports.single_table._properties.synthesis.NewRowSynthesis.compute_breakdown') def test__generate_details(self, newrowsynthesis_mock): """Test the ``_generate_details`` method. 
@@ -64,9 +62,7 @@ def test__generate_details(self, newrowsynthesis_mock): pd.testing.assert_frame_equal(details, expected__details) - @patch( - 'sdmetrics.reports.single_table._properties.synthesis.' 'NewRowSynthesis.compute_breakdown' - ) + @patch('sdmetrics.reports.single_table._properties.synthesis.NewRowSynthesis.compute_breakdown') def test__generate_details_error(self, newrowsynthesis_mock): """Test the ``_generate_details`` method when the metric raises an error.""" # Setup diff --git a/tests/unit/single_column/statistical/test_category_adherence.py b/tests/unit/single_column/statistical/test_category_adherence.py index f4af0bd8..c77f0820 100644 --- a/tests/unit/single_column/statistical/test_category_adherence.py +++ b/tests/unit/single_column/statistical/test_category_adherence.py @@ -36,8 +36,7 @@ def test_compute_breakdown_with_nans(self): assert result == {'score': 0.9} @patch( - 'sdmetrics.single_column.statistical.category_adherence.' - 'CategoryAdherence.compute_breakdown' + 'sdmetrics.single_column.statistical.category_adherence.CategoryAdherence.compute_breakdown' ) def test_compute(self, compute_breakdown_mock): """Test the ``compute`` method.""" From 838c2001989a733602a4a41286691030c679bd6c Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:04:39 -0400 Subject: [PATCH 04/11] Run fix-lint during latest dependency check workflow (#650) --- .github/workflows/dependency_checker.yml | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dependency_checker.yml b/.github/workflows/dependency_checker.yml index 7900502e..589ce20d 100644 --- a/.github/workflows/dependency_checker.yml +++ b/.github/workflows/dependency_checker.yml @@ -16,6 +16,7 @@ jobs: run: | python -m pip install .[dev] make check-deps OUTPUT_FILEPATH=latest_requirements.txt + make fix-lint - name: Create pull request id: cpr uses: peter-evans/create-pull-request@v4 diff --git a/pyproject.toml b/pyproject.toml index d64d78f7..5a5bdeb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dev = [ 'watchdog>=1.0.1,<5', # style check - 'ruff>=0.3.2,<0.7.2', + 'ruff>=0.3.2,<1', # distribute on PyPI 'twine>=1.10.0,<6', From 27c7e1d6d10fef246c1f4080789f69cbbedf0ef4 Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:46:11 -0500 Subject: [PATCH 05/11] Automated Latest Dependency Updates (#651) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index 4bb9945c..691f176d 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -4,4 +4,4 @@ pandas==2.2.3 plotly==5.24.1 scikit-learn==1.5.2 scipy==1.13.1 -tqdm==4.66.5 +tqdm==4.66.6 From 369613ba3c83d1afc275438debc5cc88c0eb7e3c Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Tue, 12 Nov 2024 09:10:12 -0500 Subject: [PATCH 06/11] Automated Latest Dependency Updates (#655) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index 691f176d..aed0be91 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -4,4 +4,4 @@ pandas==2.2.3 plotly==5.24.1 scikit-learn==1.5.2 
scipy==1.13.1 -tqdm==4.66.6 +tqdm==4.67.0 From 838e81db5556630d82195d93c2d16a9bfa94c7a8 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:34:43 -0500 Subject: [PATCH 07/11] When running Quality Report, ContingencySimilarity produces a RuntimeWarning (`The values in the array are unorderable.`) (#657) --- .../statistical/contingency_similarity.py | 2 +- .../reports/multi_table/test_quality_report.py | 2 +- .../statistical/test_contingency_similarity.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sdmetrics/column_pairs/statistical/contingency_similarity.py b/sdmetrics/column_pairs/statistical/contingency_similarity.py index 5d2c801d..e41075e0 100644 --- a/sdmetrics/column_pairs/statistical/contingency_similarity.py +++ b/sdmetrics/column_pairs/statistical/contingency_similarity.py @@ -44,7 +44,7 @@ def compute(cls, real_data, synthetic_data): contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len( synthetic ) - combined_index = contingency_real.index.union(contingency_synthetic.index) + combined_index = contingency_real.index.union(contingency_synthetic.index, sort=False) contingency_synthetic = contingency_synthetic.reindex(combined_index, fill_value=0) contingency_real = contingency_real.reindex(combined_index, fill_value=0) diff = abs(contingency_real - contingency_synthetic).fillna(0) diff --git a/tests/integration/reports/multi_table/test_quality_report.py b/tests/integration/reports/multi_table/test_quality_report.py index 9c3b79b1..fb571e81 100644 --- a/tests/integration/reports/multi_table/test_quality_report.py +++ b/tests/integration/reports/multi_table/test_quality_report.py @@ -342,7 +342,7 @@ def test_quality_report_with_errors(): None, ], }) - assert score == 0.7249603174603174 + assert score == 0.7249603174603175 pd.testing.assert_frame_equal(properties, expected_properties) pd.testing.assert_frame_equal(details_column_shapes, expected_details) diff --git a/tests/unit/column_pairs/statistical/test_contingency_similarity.py b/tests/unit/column_pairs/statistical/test_contingency_similarity.py index 713aab7d..dc7bafdd 100644 --- a/tests/unit/column_pairs/statistical/test_contingency_similarity.py +++ b/tests/unit/column_pairs/statistical/test_contingency_similarity.py @@ -1,6 +1,7 @@ from unittest.mock import patch import pandas as pd +import pytest from sdmetrics.column_pairs.statistical import ContingencySimilarity @@ -53,3 +54,15 @@ def test_normalize(self, normalize_mock): # Assert normalize_mock.assert_called_once_with(raw_score) assert result == normalize_mock.return_value + + @pytest.mark.filterwarnings('error:.*The values in the array are unorderable.*:RuntimeWarning') + def test_no_runtime_warning_raised(self): + """Test that no RuntimeWarning warning is raised when the metric is computed.""" + # Setup + real_data = pd.DataFrame(data={'A': ['value'] * 4, 'B': ['1', '2', '3', pd.NA]}) + synthetic_data = pd.DataFrame(data={'A': ['value'] * 3, 'B': ['1', '2', pd.NA]}) + + # Run and Assert + ContingencySimilarity.compute( + real_data=real_data[['A', 'B']], synthetic_data=synthetic_data[['A', 'B']] + ) From d5ccb7536e83cc6e5b459e23c938e7b60276bfb8 Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Thu, 14 Nov 2024 14:59:09 -0800 Subject: [PATCH 08/11] Add `InterRowMSAS`, `StatisticMSAS` and `SequenceLengthSimilarity` metrics (#662) --- sdmetrics/column_pairs/__init__.py | 4 + .../column_pairs/statistical/__init__.py | 4 + 
.../statistical/inter_row_msas.py | 106 +++++++++++ .../statistical/statistic_msas.py | 96 ++++++++++ sdmetrics/single_column/__init__.py | 2 + .../single_column/statistical/__init__.py | 2 + .../single_column/statistical/kscomplement.py | 9 +- .../statistical/sequence_length_similarity.py | 53 ++++++ .../statistical/test_inter_row_msas.py | 176 ++++++++++++++++++ .../statistical/test_statistic_msas.py | 125 +++++++++++++ .../test_sequence_length_similarity.py | 41 ++++ 11 files changed, 617 insertions(+), 1 deletion(-) create mode 100644 sdmetrics/column_pairs/statistical/inter_row_msas.py create mode 100644 sdmetrics/column_pairs/statistical/statistic_msas.py create mode 100644 sdmetrics/single_column/statistical/sequence_length_similarity.py create mode 100644 tests/unit/column_pairs/statistical/test_inter_row_msas.py create mode 100644 tests/unit/column_pairs/statistical/test_statistic_msas.py create mode 100644 tests/unit/single_column/statistical/test_sequence_length_similarity.py diff --git a/sdmetrics/column_pairs/__init__.py b/sdmetrics/column_pairs/__init__.py index e44e35de..38f1aebe 100644 --- a/sdmetrics/column_pairs/__init__.py +++ b/sdmetrics/column_pairs/__init__.py @@ -11,6 +11,8 @@ DiscreteKLDivergence, ) from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity +from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS +from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS __all__ = [ 'CardinalityBoundaryAdherence', @@ -20,4 +22,6 @@ 'CorrelationSimilarity', 'DiscreteKLDivergence', 'ReferentialIntegrity', + 'InterRowMSAS', + 'StatisticMSAS', ] diff --git a/sdmetrics/column_pairs/statistical/__init__.py b/sdmetrics/column_pairs/statistical/__init__.py index 7f921df6..7198944e 100644 --- a/sdmetrics/column_pairs/statistical/__init__.py +++ b/sdmetrics/column_pairs/statistical/__init__.py @@ -10,6 +10,8 @@ DiscreteKLDivergence, ) from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity +from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS +from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS __all__ = [ 'CardinalityBoundaryAdherence', @@ -18,4 +20,6 @@ 'CorrelationSimilarity', 'DiscreteKLDivergence', 'ReferentialIntegrity', + 'InterRowMSAS', + 'StatisticMSAS', ] diff --git a/sdmetrics/column_pairs/statistical/inter_row_msas.py b/sdmetrics/column_pairs/statistical/inter_row_msas.py new file mode 100644 index 00000000..eea77f06 --- /dev/null +++ b/sdmetrics/column_pairs/statistical/inter_row_msas.py @@ -0,0 +1,106 @@ +"""InterRowMSAS module.""" + +import warnings + +import numpy as np +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class InterRowMSAS: + """Inter-Row Multi-Sequence Aggregate Similarity (MSAS) metric. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'Inter-Row Multi-Sequence Aggregate Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False): + """Compute this metric. 
+ + This metric compares the inter-row differences of sequences in the real data + vs. the synthetic data. + + It works as follows: + - Calculate the difference between row r and row r+x for each row in the real data + - Take the average over each sequence to form a distribution D_r + - Do the same for the synthetic data to form a new distribution D_s + - Apply the KSComplement metric to compare the similarities of (D_r, D_s) + - Return this score + + Args: + real_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the real data and the second represents a continuous column of data. + synthetic_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the synthetic data and the second represents a continuous column of data. + n_rows_diff (int): + An integer representing the number of rows to consider when taking the difference. + apply_log (bool): + Whether to apply a natural log before taking the difference. + + Returns: + float: + The similarity score between the real and synthetic data distributions. + """ + for data in [real_data, synthetic_data]: + if ( + not isinstance(data, tuple) + or len(data) != 2 + or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series))) + ): + raise ValueError('The data must be a tuple of two pandas series.') + + if not isinstance(n_rows_diff, int) or n_rows_diff < 1: + raise ValueError("'n_rows_diff' must be an integer greater than zero.") + + if not isinstance(apply_log, bool): + raise ValueError("'apply_log' must be a boolean.") + + real_keys, real_values = real_data + synthetic_keys, synthetic_values = synthetic_data + + if apply_log: + real_values = np.log(real_values) + synthetic_values = np.log(synthetic_values) + + def calculate_differences(keys, values, n_rows_diff, data_name): + group_sizes = values.groupby(keys).size() + num_invalid_groups = group_sizes[group_sizes <= n_rows_diff].count() + if num_invalid_groups > 0: + warnings.warn( + f"n_rows_diff '{n_rows_diff}' is greater than the " + f'size of {num_invalid_groups} sequence keys in {data_name}.' + ) + + differences = values.groupby(keys).apply( + lambda group: np.mean( + group.to_numpy()[n_rows_diff:] - group.to_numpy()[:-n_rows_diff] + ) + if len(group) > n_rows_diff + else np.nan + ) + + return pd.Series(differences) + + real_diff = calculate_differences(real_keys, real_values, n_rows_diff, 'real_data') + synthetic_diff = calculate_differences( + synthetic_keys, synthetic_values, n_rows_diff, 'synthetic_data' + ) + + return KSComplement.compute(real_diff, synthetic_diff) diff --git a/sdmetrics/column_pairs/statistical/statistic_msas.py b/sdmetrics/column_pairs/statistical/statistic_msas.py new file mode 100644 index 00000000..8afab764 --- /dev/null +++ b/sdmetrics/column_pairs/statistical/statistic_msas.py @@ -0,0 +1,96 @@ +"""StatisticMSAS module.""" + +import numpy as np +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class StatisticMSAS: + """Statistic Multi-Sequence Aggregate Similarity (MSAS) metric. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. 
+ """ + + name = 'Statistic Multi-Sequence Aggregate Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data, synthetic_data, statistic='mean'): + """Compute this metric. + + This metric compares the distribution of a given statistic across sequences + in the real data vs. the synthetic data. + + It works as follows: + - Calculate the specified statistic for each sequence in the real data + - Form a distribution D_r from these statistics + - Do the same for the synthetic data to form a new distribution D_s + - Apply the KSComplement metric to compare the similarities of (D_r, D_s) + - Return this score + + Args: + real_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the real data and the second represents a continuous column of data. + synthetic_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the synthetic data and the second represents a continuous column of data. + statistic (str): + A string representing the statistic function to use when computing MSAS. + + Available options are: + - 'mean': The arithmetic mean of the sequence + - 'median': The median value of the sequence + - 'std': The standard deviation of the sequence + - 'min': The minimum value in the sequence + - 'max': The maximum value in the sequence + + Returns: + float: + The similarity score between the real and synthetic data distributions. + """ + statistic_functions = { + 'mean': np.mean, + 'median': np.median, + 'std': np.std, + 'min': np.min, + 'max': np.max, + } + if statistic not in statistic_functions: + raise ValueError( + f'Invalid statistic: {statistic}.' + f' Choose from [{", ".join(statistic_functions.keys())}].' 
+ ) + + for data in [real_data, synthetic_data]: + if ( + not isinstance(data, tuple) + or len(data) != 2 + or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series))) + ): + raise ValueError('The data must be a tuple of two pandas series.') + + real_keys, real_values = real_data + synthetic_keys, synthetic_values = synthetic_data + stat_func = statistic_functions[statistic] + + def calculate_statistics(keys, values): + df = pd.DataFrame({'keys': keys, 'values': values}) + return df.groupby('keys')['values'].agg(stat_func) + + real_stats = calculate_statistics(real_keys, real_values) + synthetic_stats = calculate_statistics(synthetic_keys, synthetic_values) + + return KSComplement.compute(real_stats, synthetic_stats) diff --git a/sdmetrics/single_column/__init__.py b/sdmetrics/single_column/__init__.py index 563ea574..fdd9d9f1 100644 --- a/sdmetrics/single_column/__init__.py +++ b/sdmetrics/single_column/__init__.py @@ -12,6 +12,7 @@ from sdmetrics.single_column.statistical.range_coverage import RangeCoverage from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity from sdmetrics.single_column.statistical.tv_complement import TVComplement +from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity __all__ = [ 'base', @@ -26,4 +27,5 @@ 'RangeCoverage', 'StatisticSimilarity', 'TVComplement', + 'SequenceLengthSimilarity', ] diff --git a/sdmetrics/single_column/statistical/__init__.py b/sdmetrics/single_column/statistical/__init__.py index 252cd6ac..228a456b 100644 --- a/sdmetrics/single_column/statistical/__init__.py +++ b/sdmetrics/single_column/statistical/__init__.py @@ -10,6 +10,7 @@ from sdmetrics.single_column.statistical.range_coverage import RangeCoverage from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity from sdmetrics.single_column.statistical.tv_complement import TVComplement +from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity __all__ = [ 'BoundaryAdherence', @@ -22,4 +23,5 @@ 'RangeCoverage', 'StatisticSimilarity', 'TVComplement', + 'SequenceLengthSimilarity', ] diff --git a/sdmetrics/single_column/statistical/kscomplement.py b/sdmetrics/single_column/statistical/kscomplement.py index 3be01330..525e85c7 100644 --- a/sdmetrics/single_column/statistical/kscomplement.py +++ b/sdmetrics/single_column/statistical/kscomplement.py @@ -1,5 +1,6 @@ """Kolmogorov-Smirnov test based Metric.""" +import numpy as np import pandas as pd from scipy.stats import ks_2samp @@ -56,7 +57,13 @@ def compute(real_data, synthetic_data): real_data = pd.to_numeric(real_data) synthetic_data = pd.to_numeric(synthetic_data) - statistic, _ = ks_2samp(real_data, synthetic_data) + try: + statistic, _ = ks_2samp(real_data, synthetic_data) + except ValueError as e: + if str(e) == 'Data passed to ks_2samp must not be empty': + return np.nan + else: + raise ValueError(e) return 1 - statistic diff --git a/sdmetrics/single_column/statistical/sequence_length_similarity.py b/sdmetrics/single_column/statistical/sequence_length_similarity.py new file mode 100644 index 00000000..105f159b --- /dev/null +++ b/sdmetrics/single_column/statistical/sequence_length_similarity.py @@ -0,0 +1,53 @@ +"""SequenceLengthSimilarity module.""" + +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class SequenceLengthSimilarity: + """Sequence Length Similarity metric. 
+ + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'Sequence Length Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float: + """Compute this metric. + + The length of a sequence is determined by the number of times the same sequence key occurs. + For example if id_09231 appeared 150 times in the sequence key, then the sequence is of + length 150. This metric compares the lengths of all sequence keys in the + real data vs. the synthetic data. + + It works as follows: + - Calculate the length of each sequence in the real data + - Calculate the length of each sequence in the synthetic data + - Apply the KSComplement metric to compare the similarities of the distributions + - Return this score + + Args: + real_data (pd.Series): + The values from the real dataset. + synthetic_data (pd.Series): + The values from the synthetic dataset. + + Returns: + float: + The score. + """ + return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts()) diff --git a/tests/unit/column_pairs/statistical/test_inter_row_msas.py b/tests/unit/column_pairs/statistical/test_inter_row_msas.py new file mode 100644 index 00000000..9a3552db --- /dev/null +++ b/tests/unit/column_pairs/statistical/test_inter_row_msas.py @@ -0,0 +1,176 @@ +import pandas as pd +import pytest + +from sdmetrics.column_pairs import InterRowMSAS + + +class TestInterRowMSAS: + def test_compute(self): + """Test it runs.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 10, 3, 7, 5, 1]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 0.5 + + def test_compute_identical_sequences(self): + """Test it returns 1 when real and synthetic data are identical.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4, 5, 6]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 1 + + def test_compute_different_sequences(self): + """Test it for distinct distributions.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 3, 5, 2, 4, 6]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 0 + + def test_compute_with_log(self): + """Test it with logarithmic transformation.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 4, 8, 16, 32]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 
'id2']) + synthetic_values = pd.Series([1, 2, 4, 8, 16, 32]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + apply_log=True, + ) + + # Assert + assert score == 1 + + def test_compute_different_n_rows_diff(self): + """Test it with different n_rows_diff.""" + # Setup + real_keys = pd.Series(['id1'] * 10 + ['id2'] * 10) + real_values = pd.Series(list(range(10)) + list(range(10))) + synthetic_keys = pd.Series(['id1'] * 10 + ['id2'] * 10) + synthetic_values = pd.Series(list(range(10)) + list(range(10))) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=3, + ) + + # Assert + assert score == 1 + + def test_compute_invalid_real_data(self): + """Test that it raises ValueError when real_data is invalid.""" + # Setup + real_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + InterRowMSAS.compute( + real_data=real_data, + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=1, + apply_log=False, + ) + + def test_compute_invalid_synthetic_data(self): + """Test that it raises ValueError when synthetic_data is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=synthetic_data, + n_rows_diff=1, + apply_log=False, + ) + + def test_compute_invalid_n_rows_diff(self): + """Test that it raises ValueError when n_rows_diff is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_keys = pd.Series(['id3', 'id3', 'id4', 'id4']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match="'n_rows_diff' must be an integer greater than zero."): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=0, + apply_log=False, + ) + + def test_compute_invalid_apply_log(self): + """Test that it raises ValueError when apply_log is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match="'apply_log' must be a boolean."): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=1, + apply_log='True', # Should be a boolean, not a string + ) + + def test_compute_warning(self): + """Test a warning is raised when n_rows_diff is greater than sequence values size.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 10, 3, 7, 5, 1]) + + # Run and Assert + warn_msg = "n_rows_diff '10' is greater than the size of 2 sequence keys in real_data." 
+ with pytest.warns(UserWarning, match=warn_msg): + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=10, + ) + + # Assert + assert pd.isna(score) diff --git a/tests/unit/column_pairs/statistical/test_statistic_msas.py b/tests/unit/column_pairs/statistical/test_statistic_msas.py new file mode 100644 index 00000000..9e8813eb --- /dev/null +++ b/tests/unit/column_pairs/statistical/test_statistic_msas.py @@ -0,0 +1,125 @@ +import re + +import pandas as pd +import pytest + +from sdmetrics.column_pairs import StatisticMSAS + + +class TestStatisticMSAS: + def test_compute_identical_sequences(self): + """Test it returns 1 when real and synthetic data are identical.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 2, 3, 4, 5, 6]) + + # Run and Assert + for statistic in ['mean', 'median', 'std', 'min', 'max']: + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic=statistic, + ) + assert score == 1 + + def test_compute_different_sequences(self): + """Test it for distinct distributions.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([10, 20, 30, 40, 50, 60]) + + # Run and Assert + for statistic in ['mean', 'median', 'std', 'min', 'max']: + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic=statistic, + ) + assert score == 0 + + def test_compute_with_single_sequence(self): + """Test it with a single sequence.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1']) + real_values = pd.Series([1, 2, 3]) + synthetic_keys = pd.Series(['id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3]) + + # Run + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='mean', + ) + + # Assert + assert score == 1 + + def test_compute_with_different_sequence_lengths(self): + """Test it with different sequence lengths.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5]) + synthetic_keys = pd.Series(['id2', 'id2', 'id3', 'id4', 'id5']) + synthetic_values = pd.Series([1, 2, 3, 4, 5]) + + # Run + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='mean', + ) + + # Assert + assert score == 0.75 + + def test_compute_with_invalid_statistic(self): + """Test it raises ValueError for invalid statistic.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1']) + real_values = pd.Series([1, 2, 3]) + synthetic_keys = pd.Series(['id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3]) + + # Run and Assert + err_msg = re.escape( + 'Invalid statistic: invalid. Choose from [mean, median, std, min, max].' 
+ ) + with pytest.raises(ValueError, match=err_msg): + StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='invalid', + ) + + def test_compute_invalid_real_data(self): + """Test that it raises ValueError when real_data is invalid.""" + # Setup + real_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + StatisticMSAS.compute( + real_data=real_data, + synthetic_data=(synthetic_keys, synthetic_values), + ) + + def test_compute_invalid_synthetic_data(self): + """Test that it raises ValueError when synthetic_data is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=synthetic_data, + ) diff --git a/tests/unit/single_column/statistical/test_sequence_length_similarity.py b/tests/unit/single_column/statistical/test_sequence_length_similarity.py new file mode 100644 index 00000000..4e27ab98 --- /dev/null +++ b/tests/unit/single_column/statistical/test_sequence_length_similarity.py @@ -0,0 +1,41 @@ +import pandas as pd + +from sdmetrics.single_column import SequenceLengthSimilarity + + +class TestSequenceLengthSimilarity: + def test_compute(self): + """Test it runs.""" + # Setup + real_data = pd.Series(['id1', 'id2', 'id2', 'id3']) + synthetic_data = pd.Series(['id4', 'id5', 'id6']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 0.6666666666666667 + + def test_compute_one(self): + """Test it returns 1 when real and synthetic data have the same distribution.""" + # Setup + real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3']) + synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 1 + + def test_compute_low_score(self): + """Test it for distinct distributions.""" + # Setup + real_data = pd.Series([f'id{i}' for i in range(100)]) + synthetic_data = pd.Series(['id100'] * 100) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 0 From 0f8c8aff86fba653acc09092eb9496de5b7ff4b7 Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 14 Nov 2024 17:01:42 -0600 Subject: [PATCH 09/11] =?UTF-8?q?Bump=20version:=200.16.1.dev0=20=E2=86=92?= =?UTF-8?q?=200.17.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conda/meta.yaml | 2 +- pyproject.toml | 2 +- sdmetrics/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 8b65e9e1..96ab3875 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,4 +1,4 @@ -{% set version = '0.16.1.dev0' %} +{% set version = '0.17.0.dev0' %} package: name: "{{ name|lower }}" diff --git a/pyproject.toml b/pyproject.toml index 5a5bdeb5..8469152f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,7 +134,7 @@ convention = 'google' add-ignore = ['D107', 'D407', 'D417'] [tool.bumpversion] 
-current_version = "0.16.1.dev0" +current_version = "0.17.0.dev0" parse = '(?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py index 8f1972a6..732410e2 100644 --- a/sdmetrics/__init__.py +++ b/sdmetrics/__init__.py @@ -4,7 +4,7 @@ __author__ = 'MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' -__version__ = '0.16.1.dev0' +__version__ = '0.17.0.dev0' import sys import warnings as python_warnings From 3c0ad516e5711b5d50ccbc20db6918700343334d Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 14 Nov 2024 17:02:18 -0600 Subject: [PATCH 10/11] =?UTF-8?q?Bump=20version:=200.17.0.dev0=20=E2=86=92?= =?UTF-8?q?=200.17.0.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conda/meta.yaml | 2 +- pyproject.toml | 2 +- sdmetrics/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 96ab3875..45c4528d 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,4 +1,4 @@ -{% set version = '0.17.0.dev0' %} +{% set version = '0.17.0.dev1' %} package: name: "{{ name|lower }}" diff --git a/pyproject.toml b/pyproject.toml index 8469152f..306c5c97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,7 +134,7 @@ convention = 'google' add-ignore = ['D107', 'D407', 'D417'] [tool.bumpversion] -current_version = "0.17.0.dev0" +current_version = "0.17.0.dev1" parse = '(?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py index 732410e2..76158f47 100644 --- a/sdmetrics/__init__.py +++ b/sdmetrics/__init__.py @@ -4,7 +4,7 @@ __author__ = 'MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' -__version__ = '0.17.0.dev0' +__version__ = '0.17.0.dev1' import sys import warnings as python_warnings From c829829b872ce50f96b9f2012d05b05303161e5f Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 14 Nov 2024 20:30:00 -0600 Subject: [PATCH 11/11] 0.17.0 release notes (#663) --- HISTORY.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 7132cf4e..e841f6fc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,21 @@ # History +## v0.17.0 - 2024-11-14 + +This release adds a number of Multi-Sequence Aggregate Similarity (MSAS) metrics! + +### Bugs Fixed + +* Relocate timeseries metrics modules - Issue [#661](https://github.com/sdv-dev/SDMetrics/issues/661) by @fealho +* Fix `SequenceLengthSimilarity` docstrings - Issue [#660](https://github.com/sdv-dev/SDMetrics/issues/660) by @fealho +* When running Quality Report, ContingencySimilarity produces a RuntimeWarning (`The values in the array are unorderable.`) - Issue [#656](https://github.com/sdv-dev/SDMetrics/issues/656) by @R-Palazzo + +### New Features + +* Add metric for inter-row MSAS - Issue [#640](https://github.com/sdv-dev/SDMetrics/issues/640) by @fealho +* Add metric for general MSAS statistics - Issue [#639](https://github.com/sdv-dev/SDMetrics/issues/639) by @fealho +* Add metric for sequence length similarity - Issue [#638](https://github.com/sdv-dev/SDMetrics/issues/638) by @fealho + ## v0.16.0 - 2024-09-25 This release improves the performance of the `contingency_similarity` metric. It also factors dtypes into the score of the `TableStructure` metric.