From e171245c74cc87a3777a20b86f7b06ceef302069 Mon Sep 17 00:00:00 2001
From: Andrew Montanez
Date: Wed, 25 Sep 2024 14:45:16 -0400
Subject: [PATCH 01/11] =?UTF-8?q?Bump=20version:=200.16.0=20=E2=86=92=200.?=
 =?UTF-8?q?16.1.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 conda/meta.yaml       | 2 +-
 pyproject.toml        | 2 +-
 sdmetrics/__init__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index 0611c98b..8b65e9e1 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = '0.16.0' %}
+{% set version = '0.16.1.dev0' %}
 
 package:
   name: "{{ name|lower }}"
diff --git a/pyproject.toml b/pyproject.toml
index 7962b60c..23b42eb6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -134,7 +134,7 @@ convention = 'google'
 add-ignore = ['D107', 'D407', 'D417']
 
 [tool.bumpversion]
-current_version = "0.16.0"
+current_version = "0.16.1.dev0"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',
diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py
index 92fe24e4..8f1972a6 100644
--- a/sdmetrics/__init__.py
+++ b/sdmetrics/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.16.0'
+__version__ = '0.16.1.dev0'
 
 import sys
 import warnings as python_warnings

From b0d4fb9ae44ceaff7545352a9cec7157a9d210eb Mon Sep 17 00:00:00 2001
From: SDV Team <98988753+sdv-team@users.noreply.github.com>
Date: Wed, 25 Sep 2024 17:17:26 -0400
Subject: [PATCH 02/11] Latest Code Analysis (#636)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 static_code_analysis.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/static_code_analysis.txt b/static_code_analysis.txt
index b9dd8d90..702d9ce5 100644
--- a/static_code_analysis.txt
+++ b/static_code_analysis.txt
@@ -1,4 +1,4 @@
-Run started:2024-08-13 22:18:16.859345
+Run started:2024-09-25 19:32:12.685068
 
 Test results:
 >> Issue: [B101:assert_used] Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
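
For context on the B101 findings listed in this file: `assert` statements are
stripped when CPython byte-compiles with optimizations enabled, which is why
bandit flags them as unsuitable for runtime validation. A minimal sketch
(hypothetical code, not from this repository) of the failure mode:

    def hamming_distance(target, test):
        # This check disappears under `python -O` (when __debug__ is False),
        # so it cannot be relied on to validate inputs in production.
        assert len(target) == len(test), 'Tuples must have the same length!'
        return sum(a != b for a, b in zip(target, test))

    print(hamming_distance('abc', 'abd'))  # 1 -- run with -O and the guard is gone
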
@@ -117,7 +117,7 @@ Test results: -------------------------------------------------- Code scanned: - Total lines of code: 9249 + Total lines of code: 9225 Total lines skipped (#nosec): 0 Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0 From f58f4437663fd721f91947ac2a46005ef7f90494 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Mon, 28 Oct 2024 10:07:26 -0400 Subject: [PATCH 03/11] Fix lint + Pin ruff version (#646) --- pyproject.toml | 11 ++++++----- sdmetrics/base.py | 3 +-- sdmetrics/reports/base_report.py | 5 +++-- sdmetrics/reports/single_table/plot_utils.py | 2 +- sdmetrics/reports/utils.py | 4 +++- sdmetrics/single_table/privacy/base.py | 2 +- sdmetrics/single_table/privacy/util.py | 2 +- sdmetrics/visualization.py | 7 ++++--- .../statistical/test_cardinality_shape_similarity.py | 3 +-- .../reports/multi_table/_properties/test_structure.py | 2 +- .../single_table/_properties/test_column_shapes.py | 3 +-- .../single_table/_properties/test_structure.py | 6 +++--- .../single_table/_properties/test_synthesis.py | 8 ++------ .../statistical/test_category_adherence.py | 3 +-- 14 files changed, 29 insertions(+), 32 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 23b42eb6..d64d78f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dev = [ 'watchdog>=1.0.1,<5', # style check - 'ruff>=0.3.2,<1', + 'ruff>=0.3.2,<0.7.2', # distribute on PyPI 'twine>=1.10.0,<6', @@ -186,7 +186,7 @@ exclude = [ ".git", "__pycache__", ".ipynb_checkpoints", - ".ipynb", + "*.ipynb", "tasks.py", ] @@ -204,10 +204,11 @@ select = [ # print statements "T201", # pandas-vet - "PD" + "PD", + # numpy 2.0 + "NPY201" ] ignore = [ - "E501", # pydocstyle "D107", # Missing docstring in __init__ "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 @@ -229,7 +230,7 @@ lines-between-types = 0 [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] "errors.py" = ["D105"] -"tests/**.py" = ["D", "W505"] +"tests/**.py" = ["D"] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/sdmetrics/base.py b/sdmetrics/base.py index c8ac9033..650a76c0 100644 --- a/sdmetrics/base.py +++ b/sdmetrics/base.py @@ -99,8 +99,7 @@ def normalize(cls, raw_score): if score is None or score < 0 or score > 1: raise AssertionError( - f'This should be unreachable. The score {score} should be' - f'a value between 0 and 1.' + f'This should be unreachable. The score {score} should bea value between 0 and 1.' ) if cls.goal == Goal.MINIMIZE: diff --git a/sdmetrics/reports/base_report.py b/sdmetrics/reports/base_report.py index a179baed..614b8083 100644 --- a/sdmetrics/reports/base_report.py +++ b/sdmetrics/reports/base_report.py @@ -50,7 +50,7 @@ def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata): error_message = ( 'The metadata does not match the data. The following columns are missing' ' in the real/synthetic data or in the metadata: ' - f"{', '.join(sorted(missing_columns))}" + f'{", ".join(sorted(missing_columns))}' ) raise ValueError(error_message) @@ -145,7 +145,8 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True): if not isinstance(metadata, dict): raise TypeError( f"Expected a dictionary but received a '{type(metadata).__name__}' instead." - " For SDV metadata objects, please use the 'to_dict' function to convert it to a dictionary." 
+ " For SDV metadata objects, please use the 'to_dict' function to convert it" + ' to a dictionary.' ) self._validate(real_data, synthetic_data, metadata) diff --git a/sdmetrics/reports/single_table/plot_utils.py b/sdmetrics/reports/single_table/plot_utils.py index 202cf060..995ccb85 100644 --- a/sdmetrics/reports/single_table/plot_utils.py +++ b/sdmetrics/reports/single_table/plot_utils.py @@ -313,7 +313,7 @@ def get_column_pairs_plot(score_breakdowns, average_score=None): xaxis='x', yaxis='y', hovertemplate=( - 'Column Pair
(%{x},%{y})

Similarity: ' '%{z}' + 'Column Pair
(%{x},%{y})

Similarity: %{z}' ), ), 1, diff --git a/sdmetrics/reports/utils.py b/sdmetrics/reports/utils.py index 66d80239..c019e0ab 100644 --- a/sdmetrics/reports/utils.py +++ b/sdmetrics/reports/utils.py @@ -222,7 +222,9 @@ def _validate_categorical_values(real_data, synthetic_data, metadata, table=None The name of the current table, if one exists """ if table: - warning_format = 'Unexpected values ({values}) in column "{column}" ' f'and table "{table}"' + warning_format = ( + f'Unexpected values ({{values}}) in column "{{column}}" and table "{table}"' + ) else: warning_format = 'Unexpected values ({values}) in column "{column}"' diff --git a/sdmetrics/single_table/privacy/base.py b/sdmetrics/single_table/privacy/base.py index e67327a9..08a8d600 100644 --- a/sdmetrics/single_table/privacy/base.py +++ b/sdmetrics/single_table/privacy/base.py @@ -366,5 +366,5 @@ def score(self, key_data, sensitive_data): The sensitive data. """ raise NotImplementedError( - 'Posterior probability based scoring not supported' 'for this attacker!' + 'Posterior probability based scoring not supportedfor this attacker!' ) diff --git a/sdmetrics/single_table/privacy/util.py b/sdmetrics/single_table/privacy/util.py index 9873c42b..918bd969 100644 --- a/sdmetrics/single_table/privacy/util.py +++ b/sdmetrics/single_table/privacy/util.py @@ -70,7 +70,7 @@ def hamming_distance(target, test): """ dist = 0 assert len(target) == len(test), ( - 'Tuples must have the same length in the' 'calculation of hamming distance!' + 'Tuples must have the same length in thecalculation of hamming distance!' ) for target_entry, test_entry in zip(target, test): diff --git a/sdmetrics/visualization.py b/sdmetrics/visualization.py index 52377991..24427b5a 100644 --- a/sdmetrics/visualization.py +++ b/sdmetrics/visualization.py @@ -32,7 +32,7 @@ def wrapper(*args, **kwargs): ipython_interpreter = str(get_ipython()) if 'ZMQInteractiveShell' in ipython_interpreter and 'iframe' in renderers: # This means we are using jupyter notebook - pio.renderers.default = 'iframe' + pio.renderers.default = 'vscode' except Exception: pass @@ -554,8 +554,9 @@ def get_column_plot(real_data, synthetic_data, column_name, plot_type=None): plot_type = 'bar' elif plot_type == 'distplot' and column_is_constant: raise ValueError( - f"Plot type 'distplot' cannot be created because column '{column_name}' has a constant value inside" - " the real or synthetic data. To render a visualization, please update the plot_type to 'bar'." + f"Plot type 'distplot' cannot be created because column '{column_name}'" + ' has a constant value inside the real or synthetic data. To render a' + " visualization, please update the plot_type to 'bar'." ) fig = _generate_column_plot(real_column, synthetic_column, plot_type) diff --git a/tests/unit/multi_table/statistical/test_cardinality_shape_similarity.py b/tests/unit/multi_table/statistical/test_cardinality_shape_similarity.py index 9fc41112..9234577f 100644 --- a/tests/unit/multi_table/statistical/test_cardinality_shape_similarity.py +++ b/tests/unit/multi_table/statistical/test_cardinality_shape_similarity.py @@ -197,8 +197,7 @@ def test_compute_breakdown_no_relationships(self): assert result == expected_metric_breakdown @patch( - 'sdmetrics.multi_table.statistical.cardinality_shape_similarity.MultiTableMetric.' - 'normalize' + 'sdmetrics.multi_table.statistical.cardinality_shape_similarity.MultiTableMetric.normalize' ) def test_normalize(self, normalize_mock): """Test the ``normalize`` method. 
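
Many of the string changes in PATCH 03 above (`base.py`, `privacy/base.py`,
`privacy/util.py`) deal with Python's implicit string concatenation: adjacent
literals are joined with no separator, so a missing trailing space vanishes
silently, and the automated fix preserves the joined text exactly as written.
A minimal illustration of the pitfall (hypothetical snippet, mirroring the
`privacy/base.py` hunk above):

    # Two adjacent literals concatenate with nothing inserted between them ...
    message = 'Posterior probability based scoring not supported' 'for this attacker!'
    print(message)  # -> '... not supportedfor this attacker!'

    # ... so a joined single literal must spell out the separating space itself.
    fixed = 'Posterior probability based scoring not supported for this attacker!'
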
diff --git a/tests/unit/reports/multi_table/_properties/test_structure.py b/tests/unit/reports/multi_table/_properties/test_structure.py index d8b1dc2e..81a14635 100644 --- a/tests/unit/reports/multi_table/_properties/test_structure.py +++ b/tests/unit/reports/multi_table/_properties/test_structure.py @@ -52,7 +52,7 @@ def test_get_visualization(mock_px): 'x': 'Table', 'y': 'Score', 'title': ( - 'Data Diagnostic: Structure (Average ' f'Score={mock__compute_average.return_value})' + f'Data Diagnostic: Structure (Average Score={mock__compute_average.return_value})' ), 'category_orders': {'group': mock_df['Table'].tolist()}, 'color': 'Metric', diff --git a/tests/unit/reports/single_table/_properties/test_column_shapes.py b/tests/unit/reports/single_table/_properties/test_column_shapes.py index 2aea852a..f9c688cf 100644 --- a/tests/unit/reports/single_table/_properties/test_column_shapes.py +++ b/tests/unit/reports/single_table/_properties/test_column_shapes.py @@ -147,8 +147,7 @@ def test_get_visualization(self, mock_px): 'x': 'Column', 'y': 'Score', 'title': ( - 'Data Quality: Column Shapes (Average ' - f'Score={mock__compute_average.return_value})' + f'Data Quality: Column Shapes (Average Score={mock__compute_average.return_value})' ), 'category_orders': {'group': mock_df['Column'].tolist()}, 'color': 'Metric', diff --git a/tests/unit/reports/single_table/_properties/test_structure.py b/tests/unit/reports/single_table/_properties/test_structure.py index cd82a7ed..32869132 100644 --- a/tests/unit/reports/single_table/_properties/test_structure.py +++ b/tests/unit/reports/single_table/_properties/test_structure.py @@ -9,7 +9,7 @@ class TestStructure: - @patch('sdmetrics.reports.single_table._properties.structure.' 'TableStructure.compute') + @patch('sdmetrics.reports.single_table._properties.structure.TableStructure.compute') def test__generate_details(self, table_format_mock): """Test the ``_generate_details`` method.""" # Setup @@ -49,7 +49,7 @@ def test__generate_details(self, table_format_mock): ) pd.testing.assert_frame_equal(result, expected_details) - @patch('sdmetrics.reports.single_table._properties.structure.' 'TableStructure.compute') + @patch('sdmetrics.reports.single_table._properties.structure.TableStructure.compute') def test__generate_details_with_id_column(self, table_format_mock): """Test the ``_generate_details`` method.""" # Setup @@ -96,7 +96,7 @@ def test_get_visualization(self): # Run and Assert expected_message = ( - 'The single table Structure property does not have a' ' supported visualization.' + 'The single table Structure property does not have a supported visualization.' ) with pytest.raises(VisualizationUnavailableError, match=expected_message): structure_property.get_visualization() diff --git a/tests/unit/reports/single_table/_properties/test_synthesis.py b/tests/unit/reports/single_table/_properties/test_synthesis.py index 7b8da813..68a4eb7c 100644 --- a/tests/unit/reports/single_table/_properties/test_synthesis.py +++ b/tests/unit/reports/single_table/_properties/test_synthesis.py @@ -7,9 +7,7 @@ class TestSynthesis: - @patch( - 'sdmetrics.reports.single_table._properties.synthesis.' 'NewRowSynthesis.compute_breakdown' - ) + @patch('sdmetrics.reports.single_table._properties.synthesis.NewRowSynthesis.compute_breakdown') def test__generate_details(self, newrowsynthesis_mock): """Test the ``_generate_details`` method. 
@@ -64,9 +62,7 @@ def test__generate_details(self, newrowsynthesis_mock): pd.testing.assert_frame_equal(details, expected__details) - @patch( - 'sdmetrics.reports.single_table._properties.synthesis.' 'NewRowSynthesis.compute_breakdown' - ) + @patch('sdmetrics.reports.single_table._properties.synthesis.NewRowSynthesis.compute_breakdown') def test__generate_details_error(self, newrowsynthesis_mock): """Test the ``_generate_details`` method when the metric raises an error.""" # Setup diff --git a/tests/unit/single_column/statistical/test_category_adherence.py b/tests/unit/single_column/statistical/test_category_adherence.py index f4af0bd8..c77f0820 100644 --- a/tests/unit/single_column/statistical/test_category_adherence.py +++ b/tests/unit/single_column/statistical/test_category_adherence.py @@ -36,8 +36,7 @@ def test_compute_breakdown_with_nans(self): assert result == {'score': 0.9} @patch( - 'sdmetrics.single_column.statistical.category_adherence.' - 'CategoryAdherence.compute_breakdown' + 'sdmetrics.single_column.statistical.category_adherence.CategoryAdherence.compute_breakdown' ) def test_compute(self, compute_breakdown_mock): """Test the ``compute`` method.""" From 838c2001989a733602a4a41286691030c679bd6c Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:04:39 -0400 Subject: [PATCH 04/11] Run fix-lint during latest dependency check workflow (#650) --- .github/workflows/dependency_checker.yml | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dependency_checker.yml b/.github/workflows/dependency_checker.yml index 7900502e..589ce20d 100644 --- a/.github/workflows/dependency_checker.yml +++ b/.github/workflows/dependency_checker.yml @@ -16,6 +16,7 @@ jobs: run: | python -m pip install .[dev] make check-deps OUTPUT_FILEPATH=latest_requirements.txt + make fix-lint - name: Create pull request id: cpr uses: peter-evans/create-pull-request@v4 diff --git a/pyproject.toml b/pyproject.toml index d64d78f7..5a5bdeb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dev = [ 'watchdog>=1.0.1,<5', # style check - 'ruff>=0.3.2,<0.7.2', + 'ruff>=0.3.2,<1', # distribute on PyPI 'twine>=1.10.0,<6', From 27c7e1d6d10fef246c1f4080789f69cbbedf0ef4 Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:46:11 -0500 Subject: [PATCH 05/11] Automated Latest Dependency Updates (#651) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index 4bb9945c..691f176d 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -4,4 +4,4 @@ pandas==2.2.3 plotly==5.24.1 scikit-learn==1.5.2 scipy==1.13.1 -tqdm==4.66.5 +tqdm==4.66.6 From 369613ba3c83d1afc275438debc5cc88c0eb7e3c Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Tue, 12 Nov 2024 09:10:12 -0500 Subject: [PATCH 06/11] Automated Latest Dependency Updates (#655) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index 691f176d..aed0be91 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -4,4 +4,4 @@ pandas==2.2.3 plotly==5.24.1 scikit-learn==1.5.2 
scipy==1.13.1 -tqdm==4.66.6 +tqdm==4.67.0 From 838e81db5556630d82195d93c2d16a9bfa94c7a8 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:34:43 -0500 Subject: [PATCH 07/11] When running Quality Report, ContingencySimilarity produces a RuntimeWarning (`The values in the array are unorderable.`) (#657) --- .../statistical/contingency_similarity.py | 2 +- .../reports/multi_table/test_quality_report.py | 2 +- .../statistical/test_contingency_similarity.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sdmetrics/column_pairs/statistical/contingency_similarity.py b/sdmetrics/column_pairs/statistical/contingency_similarity.py index 5d2c801d..e41075e0 100644 --- a/sdmetrics/column_pairs/statistical/contingency_similarity.py +++ b/sdmetrics/column_pairs/statistical/contingency_similarity.py @@ -44,7 +44,7 @@ def compute(cls, real_data, synthetic_data): contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len( synthetic ) - combined_index = contingency_real.index.union(contingency_synthetic.index) + combined_index = contingency_real.index.union(contingency_synthetic.index, sort=False) contingency_synthetic = contingency_synthetic.reindex(combined_index, fill_value=0) contingency_real = contingency_real.reindex(combined_index, fill_value=0) diff = abs(contingency_real - contingency_synthetic).fillna(0) diff --git a/tests/integration/reports/multi_table/test_quality_report.py b/tests/integration/reports/multi_table/test_quality_report.py index 9c3b79b1..fb571e81 100644 --- a/tests/integration/reports/multi_table/test_quality_report.py +++ b/tests/integration/reports/multi_table/test_quality_report.py @@ -342,7 +342,7 @@ def test_quality_report_with_errors(): None, ], }) - assert score == 0.7249603174603174 + assert score == 0.7249603174603175 pd.testing.assert_frame_equal(properties, expected_properties) pd.testing.assert_frame_equal(details_column_shapes, expected_details) diff --git a/tests/unit/column_pairs/statistical/test_contingency_similarity.py b/tests/unit/column_pairs/statistical/test_contingency_similarity.py index 713aab7d..dc7bafdd 100644 --- a/tests/unit/column_pairs/statistical/test_contingency_similarity.py +++ b/tests/unit/column_pairs/statistical/test_contingency_similarity.py @@ -1,6 +1,7 @@ from unittest.mock import patch import pandas as pd +import pytest from sdmetrics.column_pairs.statistical import ContingencySimilarity @@ -53,3 +54,15 @@ def test_normalize(self, normalize_mock): # Assert normalize_mock.assert_called_once_with(raw_score) assert result == normalize_mock.return_value + + @pytest.mark.filterwarnings('error:.*The values in the array are unorderable.*:RuntimeWarning') + def test_no_runtime_warning_raised(self): + """Test that no RuntimeWarning warning is raised when the metric is computed.""" + # Setup + real_data = pd.DataFrame(data={'A': ['value'] * 4, 'B': ['1', '2', '3', pd.NA]}) + synthetic_data = pd.DataFrame(data={'A': ['value'] * 3, 'B': ['1', '2', pd.NA]}) + + # Run and Assert + ContingencySimilarity.compute( + real_data=real_data[['A', 'B']], synthetic_data=synthetic_data[['A', 'B']] + ) From d5ccb7536e83cc6e5b459e23c938e7b60276bfb8 Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Thu, 14 Nov 2024 14:59:09 -0800 Subject: [PATCH 08/11] Add `InterRowMSAS`, `StatisticMSAS` and `SequenceLengthSimilarity` metrics (#662) --- sdmetrics/column_pairs/__init__.py | 4 + .../column_pairs/statistical/__init__.py | 4 + 
.../statistical/inter_row_msas.py | 106 +++++++++++ .../statistical/statistic_msas.py | 96 ++++++++++ sdmetrics/single_column/__init__.py | 2 + .../single_column/statistical/__init__.py | 2 + .../single_column/statistical/kscomplement.py | 9 +- .../statistical/sequence_length_similarity.py | 53 ++++++ .../statistical/test_inter_row_msas.py | 176 ++++++++++++++++++ .../statistical/test_statistic_msas.py | 125 +++++++++++++ .../test_sequence_length_similarity.py | 41 ++++ 11 files changed, 617 insertions(+), 1 deletion(-) create mode 100644 sdmetrics/column_pairs/statistical/inter_row_msas.py create mode 100644 sdmetrics/column_pairs/statistical/statistic_msas.py create mode 100644 sdmetrics/single_column/statistical/sequence_length_similarity.py create mode 100644 tests/unit/column_pairs/statistical/test_inter_row_msas.py create mode 100644 tests/unit/column_pairs/statistical/test_statistic_msas.py create mode 100644 tests/unit/single_column/statistical/test_sequence_length_similarity.py diff --git a/sdmetrics/column_pairs/__init__.py b/sdmetrics/column_pairs/__init__.py index e44e35de..38f1aebe 100644 --- a/sdmetrics/column_pairs/__init__.py +++ b/sdmetrics/column_pairs/__init__.py @@ -11,6 +11,8 @@ DiscreteKLDivergence, ) from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity +from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS +from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS __all__ = [ 'CardinalityBoundaryAdherence', @@ -20,4 +22,6 @@ 'CorrelationSimilarity', 'DiscreteKLDivergence', 'ReferentialIntegrity', + 'InterRowMSAS', + 'StatisticMSAS', ] diff --git a/sdmetrics/column_pairs/statistical/__init__.py b/sdmetrics/column_pairs/statistical/__init__.py index 7f921df6..7198944e 100644 --- a/sdmetrics/column_pairs/statistical/__init__.py +++ b/sdmetrics/column_pairs/statistical/__init__.py @@ -10,6 +10,8 @@ DiscreteKLDivergence, ) from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity +from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS +from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS __all__ = [ 'CardinalityBoundaryAdherence', @@ -18,4 +20,6 @@ 'CorrelationSimilarity', 'DiscreteKLDivergence', 'ReferentialIntegrity', + 'InterRowMSAS', + 'StatisticMSAS', ] diff --git a/sdmetrics/column_pairs/statistical/inter_row_msas.py b/sdmetrics/column_pairs/statistical/inter_row_msas.py new file mode 100644 index 00000000..eea77f06 --- /dev/null +++ b/sdmetrics/column_pairs/statistical/inter_row_msas.py @@ -0,0 +1,106 @@ +"""InterRowMSAS module.""" + +import warnings + +import numpy as np +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class InterRowMSAS: + """Inter-Row Multi-Sequence Aggregate Similarity (MSAS) metric. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'Inter-Row Multi-Sequence Aggregate Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False): + """Compute this metric. 
+ + This metric compares the inter-row differences of sequences in the real data + vs. the synthetic data. + + It works as follows: + - Calculate the difference between row r and row r+x for each row in the real data + - Take the average over each sequence to form a distribution D_r + - Do the same for the synthetic data to form a new distribution D_s + - Apply the KSComplement metric to compare the similarities of (D_r, D_s) + - Return this score + + Args: + real_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the real data and the second represents a continuous column of data. + synthetic_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the synthetic data and the second represents a continuous column of data. + n_rows_diff (int): + An integer representing the number of rows to consider when taking the difference. + apply_log (bool): + Whether to apply a natural log before taking the difference. + + Returns: + float: + The similarity score between the real and synthetic data distributions. + """ + for data in [real_data, synthetic_data]: + if ( + not isinstance(data, tuple) + or len(data) != 2 + or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series))) + ): + raise ValueError('The data must be a tuple of two pandas series.') + + if not isinstance(n_rows_diff, int) or n_rows_diff < 1: + raise ValueError("'n_rows_diff' must be an integer greater than zero.") + + if not isinstance(apply_log, bool): + raise ValueError("'apply_log' must be a boolean.") + + real_keys, real_values = real_data + synthetic_keys, synthetic_values = synthetic_data + + if apply_log: + real_values = np.log(real_values) + synthetic_values = np.log(synthetic_values) + + def calculate_differences(keys, values, n_rows_diff, data_name): + group_sizes = values.groupby(keys).size() + num_invalid_groups = group_sizes[group_sizes <= n_rows_diff].count() + if num_invalid_groups > 0: + warnings.warn( + f"n_rows_diff '{n_rows_diff}' is greater than the " + f'size of {num_invalid_groups} sequence keys in {data_name}.' + ) + + differences = values.groupby(keys).apply( + lambda group: np.mean( + group.to_numpy()[n_rows_diff:] - group.to_numpy()[:-n_rows_diff] + ) + if len(group) > n_rows_diff + else np.nan + ) + + return pd.Series(differences) + + real_diff = calculate_differences(real_keys, real_values, n_rows_diff, 'real_data') + synthetic_diff = calculate_differences( + synthetic_keys, synthetic_values, n_rows_diff, 'synthetic_data' + ) + + return KSComplement.compute(real_diff, synthetic_diff) diff --git a/sdmetrics/column_pairs/statistical/statistic_msas.py b/sdmetrics/column_pairs/statistical/statistic_msas.py new file mode 100644 index 00000000..8afab764 --- /dev/null +++ b/sdmetrics/column_pairs/statistical/statistic_msas.py @@ -0,0 +1,96 @@ +"""StatisticMSAS module.""" + +import numpy as np +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class StatisticMSAS: + """Statistic Multi-Sequence Aggregate Similarity (MSAS) metric. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. 
+ """ + + name = 'Statistic Multi-Sequence Aggregate Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data, synthetic_data, statistic='mean'): + """Compute this metric. + + This metric compares the distribution of a given statistic across sequences + in the real data vs. the synthetic data. + + It works as follows: + - Calculate the specified statistic for each sequence in the real data + - Form a distribution D_r from these statistics + - Do the same for the synthetic data to form a new distribution D_s + - Apply the KSComplement metric to compare the similarities of (D_r, D_s) + - Return this score + + Args: + real_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the real data and the second represents a continuous column of data. + synthetic_data (tuple[pd.Series, pd.Series]): + A tuple of 2 pandas.Series objects. The first represents the sequence key + of the synthetic data and the second represents a continuous column of data. + statistic (str): + A string representing the statistic function to use when computing MSAS. + + Available options are: + - 'mean': The arithmetic mean of the sequence + - 'median': The median value of the sequence + - 'std': The standard deviation of the sequence + - 'min': The minimum value in the sequence + - 'max': The maximum value in the sequence + + Returns: + float: + The similarity score between the real and synthetic data distributions. + """ + statistic_functions = { + 'mean': np.mean, + 'median': np.median, + 'std': np.std, + 'min': np.min, + 'max': np.max, + } + if statistic not in statistic_functions: + raise ValueError( + f'Invalid statistic: {statistic}.' + f' Choose from [{", ".join(statistic_functions.keys())}].' 
+ ) + + for data in [real_data, synthetic_data]: + if ( + not isinstance(data, tuple) + or len(data) != 2 + or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series))) + ): + raise ValueError('The data must be a tuple of two pandas series.') + + real_keys, real_values = real_data + synthetic_keys, synthetic_values = synthetic_data + stat_func = statistic_functions[statistic] + + def calculate_statistics(keys, values): + df = pd.DataFrame({'keys': keys, 'values': values}) + return df.groupby('keys')['values'].agg(stat_func) + + real_stats = calculate_statistics(real_keys, real_values) + synthetic_stats = calculate_statistics(synthetic_keys, synthetic_values) + + return KSComplement.compute(real_stats, synthetic_stats) diff --git a/sdmetrics/single_column/__init__.py b/sdmetrics/single_column/__init__.py index 563ea574..fdd9d9f1 100644 --- a/sdmetrics/single_column/__init__.py +++ b/sdmetrics/single_column/__init__.py @@ -12,6 +12,7 @@ from sdmetrics.single_column.statistical.range_coverage import RangeCoverage from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity from sdmetrics.single_column.statistical.tv_complement import TVComplement +from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity __all__ = [ 'base', @@ -26,4 +27,5 @@ 'RangeCoverage', 'StatisticSimilarity', 'TVComplement', + 'SequenceLengthSimilarity', ] diff --git a/sdmetrics/single_column/statistical/__init__.py b/sdmetrics/single_column/statistical/__init__.py index 252cd6ac..228a456b 100644 --- a/sdmetrics/single_column/statistical/__init__.py +++ b/sdmetrics/single_column/statistical/__init__.py @@ -10,6 +10,7 @@ from sdmetrics.single_column.statistical.range_coverage import RangeCoverage from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity from sdmetrics.single_column.statistical.tv_complement import TVComplement +from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity __all__ = [ 'BoundaryAdherence', @@ -22,4 +23,5 @@ 'RangeCoverage', 'StatisticSimilarity', 'TVComplement', + 'SequenceLengthSimilarity', ] diff --git a/sdmetrics/single_column/statistical/kscomplement.py b/sdmetrics/single_column/statistical/kscomplement.py index 3be01330..525e85c7 100644 --- a/sdmetrics/single_column/statistical/kscomplement.py +++ b/sdmetrics/single_column/statistical/kscomplement.py @@ -1,5 +1,6 @@ """Kolmogorov-Smirnov test based Metric.""" +import numpy as np import pandas as pd from scipy.stats import ks_2samp @@ -56,7 +57,13 @@ def compute(real_data, synthetic_data): real_data = pd.to_numeric(real_data) synthetic_data = pd.to_numeric(synthetic_data) - statistic, _ = ks_2samp(real_data, synthetic_data) + try: + statistic, _ = ks_2samp(real_data, synthetic_data) + except ValueError as e: + if str(e) == 'Data passed to ks_2samp must not be empty': + return np.nan + else: + raise ValueError(e) return 1 - statistic diff --git a/sdmetrics/single_column/statistical/sequence_length_similarity.py b/sdmetrics/single_column/statistical/sequence_length_similarity.py new file mode 100644 index 00000000..105f159b --- /dev/null +++ b/sdmetrics/single_column/statistical/sequence_length_similarity.py @@ -0,0 +1,53 @@ +"""SequenceLengthSimilarity module.""" + +import pandas as pd + +from sdmetrics.goal import Goal +from sdmetrics.single_column.statistical.kscomplement import KSComplement + + +class SequenceLengthSimilarity: + """Sequence Length Similarity metric. 
+ + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'Sequence Length Similarity' + goal = Goal.MAXIMIZE + min_value = 0.0 + max_value = 1.0 + + @staticmethod + def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float: + """Compute this metric. + + The length of a sequence is determined by the number of times the same sequence key occurs. + For example if id_09231 appeared 150 times in the sequence key, then the sequence is of + length 150. This metric compares the lengths of all sequence keys in the + real data vs. the synthetic data. + + It works as follows: + - Calculate the length of each sequence in the real data + - Calculate the length of each sequence in the synthetic data + - Apply the KSComplement metric to compare the similarities of the distributions + - Return this score + + Args: + real_data (pd.Series): + The values from the real dataset. + synthetic_data (pd.Series): + The values from the synthetic dataset. + + Returns: + float: + The score. + """ + return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts()) diff --git a/tests/unit/column_pairs/statistical/test_inter_row_msas.py b/tests/unit/column_pairs/statistical/test_inter_row_msas.py new file mode 100644 index 00000000..9a3552db --- /dev/null +++ b/tests/unit/column_pairs/statistical/test_inter_row_msas.py @@ -0,0 +1,176 @@ +import pandas as pd +import pytest + +from sdmetrics.column_pairs import InterRowMSAS + + +class TestInterRowMSAS: + def test_compute(self): + """Test it runs.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 10, 3, 7, 5, 1]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 0.5 + + def test_compute_identical_sequences(self): + """Test it returns 1 when real and synthetic data are identical.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4, 5, 6]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 1 + + def test_compute_different_sequences(self): + """Test it for distinct distributions.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 3, 5, 2, 4, 6]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), synthetic_data=(synthetic_keys, synthetic_values) + ) + + # Assert + assert score == 0 + + def test_compute_with_log(self): + """Test it with logarithmic transformation.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 4, 8, 16, 32]) + synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 
'id2']) + synthetic_values = pd.Series([1, 2, 4, 8, 16, 32]) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + apply_log=True, + ) + + # Assert + assert score == 1 + + def test_compute_different_n_rows_diff(self): + """Test it with different n_rows_diff.""" + # Setup + real_keys = pd.Series(['id1'] * 10 + ['id2'] * 10) + real_values = pd.Series(list(range(10)) + list(range(10))) + synthetic_keys = pd.Series(['id1'] * 10 + ['id2'] * 10) + synthetic_values = pd.Series(list(range(10)) + list(range(10))) + + # Run + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=3, + ) + + # Assert + assert score == 1 + + def test_compute_invalid_real_data(self): + """Test that it raises ValueError when real_data is invalid.""" + # Setup + real_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + InterRowMSAS.compute( + real_data=real_data, + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=1, + apply_log=False, + ) + + def test_compute_invalid_synthetic_data(self): + """Test that it raises ValueError when synthetic_data is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=synthetic_data, + n_rows_diff=1, + apply_log=False, + ) + + def test_compute_invalid_n_rows_diff(self): + """Test that it raises ValueError when n_rows_diff is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_keys = pd.Series(['id3', 'id3', 'id4', 'id4']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match="'n_rows_diff' must be an integer greater than zero."): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=0, + apply_log=False, + ) + + def test_compute_invalid_apply_log(self): + """Test that it raises ValueError when apply_log is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match="'apply_log' must be a boolean."): + InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=1, + apply_log='True', # Should be a boolean, not a string + ) + + def test_compute_warning(self): + """Test a warning is raised when n_rows_diff is greater than sequence values size.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 10, 3, 7, 5, 1]) + + # Run and Assert + warn_msg = "n_rows_diff '10' is greater than the size of 2 sequence keys in real_data." 
+ with pytest.warns(UserWarning, match=warn_msg): + score = InterRowMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + n_rows_diff=10, + ) + + # Assert + assert pd.isna(score) diff --git a/tests/unit/column_pairs/statistical/test_statistic_msas.py b/tests/unit/column_pairs/statistical/test_statistic_msas.py new file mode 100644 index 00000000..9e8813eb --- /dev/null +++ b/tests/unit/column_pairs/statistical/test_statistic_msas.py @@ -0,0 +1,125 @@ +import re + +import pandas as pd +import pytest + +from sdmetrics.column_pairs import StatisticMSAS + + +class TestStatisticMSAS: + def test_compute_identical_sequences(self): + """Test it returns 1 when real and synthetic data are identical.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([1, 2, 3, 4, 5, 6]) + + # Run and Assert + for statistic in ['mean', 'median', 'std', 'min', 'max']: + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic=statistic, + ) + assert score == 1 + + def test_compute_different_sequences(self): + """Test it for distinct distributions.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5, 6]) + synthetic_keys = pd.Series(['id3', 'id3', 'id3', 'id4', 'id4', 'id4']) + synthetic_values = pd.Series([10, 20, 30, 40, 50, 60]) + + # Run and Assert + for statistic in ['mean', 'median', 'std', 'min', 'max']: + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic=statistic, + ) + assert score == 0 + + def test_compute_with_single_sequence(self): + """Test it with a single sequence.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1']) + real_values = pd.Series([1, 2, 3]) + synthetic_keys = pd.Series(['id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3]) + + # Run + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='mean', + ) + + # Assert + assert score == 1 + + def test_compute_with_different_sequence_lengths(self): + """Test it with different sequence lengths.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4, 5]) + synthetic_keys = pd.Series(['id2', 'id2', 'id3', 'id4', 'id5']) + synthetic_values = pd.Series([1, 2, 3, 4, 5]) + + # Run + score = StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='mean', + ) + + # Assert + assert score == 0.75 + + def test_compute_with_invalid_statistic(self): + """Test it raises ValueError for invalid statistic.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id1']) + real_values = pd.Series([1, 2, 3]) + synthetic_keys = pd.Series(['id2', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3]) + + # Run and Assert + err_msg = re.escape( + 'Invalid statistic: invalid. Choose from [mean, median, std, min, max].' 
+ ) + with pytest.raises(ValueError, match=err_msg): + StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=(synthetic_keys, synthetic_values), + statistic='invalid', + ) + + def test_compute_invalid_real_data(self): + """Test that it raises ValueError when real_data is invalid.""" + # Setup + real_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + synthetic_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + synthetic_values = pd.Series([1, 2, 3, 4]) + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + StatisticMSAS.compute( + real_data=real_data, + synthetic_data=(synthetic_keys, synthetic_values), + ) + + def test_compute_invalid_synthetic_data(self): + """Test that it raises ValueError when synthetic_data is invalid.""" + # Setup + real_keys = pd.Series(['id1', 'id1', 'id2', 'id2']) + real_values = pd.Series([1, 2, 3, 4]) + synthetic_data = [[1, 2, 3], [4, 5, 6]] # Not a tuple of pandas Series + + # Run and Assert + with pytest.raises(ValueError, match='The data must be a tuple of two pandas series.'): + StatisticMSAS.compute( + real_data=(real_keys, real_values), + synthetic_data=synthetic_data, + ) diff --git a/tests/unit/single_column/statistical/test_sequence_length_similarity.py b/tests/unit/single_column/statistical/test_sequence_length_similarity.py new file mode 100644 index 00000000..4e27ab98 --- /dev/null +++ b/tests/unit/single_column/statistical/test_sequence_length_similarity.py @@ -0,0 +1,41 @@ +import pandas as pd + +from sdmetrics.single_column import SequenceLengthSimilarity + + +class TestSequenceLengthSimilarity: + def test_compute(self): + """Test it runs.""" + # Setup + real_data = pd.Series(['id1', 'id2', 'id2', 'id3']) + synthetic_data = pd.Series(['id4', 'id5', 'id6']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 0.6666666666666667 + + def test_compute_one(self): + """Test it returns 1 when real and synthetic data have the same distribution.""" + # Setup + real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3']) + synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6']) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 1 + + def test_compute_low_score(self): + """Test it for distinct distributions.""" + # Setup + real_data = pd.Series([f'id{i}' for i in range(100)]) + synthetic_data = pd.Series(['id100'] * 100) + + # Run + score = SequenceLengthSimilarity.compute(real_data, synthetic_data) + + # Assert + assert score == 0 From 0f8c8aff86fba653acc09092eb9496de5b7ff4b7 Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 14 Nov 2024 17:01:42 -0600 Subject: [PATCH 09/11] =?UTF-8?q?Bump=20version:=200.16.1.dev0=20=E2=86=92?= =?UTF-8?q?=200.17.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conda/meta.yaml | 2 +- pyproject.toml | 2 +- sdmetrics/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 8b65e9e1..96ab3875 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,4 +1,4 @@ -{% set version = '0.16.1.dev0' %} +{% set version = '0.17.0.dev0' %} package: name: "{{ name|lower }}" diff --git a/pyproject.toml b/pyproject.toml index 5a5bdeb5..8469152f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,7 +134,7 @@ convention = 'google' add-ignore = ['D107', 'D407', 'D417'] [tool.bumpversion] 
-current_version = "0.16.1.dev0" +current_version = "0.17.0.dev0" parse = '(?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py index 8f1972a6..732410e2 100644 --- a/sdmetrics/__init__.py +++ b/sdmetrics/__init__.py @@ -4,7 +4,7 @@ __author__ = 'MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' -__version__ = '0.16.1.dev0' +__version__ = '0.17.0.dev0' import sys import warnings as python_warnings From 3c0ad516e5711b5d50ccbc20db6918700343334d Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 14 Nov 2024 17:02:18 -0600 Subject: [PATCH 10/11] =?UTF-8?q?Bump=20version:=200.17.0.dev0=20=E2=86=92?= =?UTF-8?q?=200.17.0.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conda/meta.yaml | 2 +- pyproject.toml | 2 +- sdmetrics/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 96ab3875..45c4528d 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,4 +1,4 @@ -{% set version = '0.17.0.dev0' %} +{% set version = '0.17.0.dev1' %} package: name: "{{ name|lower }}" diff --git a/pyproject.toml b/pyproject.toml index 8469152f..306c5c97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,7 +134,7 @@ convention = 'google' add-ignore = ['D107', 'D407', 'D417'] [tool.bumpversion] -current_version = "0.17.0.dev0" +current_version = "0.17.0.dev1" parse = '(?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/sdmetrics/__init__.py b/sdmetrics/__init__.py index 732410e2..76158f47 100644 --- a/sdmetrics/__init__.py +++ b/sdmetrics/__init__.py @@ -4,7 +4,7 @@ __author__ = 'MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' -__version__ = '0.17.0.dev0' +__version__ = '0.17.0.dev1' import sys import warnings as python_warnings From c829829b872ce50f96b9f2012d05b05303161e5f Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 14 Nov 2024 20:30:00 -0600 Subject: [PATCH 11/11] 0.17.0 release notes (#663) --- HISTORY.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 7132cf4e..e841f6fc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,21 @@ # History +## v0.17.0 - 2024-11-14 + +This release adds a number of Multi-Sequence Aggregate Similarity (MSAS) metrics! + +### Bugs Fixed + +* Relocate timeseries metrics modules - Issue [#661](https://github.com/sdv-dev/SDMetrics/issues/661) by @fealho +* Fix `SequenceLengthSimilarity` docstrings - Issue [#660](https://github.com/sdv-dev/SDMetrics/issues/660) by @fealho +* When running Quality Report, ContingencySimilarity produces a RuntimeWarning (`The values in the array are unorderable.`) - Issue [#656](https://github.com/sdv-dev/SDMetrics/issues/656) by @R-Palazzo + +### New Features + +* Add metric for inter-row MSAS - Issue [#640](https://github.com/sdv-dev/SDMetrics/issues/640) by @fealho +* Add metric for general MSAS statistics - Issue [#639](https://github.com/sdv-dev/SDMetrics/issues/639) by @fealho +* Add metric for sequence length similarity - Issue [#638](https://github.com/sdv-dev/SDMetrics/issues/638) by @fealho + ## v0.16.0 - 2024-09-25 This release improves the performance of the `contingency_similarity` metric. It also factors dtypes into the score of the `TableStructure` metric.