Merge branch 'main' into issue-652-kscomplement

Author: sdv-dev · Nov 25, 2024
Commit e7d49ea (merge commit; 2 parents: 3700419 + d2cb918)


Showing 7 changed files with 105 additions and 46 deletions.
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -29,3 +29,12 @@ jobs:
           python -m pip install invoke .[test]
     - name: Run integration tests
       run: invoke integration
+
+    - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.12
+      name: Upload integration codecov report
+      uses: codecov/codecov-action@v4
+      with:
+        flags: integration
+        file: ${{ github.workspace }}/integration_cov.xml
+        fail_ci_if_error: true
+        token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml
@@ -29,6 +29,12 @@ jobs:
           python -m pip install invoke .[test]
     - name: Run unit tests
       run: invoke unit
-    - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.8
-      name: Upload codecov report
+
+    - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.12
+      name: Upload unit codecov report
       uses: codecov/codecov-action@v4
+      with:
+        flags: unit
+        file: ${{ github.workspace }}/unit_cov.xml
+        fail_ci_if_error: true
+        token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -46,6 +46,7 @@ coverage.xml
 *.cover
 .hypothesis/
 .pytest_cache/
+*_cov.xml
 
 # Translations
 *.mo
@@ -108,4 +109,3 @@ ENV/
 
 # OS Files
 .DS_Store
-
diff --git a/latest_requirements.txt b/latest_requirements.txt
@@ -4,4 +4,4 @@ pandas==2.2.3
 plotly==5.24.1
 scikit-learn==1.5.2
 scipy==1.13.1
-tqdm==4.67.0
+tqdm==4.67.1
diff --git a/sdmetrics/column_pairs/statistical/inter_row_msas.py b/sdmetrics/column_pairs/statistical/inter_row_msas.py
@@ -29,7 +29,61 @@ class InterRowMSAS:
     max_value = 1.0
 
     @staticmethod
-    def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False):
+    def _validate_inputs(real_data, synthetic_data, n_rows_diff, apply_log):
+        for data in [real_data, synthetic_data]:
+            if (
+                not isinstance(data, tuple)
+                or len(data) != 2
+                or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series)))
+            ):
+                raise ValueError('The data must be a tuple of two pandas series.')
+
+        if not isinstance(n_rows_diff, int) or n_rows_diff < 1:
+            raise ValueError("'n_rows_diff' must be an integer greater than zero.")
+
+        if not isinstance(apply_log, bool):
+            raise ValueError("'apply_log' must be a boolean.")
+
+    @staticmethod
+    def _apply_log(real_values, synthetic_values, apply_log):
+        if apply_log:
+            num_invalid = sum(x <= 0 for x in pd.concat((real_values, synthetic_values)))
+            if num_invalid:
+                warnings.warn(
+                    f'There are {num_invalid} non-positive values in your data, which cannot be '
+                    "used with log. Consider changing 'apply_log' to False for a better result."
+                )
+            with warnings.catch_warnings():
+                warnings.filterwarnings('ignore', message='.*encountered in log')
+                real_values = np.log(real_values)
+                synthetic_values = np.log(synthetic_values)
+
+        return real_values, synthetic_values
+
+    @staticmethod
+    def _calculate_differences(keys, values, n_rows_diff, data_name):
+        grouped = values.groupby(keys)
+        group_sizes = grouped.size()
+
+        num_invalid_groups = len(group_sizes[group_sizes <= n_rows_diff])
+        if num_invalid_groups > 0:
+            warnings.warn(
+                f"n_rows_diff '{n_rows_diff}' is greater than the "
+                f'size of {num_invalid_groups} sequence keys in {data_name}.'
+            )
+
+        def diff_func(group):
+            if len(group) <= n_rows_diff:
+                return np.nan
+            group = group.to_numpy()
+            return np.mean(group[n_rows_diff:] - group[:-n_rows_diff])
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore', message='invalid value encountered in.*')
+            return grouped.apply(diff_func)
+
+    @classmethod
+    def compute(cls, real_data, synthetic_data, n_rows_diff=1, apply_log=False):
         """Compute this metric.
 
         This metric compares the inter-row differences of sequences in the real data
@@ -58,48 +112,13 @@ def compute(real_data, synthetic_data, n_rows_diff=1, apply_log=False):
             float:
                 The similarity score between the real and synthetic data distributions.
         """
-        for data in [real_data, synthetic_data]:
-            if (
-                not isinstance(data, tuple)
-                or len(data) != 2
-                or (not (isinstance(data[0], pd.Series) and isinstance(data[1], pd.Series)))
-            ):
-                raise ValueError('The data must be a tuple of two pandas series.')
-
-        if not isinstance(n_rows_diff, int) or n_rows_diff < 1:
-            raise ValueError("'n_rows_diff' must be an integer greater than zero.")
-
-        if not isinstance(apply_log, bool):
-            raise ValueError("'apply_log' must be a boolean.")
-
+        cls._validate_inputs(real_data, synthetic_data, n_rows_diff, apply_log)
         real_keys, real_values = real_data
         synthetic_keys, synthetic_values = synthetic_data
+        real_values, synthetic_values = cls._apply_log(real_values, synthetic_values, apply_log)
 
-        if apply_log:
-            real_values = np.log(real_values)
-            synthetic_values = np.log(synthetic_values)
-
-        def calculate_differences(keys, values, n_rows_diff, data_name):
-            group_sizes = values.groupby(keys).size()
-            num_invalid_groups = group_sizes[group_sizes <= n_rows_diff].count()
-            if num_invalid_groups > 0:
-                warnings.warn(
-                    f"n_rows_diff '{n_rows_diff}' is greater than the "
-                    f'size of {num_invalid_groups} sequence keys in {data_name}.'
-                )
-
-            differences = values.groupby(keys).apply(
-                lambda group: np.mean(
-                    group.to_numpy()[n_rows_diff:] - group.to_numpy()[:-n_rows_diff]
-                )
-                if len(group) > n_rows_diff
-                else np.nan
-            )
-
-            return pd.Series(differences)
-
-        real_diff = calculate_differences(real_keys, real_values, n_rows_diff, 'real_data')
-        synthetic_diff = calculate_differences(
+        real_diff = cls._calculate_differences(real_keys, real_values, n_rows_diff, 'real_data')
+        synthetic_diff = cls._calculate_differences(
             synthetic_keys, synthetic_values, n_rows_diff, 'synthetic_data'
         )
 

diff --git a/tasks.py b/tasks.py
@@ -26,12 +26,12 @@ def check_dependencies(c):
 
 @task
 def unit(c):
-    c.run('python -m pytest ./tests/unit --cov=sdmetrics --cov-report=xml')
+    c.run('python -m pytest ./tests/unit --cov=sdmetrics --cov-report=xml:./unit_cov.xml')
 
 
 @task
 def integration(c):
-    c.run('python -m pytest ./tests/integration --reruns 5 --disable-warnings')
+    c.run('python -m pytest ./tests/integration --reruns 5 --disable-warnings --cov=sdmetrics --cov-report=xml:./integration_cov.xml')
 
 
 def _get_minimum_versions(dependencies, python_version):

diff --git a/tests/unit/column_pairs/statistical/test_inter_row_msas.py b/tests/unit/column_pairs/statistical/test_inter_row_msas.py
@@ -71,6 +71,31 @@ def test_compute_with_log(self):
         # Assert
         assert score == 1
 
+    def test_compute_with_log_warning(self):
+        """Test it warns when negative values are present and apply_log is True."""
+        # Setup
+        real_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2'])
+        real_values = pd.Series([1, 1.4, 4, -1, 16, -10])
+        synthetic_keys = pd.Series(['id1', 'id1', 'id1', 'id2', 'id2', 'id2'])
+        synthetic_values = pd.Series([1, 2, -4, 8, 16, 30])
+
+        # Run
+        with pytest.warns(UserWarning) as warning_info:
+            score = InterRowMSAS.compute(
+                real_data=(real_keys, real_values),
+                synthetic_data=(synthetic_keys, synthetic_values),
+                apply_log=True,
+            )
+
+        # Assert
+        expected_message = (
+            'There are 3 non-positive values in your data, which cannot be used with log. '
+            "Consider changing 'apply_log' to False for a better result."
+        )
+        assert len(warning_info) == 1
+        assert str(warning_info[0].message) == expected_message
+        assert score == 0
+
     def test_compute_different_n_rows_diff(self):
         """Test it with different n_rows_diff."""
         # Setup