Add unit tests and fix scaling of single-valued (constant) columns

ibis-project · Sep 13, 2024 · d6446c7 · d6446c7
1 parent 6582682
commit d6446c7
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 2 deletions.
diff --git a/ibis_ml/steps/_standardize.py b/ibis_ml/steps/_standardize.py
@@ -61,7 +61,14 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             self._fit_expr = [expr]
             results = expr.execute().to_dict("records")[0]
             for name in columns:
-                stats[name] = (results[f"{name}_max"], results[f"{name}_min"])
+                col_max = results[f"{name}_max"]
+                col_min = results[f"{name}_min"]
+                if col_max == col_min:
+                    raise ValueError(
+                        f"Cannot standardize {name!r} - "
+                        "the maximum and minimum values are equal"
+                    )
+                stats[name] = (col_max, col_min)
 
         self.stats_ = stats
 
@@ -121,7 +128,12 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             self._fit_expr = [table.aggregate(aggs)]
             results = self._fit_expr[-1].execute().to_dict("records")[0]
             for name in columns:
-                stats[name] = (results[f"{name}_mean"], results[f"{name}_std"])
+                col_std = results[f"{name}_std"]
+                if col_std == 0:
+                    raise ValueError(
+                        f"Cannot standardize {name!r} - the standard deviation is zero"
+                    )
+                stats[name] = (results[f"{name}_mean"], col_std)
 
         self.stats_ = stats
 

diff --git a/tests/test_standardize.py b/tests/test_standardize.py
@@ -0,0 +1,49 @@
+import ibis
+import numpy as np
+import pandas as pd
+import pandas.testing as tm
+import pytest
+
+import ibis_ml as ml
+
+
+def test_scalestandard():
+    """ScaleStandard output must equal (x - mean) / std computed with NumPy."""
+    values = np.arange(0, 100)
+    mu, sigma = np.mean(values), np.std(values)  # np.std defaults to ddof=0
+    table = ibis.memtable({"col": values})
+    scaler = ml.ScaleStandard("col")
+    scaler.fit_table(table, ml.core.Metadata())
+    expected = pd.DataFrame({"col": (values - mu) / sigma})
+    actual = scaler.transform_table(table).execute()
+    tm.assert_frame_equal(actual, expected, check_exact=False)
+
+
+def test_scaleminmax():
+    """ScaleMinMax output must equal (x - min) / (max - min) computed with NumPy."""
+    values = np.arange(0, 100)
+    lo, hi = np.min(values), np.max(values)
+    table = ibis.memtable({"col": values})
+    scaler = ml.ScaleMinMax("col")
+    scaler.fit_table(table, ml.core.Metadata())
+    expected = pd.DataFrame({"col": (values - lo) / (hi - lo)})
+    actual = scaler.transform_table(table).execute()
+    tm.assert_frame_equal(actual, expected, check_exact=False)
+
+
+@pytest.mark.parametrize(
+    ("model", "msg"),
+    [
+        ("ScaleStandard", "Cannot standardize 'col' - the standard deviation is zero"),
+        (
+            "ScaleMinMax",
+            "Cannot standardize 'col' - the maximum and minimum values are equal",
+        ),
+    ],
+)
+def test_scale_unique_col(model, msg):
+    """Fitting each scaler on a one-row column must raise ValueError with `msg`."""
+    one_row_table = ibis.memtable({"col": [1]})
+    scaler = getattr(ml, model)("col")
+    with pytest.raises(ValueError, match=msg):
+        scaler.fit_table(one_row_table, ml.core.Metadata())