Skip to content

Commit

Permalink
add unit test and fix unique col scaling
Browse files Browse the repository at this point in the history
  • Loading branch information
jitingxu1 committed Sep 13, 2024
1 parent 6582682 commit d6446c7
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 2 deletions.
16 changes: 14 additions & 2 deletions ibis_ml/steps/_standardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,14 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
self._fit_expr = [expr]
results = expr.execute().to_dict("records")[0]
for name in columns:
stats[name] = (results[f"{name}_max"], results[f"{name}_min"])
col_max = results[f"{name}_max"]
col_min = results[f"{name}_min"]
if col_max == col_min:
raise ValueError(
f"Cannot standardize {name!r} - "
"the maximum and minimum values are equal"
)
stats[name] = (col_max, col_min)

self.stats_ = stats

Expand Down Expand Up @@ -121,7 +128,12 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
self._fit_expr = [table.aggregate(aggs)]
results = self._fit_expr[-1].execute().to_dict("records")[0]
for name in columns:
stats[name] = (results[f"{name}_mean"], results[f"{name}_std"])
col_std = results[f"{name}_std"]
if col_std == 0:
raise ValueError(
f"Cannot standardize {name!r} - the standard deviation is zero"
)
stats[name] = (results[f"{name}_mean"], col_std)

self.stats_ = stats

Expand Down
49 changes: 49 additions & 0 deletions tests/test_standardize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import ibis
import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest

import ibis_ml as ml


def test_scalestandard():
    """Check ScaleStandard output against a NumPy (x - mean) / std reference."""
    values = np.arange(100)
    table = ibis.memtable({"col": values})

    scaler = ml.ScaleStandard("col")
    scaler.fit_table(table, ml.core.Metadata())
    actual = scaler.transform_table(table).execute()

    # ndarray.std() uses ddof=0 by default, i.e. the population standard
    # deviation; the comparison below is tolerance-based, not exact.
    centre = values.mean()
    spread = values.std()
    expected = pd.DataFrame({"col": (values - centre) / spread})
    tm.assert_frame_equal(actual, expected, check_exact=False)


def test_scaleminmax():
    """Check ScaleMinMax output against a NumPy (x - min) / (max - min) reference."""
    values = np.arange(100)
    table = ibis.memtable({"col": values})

    scaler = ml.ScaleMinMax("col")
    scaler.fit_table(table, ml.core.Metadata())
    actual = scaler.transform_table(table).execute()

    # Build the reference with plain NumPy arithmetic; the comparison below
    # is tolerance-based, not exact.
    low = values.min()
    high = values.max()
    expected = pd.DataFrame({"col": (values - low) / (high - low)})
    tm.assert_frame_equal(actual, expected, check_exact=False)


@pytest.mark.parametrize(
    ("model", "msg"),
    [
        (
            "ScaleStandard",
            "Cannot standardize 'col' - the standard deviation is zero",
        ),
        (
            "ScaleMinMax",
            "Cannot standardize 'col' - the maximum and minimum values are equal",
        ),
    ],
)
def test_scale_unique_col(model, msg):
    """Fitting either scaler on a one-row column must raise ValueError.

    ``model`` is the name of the step class looked up on ``ibis_ml``; ``msg``
    is the text the raised error is matched against.
    """
    single_row = ibis.memtable({"col": [1]})
    scaler = getattr(ml, model)("col")
    metadata = ml.core.Metadata()
    with pytest.raises(ValueError, match=msg):
        scaler.fit_table(single_row, metadata)

0 comments on commit d6446c7

Please sign in to comment.