diff --git a/blocks/base.py b/blocks/base.py index 7308580..39fddbb 100644 --- a/blocks/base.py +++ b/blocks/base.py @@ -117,4 +117,6 @@ def __post_init__(self): class BaseDataLoader: - pass + @abstractmethod + def get(self, label: str) -> pd.DataFrame: + pass diff --git a/blocks/meta.py b/blocks/meta.py index 951365b..bad2ff1 100644 --- a/blocks/meta.py +++ b/blocks/meta.py @@ -179,6 +179,8 @@ def __call__(self, X: pd.DataFrame, y=None) -> pd.DataFrame: class Factor(TransformerMixin, MetaEstimatorMixin, BaseEstimator): + """Work in progress""" + def __init__(self, template: BaseFactor): self.template = template diff --git a/pyproject.toml b/pyproject.toml index cd2eb72..0c1891d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-blocks" -version = "0.1.7" +version = "0.1.8" description = "Extra blocks for scikit-learn features." authors = ["ActurialCapital"] license = "BSD 3-Clause License" diff --git a/tests/test_base.py b/tests/test_base.py new file mode 100644 index 0000000..18a7eee --- /dev/null +++ b/tests/test_base.py @@ -0,0 +1,61 @@ +import pytest +from dataclasses import dataclass, field +import blocks as bk + + +def test_base_sampler(): + + class MySampler(bk.BaseSampler): + pass + + with pytest.raises(TypeError): + MySampler() + + +def test_base_transformer(): + + class MyTransformer(bk.BaseTransformer): + pass + + with pytest.raises(TypeError): + MyTransformer() + + +def test_base_transformer_check_kwargs(): + + class MyTransformer(bk.BaseTransformer): + TRANSFORMERS = {'func1': 'a', 'func2': 'b'} + + def __init__(self, select: str, **kwargs): + self.select = select + self.kwargs = kwargs + + def __call__(cls): + pass + + my_transformer = MyTransformer('func1') + assert my_transformer.check_kwargs("hello", "hello") == None + assert my_transformer.check_kwargs("func2", "b") == None + with pytest.raises(ValueError): + my_transformer.check_kwargs("func1", "a") + + my_transformer = MyTransformer('func1', a='a') + assert my_transformer.check_kwargs("hello", "hello") == None + assert my_transformer.check_kwargs("func2", "b") == None + assert my_transformer.check_kwargs("func1", "a") == None + + +def test_base_factor(): + + @dataclass + class MyFactor(bk.BaseFactor): + tags: list = field(default_factory=lambda: []) + name: str = "" + X: str = "" + y: str = None + market_feature: str = "" + inputs: dict = field(default_factory=lambda: {}) + outputs: dict = field(default_factory=lambda: {}) + pipeline: tuple = field(default_factory=lambda: ()) + + assert MyFactor().__class__.__name__ == "MyFactor" diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 4dc053b..b536ded 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -5,8 +5,13 @@ import pandas as pd from pandas.testing import assert_index_equal -from blocks.decorators import validate_select, register_feature_names, output_pandas_dataframe -from blocks.base import BaseTransformer +from sklearn.linear_model import LinearRegression + +from blocks.decorators import ( + validate_select, + register_feature_names, + output_pandas_dataframe +) length = 50 n_paths = 10 @@ -34,9 +39,13 @@ columns=assets, index=index, ) +df3 = pd.DataFrame( + np.random.normal(size=(length, n_paths)), + columns=assets, + index=index, +) - -class MyClass(BaseTransformer): +class MyClass: CHECK_SELECT = {'a': 'foo', 'b': 'bar'} @validate_select(CHECK_SELECT) @@ -44,25 +53,36 @@ def __init__(self, select: str): self.select = select @register_feature_names - def fit(self, X, y=None): + def fit(self, X, y): + self.model = LinearRegression().fit(X, y) return self @output_pandas_dataframe - def __call__(cls, X, y=None): - return X + def predict(self, X, y=None): + return self.model.predict(X) + + def test_validate_select(): # Test valid selections MyClass('a') MyClass('b') + with pytest.raises(TypeError): + MyClass() with pytest.raises(TypeError): MyClass('hello') + with pytest.raises(TypeError): MyClass(123) + with pytest.raises(TypeError): MyClass(3.14) + with pytest.raises(TypeError): MyClass(True) + with pytest.raises(TypeError): MyClass(['a']) + with pytest.raises(TypeError): MyClass({'a': 1}) + with pytest.raises(TypeError): MyClass(None) @@ -76,8 +96,23 @@ def test_additional_valid_options(): def test_register_feature_names(): - transformer = MyClass('a').fit(df1) + transformer = MyClass('a').fit(df1, df2) assert_index_equal(transformer.columns_, df1.columns) - transformer = MyClass('a').fit(df2) + transformer = MyClass('a').fit(df2, df1) assert_index_equal(transformer.columns_, df2.columns) + +def test_output_pandas_dataframe(): + arr = df3.to_numpy() + assert isinstance(arr, np.ndarray) + + output = LinearRegression().fit(df1, df2).predict(df3) + assert isinstance(output, np.ndarray) + + myclass = MyClass('a').fit(df1, df2) + pred = myclass.predict(df3) + assert isinstance(pred, pd.DataFrame) + assert_index_equal(myclass.columns_, df1.columns) + + + \ No newline at end of file diff --git a/tests/test_meta.py b/tests/test_meta.py index a33c07d..fe58147 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -1,3 +1,5 @@ +import pytest + import numpy as np import pandas as pd from datetime import datetime, timedelta @@ -44,7 +46,9 @@ def test_vector_regression(): # Model based - pred = bk.VectorRegressor(LinearRegression).fit(X_train, y_train_).transform(y_test) + model = bk.VectorRegressor(LinearRegression) + model.fit(X_train, y_train_) + pred = model.transform(y_test) # Iterating through assets (vector by vector) predictions = [] @@ -58,6 +62,12 @@ def test_vector_regression(): # Assert assert_frame_equal(pred, output) + + # Test All NaNs + X_train_nans = pd.DataFrame(index=X_train.index, columns=X_train.columns) + with pytest.raises(ValueError): + model.fit(X_train_nans, y_train_) + def test_estimator_transformer(): model = LinearRegression() @@ -71,4 +81,20 @@ def test_estimator_transformer(): # Assert assert_frame_equal(pred, output) + + # Test check_X_y + model = bk.EstimatorTransformer(LinearRegression(), check_input=True) + new_y_train = np.select([y_train > 0, y_train <= 0], [True, False], default=True) + model.fit(X_train, new_y_train) + output = model.transform(y_test) + assert isinstance(output, pd.DataFrame) + + new_y_train = np.select([y_train > 0, y_train <= 0], ['foo', 'bar'], default='foo') + with pytest.raises(ValueError): + model.fit(X_train, new_y_train) + + +def test_factor(): + pass + \ No newline at end of file diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 3374bc4..7a7b03b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,6 +1,7 @@ import logging import pytest +import numpy as np from sklearn import datasets from sklearn.base import BaseEstimator, TransformerMixin from sklearn.model_selection import GridSearchCV @@ -9,30 +10,31 @@ OneVsRestClassifier, OutputCodeClassifier, ) +from sklearn.pipeline import FeatureUnion from sklearn.preprocessing import StandardScaler from sklearn.svm import LinearSVC -from blocks.pipeline import BlockPipeline, make_block_pipeline +from blocks import BlockPipeline, make_block_pipeline IRIS = datasets.load_iris() class Adder(TransformerMixin, BaseEstimator): def __init__(self, value): - self._value = value + self.value = value def fit(self, X, y=None): return self def transform(self, X): - return X + self._value + return X + self.value def fit_transform(self, X, y=None): self.fit(X, y) return self.transform(X) def __repr__(self): - return f"Adder(value={self._value})" + return f"Adder(value={self.value})" def custom_log_callback(output, execution_time, **kwargs): @@ -50,7 +52,8 @@ def custom_log_callback(output, execution_time, **kwargs): """ logger = logging.getLogger(__name__) step_result, step = output - logger.info(f"[{step}] shape={step_result.shape} " f"nbytes={step_result.nbytes} time={execution_time}") + logger.info( + f"[{step}] shape={step_result.shape} " f"nbytes={step_result.nbytes} time={execution_time}") @pytest.fixture @@ -62,6 +65,12 @@ def named_steps(): ("add_1000", Adder(value=1000)), ] +@pytest.fixture +def last_named_steps_to_record(): + return [ + ("add_1", Adder(value=1)), + ("add_10", Adder(value=10)), + ] @pytest.fixture def nameless_steps(): @@ -93,6 +102,41 @@ def test_no_logs_when_log_callback_is_None(caplog, named_steps): assert not caplog.text, f"Log should be empty: {caplog.text}" +def test_output_shape_in_logs_when_log_callback_is_custom(caplog, named_steps): + pipe = BlockPipeline(named_steps, log_callback="custom") + caplog.clear() + with caplog.at_level(logging.INFO): + pipe.fit(IRIS.data, IRIS.target) + assert caplog.text, f"Log should be none empty: {caplog.text}" + shape_str = f"shape={IRIS.data.shape}" + assert shape_str in caplog.text, f'"{shape_str}" should be in {caplog.text}' + assert caplog.text.count(shape_str) == ( + len(pipe.steps) - 1 + ), f'"{shape_str}" should be {len(pipe.steps) - 1} times in {caplog.text}' + + +def test_time_in_logs_when_log_callback_is_custom(caplog, named_steps): + pipe = BlockPipeline(named_steps, log_callback="custom") + caplog.clear() + with caplog.at_level(logging.INFO): + pipe.fit(IRIS.data, IRIS.target) + assert caplog.text, f"Log should be none empty: {caplog.text}" + assert "time=" in caplog.text, f'"time=" should be in: {caplog.text}' + assert caplog.text.count("time") == ( + len(pipe.steps) - 1 + ), f'"time" should be {len(pipe.steps) - 1} times in {caplog.text}' + + +def test_step_name_in_logs_when_log_callback_is_custom(caplog, named_steps): + pipe = BlockPipeline(named_steps, log_callback="custom") + caplog.clear() + with caplog.at_level(logging.INFO): + pipe.fit(IRIS.data, IRIS.target) + assert caplog.text, f"Log should be none empty: {caplog.text}" + for _, step in pipe.steps[:-1]: + assert str(step) in caplog.text, f"{step} should be in: {caplog.text}" + assert caplog.text.count( + str(step)) == 1, f"{step} should be once in {caplog.text}" def test_nbytes_in_logs_when_log_callback_is_custom(caplog, named_steps): @@ -107,7 +151,61 @@ def test_nbytes_in_logs_when_log_callback_is_custom(caplog, named_steps): ), f'"nbytes=" should be {len(pipe.steps) - 1} times in {caplog.text}' +def test_feature_union(caplog, named_steps): + pipe_w_default_log_callback = BlockPipeline( + named_steps, log_callback='custom') + pipe_w_custom_log_callback = BlockPipeline( + named_steps, log_callback=custom_log_callback) + + pipe_union = FeatureUnion( + [ + ("pipe_w_default_log_callback", pipe_w_default_log_callback), + ("pipe_w_custom_log_callback", pipe_w_custom_log_callback), + ] + ) + + caplog.clear() + with caplog.at_level(logging.INFO): + pipe_union.fit(IRIS.data, IRIS.target) + assert caplog.text, f"Log should be none empty: {caplog.text}" + for pipe in [pipe_w_default_log_callback, pipe_w_custom_log_callback]: + for _, step in pipe.steps[:-1]: + assert str( + step) in caplog.text, f"{step} should be in: {caplog.text}" + assert caplog.text.count( + str(step)) == 2, f"{step} should be once in {caplog.text}" + + def test_different_name_for_repeated_step(nameless_steps): ss_twice_pipeline = make_block_pipeline(*nameless_steps) assert ss_twice_pipeline.steps[0][0] != ss_twice_pipeline.steps[1][0] + +def test_nameless_step_name_in_logs_when_log_callback_is_custom(caplog, nameless_steps): + pipe = make_block_pipeline(*nameless_steps, log_callback="custom") + caplog.clear() + with caplog.at_level(logging.INFO): + pipe.fit(IRIS.data, IRIS.target) + assert caplog.text, f"Log should be none empty: {caplog.text}" + for _, step in pipe.steps[:-1]: + assert str(step) in caplog.text, f"{step} should be in: {caplog.text}" + assert caplog.text.count( + str(step)) == 1, f"{step} should be once in {caplog.text}" + +def test_record(named_steps, last_named_steps_to_record): + pipe = BlockPipeline(last_named_steps_to_record) + pipe.fit(IRIS.data, IRIS.target) + output = pipe.transform(IRIS.data) + + pipe = BlockPipeline(named_steps, record='add_10') + pipe.fit(IRIS.data, IRIS.target) + pipe.transform(IRIS.data) + recorded = pipe.record + + assert np.array_equal(recorded, output) + +def test_make_block_pipeline_kwargs(): + with pytest.raises(TypeError): + make_block_pipeline(*nameless_steps, hello="") + + \ No newline at end of file