Commit
add tests
Galileo-Galilei committed Dec 1, 2024
1 parent 678ad94 commit a8694da
Showing 1 changed file with 103 additions and 17 deletions.
120 changes: 103 additions & 17 deletions tests/framework/hooks/test_hook_pipeline_ml.py
@@ -45,7 +45,7 @@ def preprocess_fun(data):
return data

def train_fun(data, param):
return 2
return 1

def predict_fun(model, data):
return data * model
@@ -105,7 +105,7 @@ def remove_stopwords(data, stopwords):
return data

def train_fun_hyperparam(data, hyperparam):
return 2
return 1

def predict_fun(model, data):
return data * model
@@ -156,6 +156,25 @@ def convert_probs_to_pred(data, threshold):
return pipeline_ml_with_parameters


@pytest.fixture
def catalog_with_parameters(kedro_project_with_mlflow_conf):
catalog_with_parameters = DataCatalog(
{
"data": MemoryDataset(pd.DataFrame(data=[0.5], columns=["a"])),
"cleaned_data": MemoryDataset(),
"params:stopwords": MemoryDataset(["Hello", "Hi"]),
"params:penalty": MemoryDataset(0.1),
"model": PickleDataset(
filepath=(
kedro_project_with_mlflow_conf / "data" / "model.csv"
).as_posix()
),
"params:threshold": MemoryDataset(0.5),
}
)
return catalog_with_parameters


@pytest.fixture
def dummy_signature(dummy_catalog, dummy_pipeline_ml):
input_data = dummy_catalog.load(dummy_pipeline_ml.input_name)
@@ -441,28 +460,14 @@ def test_mlflow_hook_save_pipeline_ml_with_default_copy_mode_assign(
def test_mlflow_hook_save_pipeline_ml_with_parameters(
kedro_project_with_mlflow_conf, # a fixture to be in a kedro project
pipeline_ml_with_parameters,
catalog_with_parameters,
dummy_run_params,
):
# config_with_base_mlflow_conf is a conftest fixture
bootstrap_project(kedro_project_with_mlflow_conf)
with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
context = session.load_context()

catalog_with_parameters = DataCatalog(
{
"data": MemoryDataset(pd.DataFrame(data=[1], columns=["a"])),
"cleaned_data": MemoryDataset(),
"params:stopwords": MemoryDataset(["Hello", "Hi"]),
"params:penalty": MemoryDataset(0.1),
"model": PickleDataset(
filepath=(
kedro_project_with_mlflow_conf / "data" / "model.csv"
).as_posix()
),
"params:threshold": MemoryDataset(0.5),
}
)

mlflow_hook = MlflowHook()
mlflow_hook.after_context_created(context)

@@ -694,3 +699,84 @@ def test_mlflow_hook_save_pipeline_ml_with_dataset_factory(
trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/artifacts")
# the real test is that the model is loaded without error
assert trained_model is not None


def test_mlflow_hook_save_and_load_pipeline_ml_with_inference_parameters(
kedro_project_with_mlflow_conf, # a fixture to be in a kedro project
pipeline_ml_with_parameters,
catalog_with_parameters,
dummy_run_params,
):
bootstrap_project(kedro_project_with_mlflow_conf)
with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
context = session.load_context()

mlflow_hook = MlflowHook()
mlflow_hook.after_context_created(context)

runner = SequentialRunner()
mlflow_hook.after_catalog_created(
catalog=catalog_with_parameters,
# `after_catalog_created` does not use any of the arguments below,
# so we pass empty values.
conf_catalog={},
conf_creds={},
feed_dict={},
save_version="",
load_versions="",
)
mlflow_hook.before_pipeline_run(
run_params=dummy_run_params,
pipeline=pipeline_ml_with_parameters,
catalog=catalog_with_parameters,
)
runner.run(
pipeline_ml_with_parameters, catalog_with_parameters, session._hook_manager
)

current_run_id = mlflow.active_run().info.run_id

# This is what we want to test: parameters should be passed by default to the signature
mlflow_hook.after_pipeline_run(
run_params=dummy_run_params,
pipeline=pipeline_ml_with_parameters,
catalog=catalog_with_parameters,
)

# test : parameters should have been logged
trained_model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")

# test 1 : the signature params should contain a "runner" parameter defaulting to "SequentialRunner"
assert (
'{"name": "runner", "type": "string", "default": "SequentialRunner", "shape": null}'
in trained_model.metadata.signature.to_dict()["params"]
)

# test 2 : the "threshold" parameter of the inference pipeline should be in the signature
# {
# key: dummy_catalog.load(key)
# for key in dummy_pipeline_ml.inference.inputs()
# if key.startswith("params:")
# }

assert (
'{"name": "threshold", "type": "double", "default": 0.5, "shape": null}'
in trained_model.metadata.signature.to_dict()["params"]
)

# test 3 : we get different results when passing parameters

inference_data = pd.DataFrame(data=[0.2, 0.6, 0.9], columns=["a"])

assert all(
trained_model.predict(inference_data)
== pd.DataFrame([0, 1, 1]).values  # no param passed -> default threshold of 0.5 is used
)

assert all(
trained_model.predict(
inference_data,
params={"threshold": 0.8},
)
== pd.DataFrame([0, 0, 1]).values # 0.6 is now below threshold
)
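
For context, here is a minimal standalone sketch of the MLflow behaviour the new test relies on: parameters declared in the model signature carry a default value and can be overridden at predict() time. This is illustrative only and not part of the commit; it assumes mlflow >= 2.6 (where signature params were introduced), and ThresholdModel is a hypothetical stand-in for the pipeline's inference model.

import mlflow
import pandas as pd
from mlflow.models import infer_signature


class ThresholdModel(mlflow.pyfunc.PythonModel):
    # Hypothetical pyfunc model: binarize column "a" with a configurable threshold.
    def predict(self, context, model_input, params=None):
        threshold = (params or {}).get("threshold", 0.5)
        return (model_input["a"] > threshold).astype(int)


data = pd.DataFrame({"a": [0.2, 0.6, 0.9]})

# Declaring params with defaults here is what produces entries like
# {"name": "threshold", "type": "double", "default": 0.5, "shape": null}
# in signature.to_dict()["params"], as asserted in the test above.
signature = infer_signature(data, params={"threshold": 0.5})

with mlflow.start_run() as run:
    mlflow.pyfunc.log_model("model", python_model=ThresholdModel(), signature=signature)

loaded = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/model")

loaded.predict(data)                             # default threshold of 0.5 -> [0, 1, 1]
loaded.predict(data, params={"threshold": 0.8})  # override at inference time -> [0, 0, 1]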
