From a8694daf75c4a1c17ecf138f42d55addff6a756f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yolan=20Honor=C3=A9-Roug=C3=A9?=
Date: Sun, 1 Dec 2024 15:29:10 +0100
Subject: [PATCH] add tests

---
 .../framework/hooks/test_hook_pipeline_ml.py | 115 +++++++++++++++---
 1 file changed, 98 insertions(+), 17 deletions(-)

diff --git a/tests/framework/hooks/test_hook_pipeline_ml.py b/tests/framework/hooks/test_hook_pipeline_ml.py
index 7731a20a..3197a51f 100644
--- a/tests/framework/hooks/test_hook_pipeline_ml.py
+++ b/tests/framework/hooks/test_hook_pipeline_ml.py
@@ -45,7 +45,7 @@ def preprocess_fun(data):
         return data
 
     def train_fun(data, param):
-        return 2
+        return 1
 
     def predict_fun(model, data):
         return data * model
@@ -105,7 +105,7 @@ def remove_stopwords(data, stopwords):
         return data
 
     def train_fun_hyperparam(data, hyperparam):
-        return 2
+        return 1
 
     def predict_fun(model, data):
         return data * model
@@ -156,6 +156,25 @@ def convert_probs_to_pred(data, threshold):
     return pipeline_ml_with_parameters
 
 
+@pytest.fixture
+def catalog_with_parameters(kedro_project_with_mlflow_conf):
+    catalog_with_parameters = DataCatalog(
+        {
+            "data": MemoryDataset(pd.DataFrame(data=[0.5], columns=["a"])),
+            "cleaned_data": MemoryDataset(),
+            "params:stopwords": MemoryDataset(["Hello", "Hi"]),
+            "params:penalty": MemoryDataset(0.1),
+            "model": PickleDataset(
+                filepath=(
+                    kedro_project_with_mlflow_conf / "data" / "model.csv"
+                ).as_posix()
+            ),
+            "params:threshold": MemoryDataset(0.5),
+        }
+    )
+    return catalog_with_parameters
+
+
 @pytest.fixture
 def dummy_signature(dummy_catalog, dummy_pipeline_ml):
     input_data = dummy_catalog.load(dummy_pipeline_ml.input_name)
@@ -441,6 +460,7 @@ def test_mlflow_hook_save_pipeline_ml_with_default_copy_mode_assign(
 def test_mlflow_hook_save_pipeline_ml_with_parameters(
     kedro_project_with_mlflow_conf,  # a fixture to be in a kedro project
     pipeline_ml_with_parameters,
+    catalog_with_parameters,
     dummy_run_params,
 ):
     # config_with_base_mlflow_conf is a conftest fixture
@@ -448,21 +468,6 @@
     bootstrap_project(kedro_project_with_mlflow_conf)
     with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
         context = session.load_context()
 
-        catalog_with_parameters = DataCatalog(
-            {
-                "data": MemoryDataset(pd.DataFrame(data=[1], columns=["a"])),
-                "cleaned_data": MemoryDataset(),
-                "params:stopwords": MemoryDataset(["Hello", "Hi"]),
-                "params:penalty": MemoryDataset(0.1),
-                "model": PickleDataset(
-                    filepath=(
-                        kedro_project_with_mlflow_conf / "data" / "model.csv"
-                    ).as_posix()
-                ),
-                "params:threshold": MemoryDataset(0.5),
-            }
-        )
-
         mlflow_hook = MlflowHook()
         mlflow_hook.after_context_created(context)
@@ -694,3 +699,79 @@ def test_mlflow_hook_save_pipeline_ml_with_dataset_factory(
     trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/artifacts")
     # the real test is that the model is loaded without error
     assert trained_model is not None
+
+
+def test_mlflow_hook_save_and_load_pipeline_ml_with_inference_parameters(
+    kedro_project_with_mlflow_conf,  # a fixture to be in a kedro project
+    pipeline_ml_with_parameters,
+    catalog_with_parameters,
+    dummy_run_params,
+):
+    bootstrap_project(kedro_project_with_mlflow_conf)
+    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
+        context = session.load_context()
+
+        mlflow_hook = MlflowHook()
+        mlflow_hook.after_context_created(context)
+
+        runner = SequentialRunner()
+        mlflow_hook.after_catalog_created(
+            catalog=catalog_with_parameters,
+            # `after_catalog_created` does not use any of the arguments below,
+            # so we set them to empty values.
+            conf_catalog={},
+            conf_creds={},
+            feed_dict={},
+            save_version="",
+            load_versions="",
+        )
+        mlflow_hook.before_pipeline_run(
+            run_params=dummy_run_params,
+            pipeline=pipeline_ml_with_parameters,
+            catalog=catalog_with_parameters,
+        )
+        runner.run(
+            pipeline_ml_with_parameters, catalog_with_parameters, session._hook_manager
+        )
+
+        current_run_id = mlflow.active_run().info.run_id
+
+        # This is what we want to test: inference parameters should be added to the signature by default
+        mlflow_hook.after_pipeline_run(
+            run_params=dummy_run_params,
+            pipeline=pipeline_ml_with_parameters,
+            catalog=catalog_with_parameters,
+        )
+
+        # test : the inference parameters should have been logged with the model signature
+        trained_model = mlflow.pyfunc.load_model(f"runs:/{current_run_id}/model")
+
+        # test 1 : the signature should contain a "runner" parameter with "SequentialRunner" as default
+        assert (
+            '{"name": "runner", "type": "string", "default": "SequentialRunner", "shape": null}'
+            in trained_model.metadata.signature.to_dict()["params"]
+        )
+
+        # test 2 : the "threshold" parameter of the inference pipeline should be in the signature
+
+        assert (
+            '{"name": "threshold", "type": "double", "default": 0.5, "shape": null}'
+            in trained_model.metadata.signature.to_dict()["params"]
+        )
+
+        # test 3 : we get different predictions when passing parameters
+
+        inference_data = pd.DataFrame(data=[0.2, 0.6, 0.9], columns=["a"])
+
+        assert all(
+            trained_model.predict(inference_data)
+            == pd.DataFrame([0, 1, 1]).values  # no param: the default threshold 0.5 applies
+        )
+
+        assert all(
+            trained_model.predict(
+                inference_data,
+                params={"threshold": 0.8},
+            )
+            == pd.DataFrame([0, 0, 1]).values  # 0.6 is now below the 0.8 threshold
+        )
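
Note for reviewers: the assertions above rely on MLflow's inference-parameter
support (MLflow >= 2.6), where `params` passed to `mlflow.models.infer_signature`
are recorded in the model signature with their default values and can be
overridden at `predict` time. Below is a minimal, self-contained sketch of that
MLflow mechanism alone; the model class and variable names are illustrative and
not part of kedro-mlflow or of this patch.

import mlflow
import pandas as pd
from mlflow.models import infer_signature


class ThresholdModel(mlflow.pyfunc.PythonModel):
    """Illustrative model: binarize a column against a threshold."""

    def predict(self, context, model_input, params=None):
        # `params` is filled from the signature defaults unless overridden at predict time
        threshold = (params or {}).get("threshold", 0.5)
        return (model_input["a"] > threshold).astype(int)


data = pd.DataFrame({"a": [0.2, 0.6, 0.9]})

# declaring params here is what puts `threshold` (with its default) in the signature
signature = infer_signature(model_input=data, params={"threshold": 0.5})

with mlflow.start_run() as run:
    mlflow.pyfunc.log_model(
        artifact_path="model", python_model=ThresholdModel(), signature=signature
    )

model = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/model")
model.predict(data)  # -> [0, 1, 1], uses the default threshold of 0.5
model.predict(data, params={"threshold": 0.8})  # -> [0, 0, 1]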