full working pipeline

Mohamed-Mejri · Jan 23, 2023 · fc9e368
1 parent bc773bb
commit fc9e368
Show file tree

Hide file tree

Showing 8 changed files with 82 additions and 24 deletions.
diff --git a/check_data/conda.yml b/check_data/conda.yml
@@ -3,9 +3,11 @@ channels:
   - conda-forge
   - defaults
 dependencies:
+  - python=3.8
+  - numpy=1.20
   - pandas=1.2.3
   - pip=20.3.3
   - pytest=6.2.2
   - scipy=1.6.1
   - pip:
-      - wandb==0.10.21
+      - wandb==0.13.4
diff --git a/conda.yml b/conda.yml
@@ -3,10 +3,12 @@ channels:
   - conda-forge
   - defaults
 dependencies:
+  - python=3.8
+  - numpy=1.20
   - requests=2.24.0
   - pip=20.3.3
-  - mlflow=1.14.1
+  - mlflow=2.0.1
   - hydra-core=1.0.6
   - pip:
-      - wandb==0.10.21
+      - wandb==0.13.4
       - hydra-joblib-launcher==1.1.2
diff --git a/download/conda.yml b/download/conda.yml
@@ -3,7 +3,9 @@ channels:
   - conda-forge
   - defaults
 dependencies:
+  - python=3.8
+  - numpy=1.20
   - requests=2.24.0
   - pip=20.3.3
   - pip:
-      - wandb==0.10.21
+      - wandb==0.13.4
diff --git a/evaluate/conda.yml b/evaluate/conda.yml
@@ -3,12 +3,14 @@ channels:
   - conda-forge
   - defaults
 dependencies:
+  - python=3.8
+  - numpy=1.20
   - pandas=1.2.3
   - pip=20.3.3
   - scikit-learn=0.24.1
   - matplotlib==3.2.2
   - plotly==4.4.1
   - pillow=8.1.2
-  - mlflow=1.14.1
+  - mlflow=2.0.1
   - pip:
-      - wandb==0.10.21
+      - wandb==0.13.4
diff --git a/main.py b/main.py
@@ -20,8 +20,12 @@ def go(config: DictConfig):
         # This was passed on the command line as a comma-separated list of steps
         steps_to_execute = config["main"]["execute_steps"].split(",")
     else:
-        assert isinstance(config["main"]["execute_steps"], list)
-        steps_to_execute = config["main"]["execute_steps"]
+        print("###########")
+        # print(type(config["main"]["execute_steps"]))
+        # assert isinstance(config["main"]["execute_steps"], list)
+        # steps_to_execute = config["main"]["execute_steps"]
+        steps_to_execute = list(config["main"]["execute_steps"])
+        print(type(steps_to_execute))
 
     # Download step
     if "download" in steps_to_execute:
@@ -39,18 +43,43 @@ def go(config: DictConfig):
 
     if "preprocess" in steps_to_execute:
 
-        ## YOUR CODE HERE: call the preprocess step
-        pass
+        _ = mlflow.run(
+            os.path.join(root_path, "preprocess"),
+            "main",
+            parameters={
+                "input_artifact": "raw_data.parquet:latest",
+                "artifact_name": "preprocessed_data.csv",
+                "artifact_type": "preprocessed_data",
+                "artifact_description": "Data preprocessed"
+            },
+        )
 
     if "check_data" in steps_to_execute:
 
-        ## YOUR CODE HERE: call the check_data step
-        pass
+        _ = mlflow.run(
+            os.path.join(root_path, "check_data"),
+            "main",
+            parameters={
+                "reference_artifact": config["data"]["reference_dataset"],
+                "sample_artifact": "preprocessed_data.csv:latest",
+                "ks_alpha": config["data"]["ks_alpha"]
+            },
+        )
 
     if "segregate" in steps_to_execute:
 
-        ## YOUR CODE HERE: call the segregate step
-        pass
+        _ = mlflow.run(
+            os.path.join(root_path, "segregate"),
+            "main",
+            parameters={
+                "input_artifact": "preprocessed_data.csv:latest",
+                "artifact_root": "data",
+                "artifact_type": "segregated_data",
+                "test_size": config["data"]["test_size"],
+                "random_state": config["main"]["random_seed"],
+                "stratify": config["data"]["stratify"]
+            },
+        )
 
     if "random_forest" in steps_to_execute:
 
@@ -60,14 +89,29 @@ def go(config: DictConfig):
         with open(model_config, "w+") as fp:
             fp.write(OmegaConf.to_yaml(config["random_forest_pipeline"]))
 
-        ## YOUR CODE HERE: call the random_forest step
-        pass
+        _ = mlflow.run(
+            os.path.join(root_path, "random_forest"),
+            "main",
+            parameters={
+                "train_data": "data_train.csv:latest",
+                "model_config": model_config,
+                "export_artifact": config["random_forest_pipeline"]["export_artifact"],
+                "random_seed": config['random_forest_pipeline']['random_forest']['random_state'],
+                "val_size": config["data"]["test_size"],
+                "stratify": config["data"]["stratify"]
+            }
+        )
 
     if "evaluate" in steps_to_execute:
-
-        ## YOUR CODE HERE: call the evaluate step
-        pass
-
+        _ = mlflow.run(
+            os.path.join(root_path, "evaluate"),
+            "main",
+            parameters={
+                "model_export": f"{config['random_forest_pipeline']['export_artifact']}:latest",
+                "test_data": "data_test.csv:latest"
+            }
+        )
+
 
 if __name__ == "__main__":
     go()
diff --git a/preprocess/conda.yml b/preprocess/conda.yml
@@ -3,8 +3,10 @@ channels:
   - conda-forge
   - defaults
 dependencies:
+  - python=3.8
+  - numpy=1.20
   - pandas=1.2.3
   - pip=20.3.3
   - pyarrow=2.0
   - pip:
-      - wandb==0.10.21
+      - wandb==0.13.4
diff --git a/random_forest/conda.yml b/random_forest/conda.yml
@@ -3,13 +3,15 @@ channels:
   - conda-forge
   - defaults
 dependencies:
+  - python=3.8
+  - numpy=1.20
   - pandas=1.2.3
   - pip=20.3.3
   - scikit-learn=0.24.1
   - matplotlib==3.2.2
   - plotly==4.4.1
   - pillow=8.1.2
-  - mlflow=1.14.1
+  - mlflow=2.0.1
   - pip:
-      - wandb==0.10.21
+      - wandb==0.13.4
       - omegaconf==2.0.6
diff --git a/segregate/conda.yml b/segregate/conda.yml
@@ -3,8 +3,10 @@ channels:
   - conda-forge
   - defaults
 dependencies:
+  - python=3.8
+  - numpy=1.20
   - pandas=1.2.3
   - pip=20.3.3
   - scikit-learn=0.24.1
   - pip:
-      - wandb==0.10.21
+      - wandb==0.13.4