
Commit dd11e38: up tuning
Marco Zanotti committed Jan 18, 2024
1 parent 9b60055
Showing 6 changed files with 397 additions and 237 deletions.
86 changes: 72 additions & 14 deletions dashboard/R/fit_model.R
@@ -42,7 +42,7 @@ generate_recipe_spec <- function(data, method) {

rcp_spec <- recipe(value ~ ., data = data)

- } else if (method_type == "ml" | method_type == "dl") {
+ } else if (any(method_type %in% c("ml", "dl"))) {

rcp_spec <- recipe(value ~ ., data = data) |>
step_timeseries_signature(date) |>
@@ -51,13 +51,13 @@ generate_recipe_spec <- function(data, method) {
step_rm(matches("(iso)|(xts)|(index.num)")) |>
step_dummy(all_nominal(), one_hot = TRUE)

- } else if (method_type == "mix") {
+ } else if (any(method_type %in% c("mix", "aml"))) {

rcp_spec <- recipe(value ~ ., data = data) |>
step_timeseries_signature(date) |>
step_normalize(date_index.num) |>
step_mutate(trend = as.numeric(date)) |>
step_zv(all_predictors()) |>
step_rm(matches("(iso)|(xts)")) |>
step_rm(matches("(iso)|(xts)|(index.num)")) |>
step_dummy(all_nominal(), one_hot = TRUE)

} else {
@@ -346,6 +346,23 @@ generate_model_spec <- function(method, params) {
) |>
set_engine("prophet_xgboost")

+ } else if (method == "H2O AutoML") {
+
+   model_spec <- automl_reg(mode = "regression") |>
+     set_engine(
+       engine = "h2o",
+       project_name = "h2o_tsf_dashboard",
+       max_models = 50,
+       max_runtime_secs = !!params$h2o_max_time,
+       max_runtime_secs_per_model = !!params$h2o_max_time_model,
+       nfolds = !!params$h2o_nfolds,
+       sort_metric = !!params$h2o_metric,
+       seed = 1992
+       # include_algos = c("DRF"),
+       # exclude_algos = c("DeepLearning"),
+       # verbosity = NULL
+     )
+
} else {
stop(paste("Unknown method", method))
}
@@ -354,6 +371,29 @@ generate_model_spec <- function(method, params) {

}

+ # function to set the metric set
+ set_metric_set <- function(metric) {
+
+   metric <- tolower(metric)
+   if (metric == "mae") {
+     mtr_set <- yardstick::metric_set(mae)
+   } else if (metric == "mape") {
+     mtr_set <- yardstick::metric_set(mape)
+   } else if (metric == "mase") {
+     mtr_set <- yardstick::metric_set(mase)
+   } else if (metric == "smape") {
+     mtr_set <- yardstick::metric_set(smape)
+   } else if (metric == "mse") {
+     mtr_set <- yardstick::metric_set(mse)
+   } else if (metric == "rmse") {
+     mtr_set <- yardstick::metric_set(rmse)
+   } else {
+     stop(paste("Unknown metric", metric))
+   }
+   return(mtr_set)
+
+ }
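For reference, a minimal usage sketch of this helper (assuming yardstick and tune are attached; the "RMSE" label is a hypothetical value coming from the dashboard's metric selector):

valid_metric_set <- set_metric_set("RMSE")  # case-insensitive thanks to tolower()
# the resulting metric set can be passed to tune::tune_grid(metrics = ...)
# or tune::tune_bayes(metrics = ...), as done in fit_model_tuning() below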

# function to generate the model specification for tuning
set_tune_parameters <- function(method, params) {

@@ -366,7 +406,6 @@ set_tune_parameters <- function(method, params) {
}
}

- mtd_params <- getOption("tsf.dashboard.methods_params")[[method]] # get the parameters for the method
if (method == "Elastic Net") {
prm_ui_name <- params$tune_elanet
} else if (method == "MARS") {
@@ -396,6 +435,8 @@ set_tune_parameters <- function(method, params) {
} else {
stop(paste("Unknown method", method))
}

+ mtd_params <- getOption("tsf.dashboard.methods_params")[[method]] # get the parameters for the method
tune_params <- mtd_params[names(mtd_params) %in% prm_ui_name] # get the parameters to tune
is_to_tune <- mtd_params %in% tune_params
new_params <- purrr::map2(mtd_params, is_to_tune, set_tune) |> purrr::set_names(mtd_params)
@@ -456,6 +497,7 @@ fit_model <- function(data, method, params, n_assess, assess_type, seed = 1992)
wkfl_spec <- workflow() |> add_recipe(rcp_spec) |> add_model(model_spec)

# fitting
+ if (method == "H2O AutoML") { h2o.init() }
wkfl_fit <- wkfl_spec |> fit(data = train_tbl)

return(wkfl_fit)
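Note that automl_reg() needs a live H2O cluster before fit() is called. A minimal sketch of the lifecycle this commit wires across fit_model() and generate_forecast() (assuming the h2o and modeltime.h2o packages are attached):

library(h2o)
h2o.init()                                       # start or connect to a local cluster
wkfl_fit <- wkfl_spec |> fit(data = train_tbl)   # AutoML training runs on the cluster
# ... generate the forecasts, then release the cluster:
h2o.shutdown(prompt = FALSE)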
@@ -526,11 +568,13 @@ fit_model_tuning <- function(
data, method, params, n_assess, assess_type,
validation_type = "Time Series CV",
n_folds = 5, validation_metric = "rmse", grid_size = 10,
-   seed = 1992
+   bayesian_optimization = TRUE, seed = 1992
) {

params_new <- set_tune_parameters(method, params)
check_parameters(method, params_new)
+ validation_metric <- tolower(validation_metric)
+ valid_metric_set <- set_metric_set(validation_metric)
set.seed(seed)

# initial split
@@ -557,17 +601,31 @@
# tuning
doFuture::registerDoFuture()
future::plan(strategy = "multisession", workers = parallelly::availableCores() - 1)
- tune_fit <- wkfl_spec |>
-   tune::tune_grid(
-     resamples = cv_splits,
-     grid = params$tune_grid_size, # grid_spec
-     metrics = modeltime::default_forecast_accuracy_metric_set(),
-     control = tune::control_grid(save_pred = FALSE, allow_par = TRUE)
-   )
+ if (bayesian_optimization) {
+   tune_fit <- wkfl_spec |>
+     tune::tune_bayes(
+       resamples = cv_splits,
+       metrics = valid_metric_set,
+       initial = as.integer(params$tune_grid_size),
+       objective = tune::conf_bound(kappa = 0.1),
+       iter = 20L, # as.integer(length(params_new) * 20) good practice
+       control = tune::control_bayes(
+         save_pred = FALSE, allow_par = TRUE, verbose = TRUE, no_improve = 5L
+       )
+     )
+ } else {
+   tune_fit <- wkfl_spec |>
+     tune::tune_grid(
+       resamples = cv_splits,
+       metrics = valid_metric_set,
+       grid = as.integer(params$tune_grid_size), # grid_spec
+       control = tune::control_grid(save_pred = FALSE, allow_par = TRUE, verbose = TRUE)
+     )
+ }
future::plan(strategy = "sequential")

# picking best model
- best_fit <- tune::show_best(tune_fit, metric = tolower(validation_metric), n = 1)
+ best_fit <- tune::show_best(tune_fit, metric = validation_metric, n = 1)

# fitting (fit to training with optimal values)
wkfl_fit <- wkfl_spec |> tune::finalize_workflow(best_fit) |> fit(train_tbl)
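An illustrative call with the new flag (all argument values are hypothetical, loosely mirroring the inputs used in dashboard/test.R; data_tbl is an assumed tibble with date and value columns):

wkfl_fit <- fit_model_tuning(
  data = data_tbl, method = "Elastic Net", params = input,
  n_assess = 12, assess_type = "Rolling",
  validation_type = "Time Series CV", n_folds = 5,
  validation_metric = "RMSE", grid_size = 10,
  bayesian_optimization = TRUE,  # FALSE falls back to plain grid search
  seed = 1992
)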
1 change: 1 addition & 0 deletions dashboard/R/generate_forecast.R
@@ -75,6 +75,7 @@ generate_forecast <- function(
conf_interval = 0.95, conf_method = "conformal_split"
)

+ if (method == "H2O AutoML") { h2o.shutdown(prompt = FALSE) }
res <- list(
"splits" = splits,
"fit" = fitted_model_list,
14 changes: 12 additions & 2 deletions dashboard/R/utils.R
@@ -8,6 +8,7 @@ set_options <- function() {
"ml" = c("Linear Regression", "Elastic Net", "MARS", "KNN", "SVM", "Random Forest", "Boosted Trees", "Cubist"),
"dl" = c("Feed-Forward", "COMING SOON!"),
"mix" = c("Feed-Forward AR", "ARIMA-Boost", "Prophet-Boost"),
"aml" = c("H2O AutoML", "COMING SOON!"),
"ens" = c("Average", "Weighted Average", "Median"),
"stk" = c("Linear Regression", "Elastic Net"),
"tune" = c(
@@ -85,7 +86,9 @@ set_options <- function() {
) |> purrr::set_names(c(
"Random Predictors", "Trees", "Min Node Size", "Tree Depth",
"Learning Rate", "Min Loss Reduction", "Sample"
- ))
+ )),
+ "H2O AutoML" = c("h2o_max_time", "h2o_max_time_model", "h2o_nfolds", "h2o_metric") |>
+   purrr::set_names(c("Max Time (secs)", "Max Time per Model (secs)", "Folds", "Metric"))
),
tsf.dashboard.transfs = c("log", "boxcox", "norm", "stand", "diff", "sdiff"),
tsf.dashboard.test_transfs = c("test_log", "test_diff", "test_sdiff"),
@@ -140,8 +143,14 @@ parse_method <- function(method) {
res <- "dl"
} else if (method %in% mtd$mix) {
res <- "mix"
+ } else if (method %in% mtd$aml) {
+   res <- "aml"
} else if (method %in% mtd$ens) {
res <- "ens"
+ } else if (method %in% mtd$stk) {
+   res <- "stk"
+ } else if (method %in% mtd$tune) {
+   res <- "tune"
} else {
stop(paste("Unknown method", method))
}
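A quick sketch of the new routing (hypothetical call):

parse_method("H2O AutoML")  # returns "aml", which generate_recipe_spec() handles with the "mix" recipe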
@@ -198,7 +207,8 @@ get_default <- function(parameter, return_value = TRUE) {
"arima_boost_mtry" = 5, "arima_boost_trees" = 100, "arima_boost_min_n" = 1, "arima_boost_tree_depth" = 6, # ARIMA-Boost
"arima_boost_learn_rate" = 0.3, "arima_boost_loss_reduction" = 0, "arima_boost_sample_size" = 1,
"prophet_boost_mtry" = 5, "prophet_boost_trees" = 100, "prophet_boost_min_n" = 1, "prophet_boost_tree_depth" = 6, # Prophet-Boost
"prophet_boost_learn_rate" = 0.3, "prophet_boost_loss_reduction" = 0, "prophet_boost_sample_size" = 1
"prophet_boost_learn_rate" = 0.3, "prophet_boost_loss_reduction" = 0, "prophet_boost_sample_size" = 1,
"h2o_max_time" = 30, "h2o_max_time_model" = 15, "h2o_nfolds" = 5, "h2o_metric" = "RMSE"
)

if (return_value) {
40 changes: 39 additions & 1 deletion dashboard/test.R
@@ -198,7 +198,7 @@ input <- list(
n_folds = 5,
metric = "RMSE",
grid_size = 10,
- tune_xx_elanet = c("Penalty", "Mixture")
+ tune_elanet = c("Penalty", "Mixture")
)
input <- list(
n_future = 12,
@@ -221,6 +221,7 @@ validation_type = input$valid_type
n_folds = input$n_folds
validation_metric = input$metric
grid_size = input$grid_size
+ seed = 1992

fitted_model_list <- map(
input$method,
@@ -262,3 +263,40 @@ res <- map(
assess_type = input$assess_type
)
res$accuracy |> format_accuracy(single_method = TRUE)


+ ### GRID
+
+ model_spec <- rand_forest(
+   mode = "regression",
+   mtry = tune(),
+   trees = tune(),
+   min_n = tune()
+ ) |>
+   set_engine("ranger")
+
+ model_spec <- boost_tree(
+   mode = "regression",
+   mtry = tune(),
+   trees = tune(),
+   min_n = tune(),
+   tree_depth = tune(),
+   learn_rate = tune(),
+   loss_reduction = tune(),
+   sample_size = tune()
+ ) |>
+   set_engine("xgboost")
+
+ model_spec <- prophet_boost(
+   mode = "regression",
+   mtry = tune(),
+   trees = tune(),
+   min_n = tune(),
+   tree_depth = tune(),
+   learn_rate = tune(),
+   loss_reduction = tune(),
+   sample_size = tune()
+ ) |>
+   set_engine("prophet_xgboost")


10 changes: 6 additions & 4 deletions dashboard/todo.txt
@@ -11,15 +11,17 @@ Next steps:
- deployment on GitHub Actions
- move to another github repo
- documentation at the top right
+ - add package::function for every function

To Do:
- - tune_bayes

- - add automl methods (h2o)
+ - still an issue with mtry in tune_bayes, solve it by building the grid
+ - check h2o in compare and combine
- think through and add the stacking section (LM + Elastic Net)
- - think through and add the scenario forecasting + uncertainty + judgmental section (gauges?)
+ - think through and add the scenario forecasting + uncertainty + judgmental section (gauges?) + rolling variances
- think through and add saving of the optimized model + use the optimized model in other sections
- add dl methods (NeuralProphet + NBEATS + DeepAR)
- think through and add the feature engineering section (with saving in mind)
- modify the model output with parsing
- change how parameter names are assigned in the UI
- XAI
+ - h2o cluster update