
Commit 582b081

up h2o
up grid generation
fix bayes tuning
up stacking ui
Marco Zanotti committed Jan 19, 2024
1 parent dd11e38 commit 582b081
Showing 5 changed files with 174 additions and 125 deletions.
46 changes: 28 additions & 18 deletions dashboard/R/fit_model.R
@@ -452,22 +452,24 @@ set_tune_parameters <- function(method, params) {

 }

+# function to generate the feature set
+generate_feature_set <- function(recipe_spec) {
+  feature_set <- recipe_spec |>
+    recipes::prep() |>
+    recipes::bake(new_data = NULL) |>
+    dplyr::select(-date, -value)
+  return(feature_set)
+}
+
 # function to perform grid specification
-generate_grid_spec <- function(method, model_spec, grid_size, seed = 1992) {
+generate_grid_spec <- function(method, model_spec, recipe_spec, grid_size, seed = 1992) {

   set.seed(seed)
-  if (method %in% c("Random Forest", "Boosted Trees")) {
-    grid_spec <- grid_latin_hypercube(
-      hardhat::extract_parameter_set_dials(model_spec) |>
-        recipes::update(mtry = mtry(range = c(1, 15))),
-      size = grid_size
-    )
-  } else {
-    grid_spec <- dials::grid_latin_hypercube(
-      hardhat::extract_parameter_set_dials(model_spec),
-      size = grid_size
-    )
-  }
+  feature_set <- generate_feature_set(recipe_spec)
+  updated_parameter_set <- model_spec |>
+    hardhat::extract_parameter_set_dials() |>
+    dials::finalize(x = feature_set)
+  grid_spec <- dials::grid_latin_hypercube(updated_parameter_set, size = grid_size)
   return(grid_spec)

 }
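
Note on this hunk: tuning parameters such as mtry() carry an unknown() upper bound, because the maximum number of sampled predictors depends on the data. The old branch pinned it by hand (range = c(1, 15)); the new version bakes the recipe and lets dials::finalize() resolve every data-dependent range. A self-contained sketch of that mechanism on toy data (variable names here are illustrative, not from the repo):

library(parsnip)
library(dials)

spec <- rand_forest(mode = "regression", mtry = tune::tune(), min_n = tune::tune()) |>
  set_engine("ranger")

param_set <- hardhat::extract_parameter_set_dials(spec)
param_set$object[[1]]  # mtry: range [1, ?], upper bound still unknown

toy_predictors <- data.frame(x1 = rnorm(20), x2 = rnorm(20), x3 = rnorm(20))
finalized <- dials::finalize(param_set, x = toy_predictors)
finalized$object[[1]]  # mtry: range [1, 3], bound set from ncol(toy_predictors)

dials::grid_latin_hypercube(finalized, size = 5)  # now samples a valid grid
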
@@ -575,7 +577,6 @@ fit_model_tuning <- function(
   check_parameters(method, params_new)
   validation_metric <- tolower(validation_metric)
   valid_metric_set <- set_metric_set(validation_metric)
-  set.seed(seed)

   # initial split
   splits <- generate_initial_split(data, n_assess, assess_type)
@@ -596,30 +597,39 @@
   wkfl_spec <- workflow() |> add_recipe(rcp_spec) |> add_model(model_spec)

   # grid specification
-  # grid_spec <- generate_grid_spec(method, model_spec, grid_size, seed)
+  # grid_spec <- generate_grid_spec(method, model_spec, rcp_spec, grid_size, seed)

   # tuning
   doFuture::registerDoFuture()
   future::plan(strategy = "multisession", workers = parallelly::availableCores() - 1)
   if (bayesian_optimization) {
+    feat_set <- generate_feature_set(rcp_spec)
+    updated_param_set <- hardhat::extract_parameter_set_dials(model_spec) |>
+      dials::finalize(x = feat_set)
+    set.seed(seed)
     tune_fit <- wkfl_spec |>
       tune::tune_bayes(
         resamples = cv_splits,
         metrics = valid_metric_set,
-        initial = as.integer(params$tune_grid_size),
+        initial = as.integer(params$tune_grid_size), # tune_fit (result from tune_grid)
         objective = tune::conf_bound(kappa = 0.1),
         iter = 20L, # as.integer(length(params_new) * 20) good practice
+        param_info = updated_param_set,
         control = tune::control_bayes(
           save_pred = FALSE, allow_par = TRUE, verbose = TRUE, no_improve = 5L
         )
       )
   } else {
+    set.seed(seed)
     tune_fit <- wkfl_spec |>
       tune::tune_grid(
         preprocessor = rcp_spec,
         resamples = cv_splits,
         metrics = valid_metric_set,
-        grid = as.integer(params$tune_grid_size), # grid_spec
-        control = tune::control_grid(save_pred = FALSE, allow_par = TRUE, verbose = TRUE)
+        grid = as.integer(params$tune_grid_size), # as.integer(params$tune_grid_size)
+        control = tune::control_grid(
+          save_pred = FALSE, allow_par = TRUE, verbose = TRUE
+        )
       )
   }
   future::plan(strategy = "sequential")
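
Two things change in this hunk: set.seed() moves from the top of fit_model_tuning() to sit directly before each tuning call, so both branches draw from the same seeded state, and the Bayesian branch now builds a finalized param_info. tune_bayes() proposes new candidates by sampling the declared parameter ranges, so any range left as unknown() (again, typically mtry) aborts the search. A condensed sketch of the guard pattern, reusing generate_feature_set() from this commit (model_spec and rcp_spec as in the surrounding function):

param_set <- hardhat::extract_parameter_set_dials(model_spec)
if (any(purrr::map_lgl(param_set$object, dials::has_unknowns))) {
  # finalize data-dependent ranges against the baked predictors
  param_set <- dials::finalize(param_set, x = generate_feature_set(rcp_spec))
}
# param_set can now be passed as tune::tune_bayes(..., param_info = param_set)
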
2 changes: 1 addition & 1 deletion dashboard/R/generate_forecast.R
@@ -75,7 +75,7 @@ generate_forecast <- function(
     conf_interval = 0.95, conf_method = "conformal_split"
   )

-  if (method == "H2O AutoML") { h2o.shutdown(prompt = FALSE) }
+  if (any(method %in% "H2O AutoML")) { h2o.shutdown(prompt = FALSE) }
   res <- list(
     "splits" = splits,
     "fit" = fitted_model_list,
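
The one-line fix above matters because `method` can now hold several selected methods at once: `==` compares elementwise and returns one logical per element, which `if` rejects (an error as of R 4.2), while `any(... %in% ...)` collapses to a single logical. A tiny illustration:

method <- c("ETS", "SARIMA", "H2O AutoML")
method == "H2O AutoML"         # FALSE FALSE TRUE: length 3, not a valid `if` condition
any(method %in% "H2O AutoML")  # TRUE: a single logical, safe in `if`
if (any(method %in% "H2O AutoML")) message("shutting down the H2O cluster")
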
147 changes: 62 additions & 85 deletions dashboard/test.R
@@ -68,22 +68,26 @@ input <- list(
   n_future = 12,
   n_assess = 24,
   assess_type = "Rolling",
-  method = c("ETS", "SARIMA"),
-  auto_ets = TRUE,
-  error = "auto",
-  trend = "auto",
-  season = "auto",
-  damping = "auto",
-  smooth_level = 0.1,
-  smooth_trend = 0.1,
-  smooth_season = 0.1,
-  auto_arima = TRUE,
-  non_seasonal_ar = 1,
-  non_seasonal_differences = 1,
-  non_seasonal_ma = 1,
-  seasonal_ar = 1,
-  seasonal_differences = 1,
-  seasonal_ma = 1
+  method = c("ETS", "SARIMA", "H2O AutoML"),
+  auto_ets = get_default("auto_ets"),
+  error = get_default("error"),
+  trend = get_default("trend"),
+  season = get_default("season"),
+  damping = get_default("damping"),
+  smooth_level = get_default("smooth_level"),
+  smooth_trend = get_default("smooth_trend"),
+  smooth_season = get_default("smooth_season"),
+  auto_arima = get_default("auto_arima"),
+  non_seasonal_ar = get_default("non_seasonal_ar"),
+  non_seasonal_differences = get_default("non_seasonal_differences"),
+  non_seasonal_ma = get_default("non_seasonal_ma"),
+  seasonal_ar = get_default("seasonal_ar"),
+  seasonal_differences = get_default("seasonal_differences"),
+  seasonal_ma = get_default("seasonal_ma"),
+  h2o_max_time = get_default("h2o_max_time"),
+  h2o_max_time_model = get_default("h2o_max_time_model"),
+  h2o_nfolds = get_default("h2o_nfolds"),
+  h2o_metric = get_default("h2o_metric")
 )
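
get_default() is the dashboard's own helper; swapping the hard-coded literals for get_default() keeps these test inputs aligned with the UI defaults in one place. A hypothetical sketch of such a helper, assuming defaults live in a named list (the shown values match the literals this commit replaces; the real implementation lives in the dashboard's R/ sources):

default_params <- list(auto_ets = TRUE, error = "auto", smooth_level = 0.1)  # illustrative subset
get_default <- function(name) {
  if (!name %in% names(default_params)) stop("no default for parameter: ", name)
  default_params[[name]]
}
get_default("smooth_level")  # 0.1
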

fitted_model_list <- map(
@@ -124,29 +128,34 @@ res$accuracy |> format_accuracy(single_method = FALSE) |> filter(Type == "Test")
 # COMBINE -----------------------------------------------------------------
 data_selected <- get_data(datasets[1])
 ts_freq <- data_selected$frequency |> unique() |> parse_frequency()
+ens_methods <- getOption("tsf.dashboard.methods")[["ens"]]
 input <- list(
   n_future = 12,
   n_assess = 24,
   assess_type = "Rolling",
-  method = c("ETS", "SARIMA", "Elastic Net"),
+  method = c("ETS", "SARIMA", "Elastic Net", "H2O AutoML"),
   ens_type = ens_methods,
-  auto_ets = TRUE,
-  error = "auto",
-  trend = "auto",
-  season = "auto",
-  damping = "auto",
-  smooth_level = 0.1,
-  smooth_trend = 0.1,
-  smooth_season = 0.1,
-  auto_arima = TRUE,
-  non_seasonal_ar = 1,
-  non_seasonal_differences = 1,
-  non_seasonal_ma = 1,
-  seasonal_ar = 1,
-  seasonal_differences = 1,
-  seasonal_ma = 1,
-  penalty = 1,
-  mixture = 0.5
+  auto_ets = get_default("auto_ets"),
+  error = get_default("error"),
+  trend = get_default("trend"),
+  season = get_default("season"),
+  damping = get_default("damping"),
+  smooth_level = get_default("smooth_level"),
+  smooth_trend = get_default("smooth_trend"),
+  smooth_season = get_default("smooth_season"),
+  auto_arima = get_default("auto_arima"),
+  non_seasonal_ar = get_default("non_seasonal_ar"),
+  non_seasonal_differences = get_default("non_seasonal_differences"),
+  non_seasonal_ma = get_default("non_seasonal_ma"),
+  seasonal_ar = get_default("seasonal_ar"),
+  seasonal_differences = get_default("seasonal_differences"),
+  seasonal_ma = get_default("seasonal_ma"),
+  penalty = get_default("penalty"),
+  mixture = get_default("mixture"),
+  h2o_max_time = get_default("h2o_max_time"),
+  h2o_max_time_model = get_default("h2o_max_time_model"),
+  h2o_nfolds = get_default("h2o_nfolds"),
+  h2o_metric = get_default("h2o_metric")
 )
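
The new ens_methods line reads the available ensemble types from a package-level option instead of hard-coding them at the call site. A sketch of the pattern with illustrative method names (the real registry is populated elsewhere in the dashboard, so these values are assumptions):

options(tsf.dashboard.methods = list(ens = c("Average", "Median")))  # hypothetical values
getOption("tsf.dashboard.methods")[["ens"]]
#> [1] "Average" "Median"
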

fitted_model_list <- map(
@@ -201,26 +210,29 @@ input <- list(
   tune_elanet = c("Penalty", "Mixture")
 )
 input <- list(
-  n_future = 12,
-  n_assess = 24,
-  assess_type = "Rolling",
-  method = "Random Forest",
-  valid_type = "K-Fold CV",
-  n_folds = 5,
-  valid_metric = "RMSE",
-  grid_size = 10,
-  tune_xx_rf = c()
+  tune_n_future = 12,
+  tune_n_assess = 24,
+  tune_assess_type = "Rolling",
+  tune_method = "Random Forest",
+  tune_valid_type = "K-Fold CV",
+  tune_n_folds = 5,
+  tune_valid_metric = "RMSE",
+  tune_bayes = TRUE,
+  tune_grid_size = 10,
+  tune_rf = c("Random Predictors", "Trees")
 )

 data = data_selected
 params = input
-n_assess = input$n_assess
-assess_type = input$assess_type
-method = input$method
-validation_type = input$valid_type
-n_folds = input$n_folds
-validation_metric = input$metric
-grid_size = input$grid_size
+n_assess = input$tune_n_assess
+assess_type = input$tune_assess_type
+method = input$tune_method
+validation_type = input$tune_valid_type
+n_folds = input$tune_n_folds
+validation_metric = input$tune_valid_metric
+bayesian_optimization = input$tune_bayes
+grid_size = input$tune_grid_size
+n_future = input$tune_n_future
 seed = 1992
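
These assignments unpack the Shiny inputs by hand, which is handy for stepping through the tuning routine line by line. A hedged sketch of the equivalent direct call (argument names inferred from this test script; the authoritative signature is in dashboard/R/fit_model.R):

fitted <- fit_model_tuning(
  data = data_selected, params = input, method = input$tune_method,
  n_future = input$tune_n_future, n_assess = input$tune_n_assess,
  assess_type = input$tune_assess_type, validation_type = input$tune_valid_type,
  n_folds = input$tune_n_folds, validation_metric = input$tune_valid_metric,
  bayesian_optimization = input$tune_bayes, grid_size = input$tune_grid_size,
  seed = 1992
)
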

fitted_model_list <- map(
@@ -265,38 +277,3 @@ res <- map(
res$accuracy |> format_accuracy(single_method = TRUE)


-### GRID
-
-model_spec <- rand_forest(
-  mode = "regression",
-  mtry = tune(),
-  trees = tune(),
-  min_n = tune()
-) |>
-  set_engine("ranger")
-
-model_spec <- boost_tree(
-  mode = "regression",
-  mtry = tune(),
-  trees = tune(),
-  min_n = tune(),
-  tree_depth = tune(),
-  learn_rate = tune(),
-  loss_reduction = tune(),
-  sample_size = tune()
-) |>
-  set_engine("xgboost")
-
-model_spec <- prophet_boost(
-  mode = "regression",
-  mtry = tune(),
-  trees = tune(),
-  min_n = tune(),
-  tree_depth = tune(),
-  learn_rate = tune(),
-  loss_reduction = tune(),
-  sample_size = tune()
-) |>
-  set_engine("prophet_xgboost")


2 changes: 0 additions & 2 deletions dashboard/todo.txt
@@ -14,8 +14,6 @@ Next steps:
 - add package::function for every function

 To Do:
-- still an issue with mtry in tune_bayes; fix it by building the grid
-- check h2o in compare and combine
 - think about and add the stacking section (LM + Elastic Net)
 - think about and add the scenario forecasting section + uncertainty + judgmental (gauges?) + rolling variances
 - think about and add saving the optimized model + use optimize in other sections
