From 1c7594e5e4235e35dc15d5b15d9ea0f40e4804b2 Mon Sep 17 00:00:00 2001
From: Koen Hufkens <koen.hufkens@gmail.com>
Date: Fri, 16 Feb 2024 18:28:37 +0100
Subject: [PATCH] LSO model

---
 analysis/04_regression_training_LSO.R |  6 ++++--
 data/README.md                        | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/analysis/04_regression_training_LSO.R b/analysis/04_regression_training_LSO.R
index ae67ba4..9059838 100644
--- a/analysis/04_regression_training_LSO.R
+++ b/analysis/04_regression_training_LSO.R
@@ -1,5 +1,6 @@
 # Basic xgboost model with limited
-# hyperparameter tuning
+# hyperparameter tuning with
+# leave-site-out cross validation
 
 # load the ecosystem
 library(tidymodels)
@@ -14,7 +15,8 @@ ml_df <- read_ml_data(
   spatial = TRUE
 )
 
-results <- lapply(unique(ml_df$site)[1:2], function(site){
+# Leave-Site-Out cross validation loop
+results <- lapply(unique(ml_df$site), function(site){
 
   #---- data partitioning ----
 
diff --git a/data/README.md b/data/README.md
index 0d41cc0..dccdc8e 100644
--- a/data/README.md
+++ b/data/README.md
@@ -7,6 +7,13 @@ Input data consist of the driver data file:
 
 Which contains all data required for the analysis.
 
+Spectral indices were downloaded from:
+https://awesome-ee-spectral-indices.readthedocs.io/
+on 15/02/2024.
+
+More information on the indices is available here:
+https://www.indexdatabase.de/
+
 ## Output
 
 Two models are created, a binary classification (drought day or not), and an
@@ -17,6 +24,16 @@ folder and output is called,
 classification models respectively. These are the best models selected after
 cross validation (see code in the `analysis` folder).
 
+### LSO
+
+The LSO directory contains all models of the leave-site-out model training
+runs. The model name reflects the run for which the site was omitted from
+training. For example,
+
+`test_*.rds`
+
+would therefore be trained on all data but that from the `test` site.
+
 ### Annotated manuscript
 
 An annotated manuscript of the model result is written up in the vignettes