From d855db2aa810afa47dd285c0c22105a1b10b1725 Mon Sep 17 00:00:00 2001 From: Nat Efrat-Henrici <60049837+nefrathenrici@users.noreply.github.com> Date: Tue, 19 Mar 2024 16:20:47 -0700 Subject: [PATCH] Add CLI for slurm configuration (#70) --- docs/src/experiment_setup_guide.md | 2 +- docs/src/quickstart.md | 8 +-- experiments/pipeline.sh | 56 ---------------- experiments/pipeline.jl => pipeline.jl | 0 pipeline.sh | 51 +++++++++++++++ {experiments => slurm}/initialize.sbatch | 3 +- {experiments => slurm}/model_run.sbatch | 3 - slurm/parse_commandline.sh | 82 ++++++++++++++++++++++++ {experiments => slurm}/update.sbatch | 1 + 9 files changed, 141 insertions(+), 65 deletions(-) delete mode 100755 experiments/pipeline.sh rename experiments/pipeline.jl => pipeline.jl (100%) create mode 100755 pipeline.sh rename {experiments => slurm}/initialize.sbatch (88%) rename {experiments => slurm}/model_run.sbatch (90%) create mode 100644 slurm/parse_commandline.sh rename {experiments => slurm}/update.sbatch (94%) diff --git a/docs/src/experiment_setup_guide.md b/docs/src/experiment_setup_guide.md index 378caf77..97c3ccad 100644 --- a/docs/src/experiment_setup_guide.md +++ b/docs/src/experiment_setup_guide.md @@ -13,7 +13,7 @@ For the example experiment, `sphere_held_suarez_rhoe_equilmoist`, this is done b `sbatch experiments/sphere_held_suarez_rhoe_equilmoist/generate_observations.sbatch`. This script runs the model, passes the output through the observation map, and saves the result. Once the observations have been processed and saved, the actual calibration pipeline can be run via -`bash experiments/pipeline.sh sphere_held_suarez_rhoe_equilmoist 8`. +`bash pipeline.sh sphere_held_suarez_rhoe_equilmoist -n 10 -c 8`. !!! note The command line interface for `pipeline.sh` will change. For now, the first entry is the experiment id and the second is the number of tasks to use per ensemble member. 
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md index de10741c..abdbc700 100644 --- a/docs/src/quickstart.md +++ b/docs/src/quickstart.md @@ -12,19 +12,19 @@ By default, it runs 10 ensemble members for 3 iterations. To run this experiment: 1. Log onto the Caltech HPC 2. Clone CalibrateAtmos.jl and `cd` into the repository. -3. Run: `bash experiments/pipeline.sh sphere_held_suarez_rhoe_equilmoist 8`. This will run the `sphere_held_suarez_rhoe_equilmoist` experiment with 8 tasks per ensemble member. +3. Run: `bash pipeline.sh -n 10 -c 8 sphere_held_suarez_rhoe_equilmoist`. This will run the `sphere_held_suarez_rhoe_equilmoist` experiment with 10 tasks per ensemble member. ## Local Machine -To run an experiment on your local machine, you can use the `experiments/pipeline.jl` script. This is recommended for more lightweight experiments, such as the `surface_fluxes_perfect_model` experiment, which uses the [SurfaceFluxes.jl](https://github.com/CliMA/SurfaceFluxes.jl) package to generate a physical model that calculates the Monin Obukhov turbulent surface fluxes based on idealized atmospheric and surface conditions. Since this is a "perfect model" example, the same model is used to generate synthetic observations using its default parameters and a small amount of noise. These synthetic observations are considered to be the ground truth, which is used to assess the model ensembles' performance when parameters are drawn from the prior parameter distributions. To run this experiment, you can use the following command from terminal to run an interactive run: +To run an experiment on your local machine, you can use the `pipeline.jl` script. 
This is recommended for more lightweight experiments, such as the `surface_fluxes_perfect_model` experiment, which uses the [SurfaceFluxes.jl](https://github.com/CliMA/SurfaceFluxes.jl) package to generate a physical model that calculates the Monin Obukhov turbulent surface fluxes based on idealized atmospheric and surface conditions. Since this is a "perfect model" example, the same model is used to generate synthetic observations using its default parameters and a small amount of noise. These synthetic observations are considered to be the ground truth, which is used to assess the model ensembles' performance when parameters are drawn from the prior parameter distributions. To run this experiment, you can use the following command from terminal to run an interactive run: ```bash -julia -i experiments/pipeline.jl surface_fluxes_perfect_model +julia -i pipeline.jl surface_fluxes_perfect_model ``` -This pipeline mirrors the pipeline of the bash srcipts, and the same example can be run on the HPC cluster if needed: +This pipeline mirrors the pipeline of the bash scripts, and the same example can be run on the HPC cluster if needed: ```bash -bash experiments/pipeline.sh surface_fluxes_perfect_model 8 +bash pipeline.sh -n 8 surface_fluxes_perfect_model ``` The experiments (such as `surface_fluxes_perfect_model`) can be equally defined within the component model repos (in this case, `SurfaceFluxes.jl`), so that the internals of `CalibrateAtmos.jl` do not explicitly depend on component models. 
diff --git a/experiments/pipeline.sh b/experiments/pipeline.sh deleted file mode 100755 index 9f714f7b..00000000 --- a/experiments/pipeline.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -# Configure the environment -export MODULEPATH=/groups/esm/modules:$MODULEPATH -module load climacommon/2024_02_27 - -# Parse command line -experiment_id=${1?Error: no experiment ID given} -tasks_per_model_run=${2?Error: no tasks per model run given} - -# Get ensemble size, number of iterations, and output dir from EKP config file -ensemble_size=$(grep "ensemble_size:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}') -n_iterations=$(grep "n_iterations:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}') -output=$(grep "output_dir:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}') - -mkdir $output - -echo "Running experiment $experiment_id with $tasks_per_model_run tasks per model run" -init_id=$(sbatch --parsable \ - --output=$output/log.out \ - --open-mode=append \ - --partition=expansion \ - experiments/initialize.sbatch $experiment_id) -echo "Initialization job_id: $init_id" -echo "" - -# Loop over iterations -dependency="afterok:$init_id" -for i in $(seq 0 $((n_iterations - 1))) -do - echo "Scheduling iteration $i" - format_i=$(printf "iteration_%03d" "$i") - - ensemble_array_id=$( - sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \ - --job=model-$i \ - --output=/dev/null \ - --ntasks=$tasks_per_model_run \ - --array=1-$ensemble_size \ - --partition=expansion \ - experiments/model_run.sbatch $experiment_id $i) - - dependency=afterany:$ensemble_array_id - echo "Iteration $i job id: $ensemble_array_id" - - update_id=$( - sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \ - --job=update-$i \ - --output=$output/log.out \ - --open-mode=append \ - --partition=expansion \ - experiments/update.sbatch $experiment_id $i) - - dependency=afterany:$update_id - echo "Update $i job id: $update_id" - echo "" -done 
diff --git a/experiments/pipeline.jl b/pipeline.jl similarity index 100% rename from experiments/pipeline.jl rename to pipeline.jl diff --git a/pipeline.sh b/pipeline.sh new file mode 100755 index 00000000..a13e3a59 --- /dev/null +++ b/pipeline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -euo pipefail +export MODULEPATH=/groups/esm/modules:$MODULEPATH +module purge +module load climacommon/2024_03_18 + +source slurm/parse_commandline.sh +if [ ! -d $output ] ; then + mkdir -p $output +fi + +# Initialize the project and setup calibration +init_id=$(sbatch --parsable \ + --output=$logfile \ + --partition=$partition \ + slurm/initialize.sbatch $experiment_id) +echo -e "Initialization job_id: $init_id\n" + +# Loop over iterations +dependency="afterok:$init_id" +for i in $(seq 0 $((n_iterations - 1))) +do + echo "Scheduling iteration $i" + format_i=$(printf "iteration_%03d" "$i") + + ensemble_array_id=$( + sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \ + --job=model-$i \ + --output=/dev/null \ + --array=1-$ensemble_size \ + --time=$slurm_time \ + --ntasks=$slurm_ntasks \ + --partition=$partition \ + --cpus-per-task=$slurm_cpus_per_task \ + --gpus-per-task=$slurm_gpus_per_task \ + slurm/model_run.sbatch $experiment_id $i) + + dependency=afterany:$ensemble_array_id + echo "Iteration $i job id: $ensemble_array_id" + + update_id=$( + sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \ + --job=update-$i \ + --output=$logfile \ + --open-mode=append \ + --partition=$partition \ + slurm/update.sbatch $experiment_id $i) + + dependency=afterany:$update_id + echo -e "Update $i job id: $update_id\n" +done diff --git a/experiments/initialize.sbatch b/slurm/initialize.sbatch similarity index 88% rename from experiments/initialize.sbatch rename to slurm/initialize.sbatch index 1331c442..c30e612a 100644 --- a/experiments/initialize.sbatch +++ b/slurm/initialize.sbatch @@ -1,10 +1,11 @@ #!/bin/sh #SBATCH --time=00:30:00 #SBATCH --ntasks=1 -#SBATCH 
--cpus-per-task=1 +#SBATCH --cpus-per-task=8 #SBATCH --job init_calibration experiment_id=$1 +export JULIA_NUM_PRECOMPILE_TASKS=8 echo "Initializing calibration for experiment: $experiment_id" julia --color=no --project=experiments/$experiment_id -e 'using Pkg; Pkg.instantiate(;verbose=true)' diff --git a/experiments/model_run.sbatch b/slurm/model_run.sbatch similarity index 90% rename from experiments/model_run.sbatch rename to slurm/model_run.sbatch index 03481b64..a8b6b64a 100644 --- a/experiments/model_run.sbatch +++ b/slurm/model_run.sbatch @@ -1,7 +1,4 @@ #!/bin/bash -#SBATCH --time=2:00:00 -#SBATCH --cpus-per-task=8 -#SBATCH --mem-per-cpu=8G # Extract command-line arguments experiment_id=$1 diff --git a/slurm/parse_commandline.sh b/slurm/parse_commandline.sh new file mode 100644 index 00000000..bddf0834 --- /dev/null +++ b/slurm/parse_commandline.sh @@ -0,0 +1,82 @@ +# Default arguments +slurm_time="2:00:00" +slurm_ntasks="1" +slurm_cpus_per_task="1" +slurm_gpus_per_task="0" + +help_message="Usage: + ./pipeline.sh [options] experiment_id + +Options: + -t, --time=HH:MM:SS: Set max wallclock time (default: 2:00:00). + -n, --ntasks: Set number of tasks to launch (default: 1). + -c, --cpus_per_task: Set CPU cores per task (mutually exclusive with -g, default: 1). + -g, --gpus_per_task: Set GPUs per task (mutually exclusive with -c, default: 0). + -h, --help: Display this help message. + +Arguments: + experiment_id: A unique identifier for your experiment (required)." + +# Parse arguments using getopt +VALID_ARGS=$(getopt -o h,t:,n:,c:,g: --long help,time:,ntasks:,cpus_per_task:,gpus_per_task: -- "$@") +if [[ $? 
-ne 0 ]]; then + exit 1; +fi + +eval set -- "$VALID_ARGS" + +# Process arguments +while [ : ]; do + case "$1" in + -t | --time) + slurm_time="$2" + shift 2 + ;; + -n | --ntasks) + slurm_ntasks="$2" + shift 2 + ;; + -c | --cpus_per_task) + slurm_cpus_per_task="$2" + shift 2 + ;; + -g | --gpus_per_task) + slurm_gpus_per_task="$2" + shift 2 + ;; + -h | --help) + printf "%s\n" "$help_message" + exit 0 + ;; + --) shift; break ;; # End of options + esac +done + +experiment_id="${1:-}" +if [ -z "$experiment_id" ] ; then + echo "Error: No experiment ID provided." + exit 1 +fi + +# Get values from EKP config file +ensemble_size=$(grep "ensemble_size:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}') +n_iterations=$(grep "n_iterations:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}') +output=$(grep "output_dir:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}') +logfile=$output/experiment_log.out + +# Set partition +if [[ $slurm_gpus_per_task -gt 0 ]]; then + partition=gpu +else + partition=expansion +fi + +# Output slurm configuration +echo "Running experiment: $experiment_id" +indent=" └ " +printf "Slurm configuration (per ensemble member):\n" +printf "%sTime limit: %s\n" "$indent" "$slurm_time" +printf "%sTasks: %s\n" "$indent" "$slurm_ntasks" +printf "%sCPUs per task: %s\n" "$indent" "$slurm_cpus_per_task" +printf "%sGPUs per task: %s\n" "$indent" "$slurm_gpus_per_task" +echo "" diff --git a/experiments/update.sbatch b/slurm/update.sbatch similarity index 94% rename from experiments/update.sbatch rename to slurm/update.sbatch index 068428a4..f9fb5a2f 100644 --- a/experiments/update.sbatch +++ b/slurm/update.sbatch @@ -19,3 +19,4 @@ julia --color=no --project=experiments/$experiment_id -e ' JLD2.save_object(joinpath(iter_path, "observation_map.jld2"), G_ensemble) CalibrateAtmos.update_ensemble(experiment_id, i) ' +echo "Update step for iteration $i complete"