From f3c2442f2ad331c6c90f2373040eacd617f2c184 Mon Sep 17 00:00:00 2001
From: nefrathenrici
Date: Fri, 15 Mar 2024 16:18:47 -0700
Subject: [PATCH 1/6] Add initial CLI w/ GPU support

---
 docs/src/experiment_setup_guide.md       |  2 +-
 docs/src/quickstart.md                   |  8 +--
 experiments/pipeline.sh                  | 56 ---------------
 experiments/pipeline.jl => pipeline.jl   |  0
 pipeline.sh                              | 49 +++++++++++++
 {experiments => slurm}/initialize.sbatch |  3 +-
 {experiments => slurm}/model_run.sbatch  |  3 -
 slurm/parse_commandline.sh               | 89 ++++++++++++++++++++++++
 {experiments => slurm}/update.sbatch     |  1 +
 9 files changed, 146 insertions(+), 65 deletions(-)
 delete mode 100755 experiments/pipeline.sh
 rename experiments/pipeline.jl => pipeline.jl (100%)
 create mode 100755 pipeline.sh
 rename {experiments => slurm}/initialize.sbatch (88%)
 rename {experiments => slurm}/model_run.sbatch (90%)
 create mode 100644 slurm/parse_commandline.sh
 rename {experiments => slurm}/update.sbatch (94%)

diff --git a/docs/src/experiment_setup_guide.md b/docs/src/experiment_setup_guide.md
index 378caf77..97c3ccad 100644
--- a/docs/src/experiment_setup_guide.md
+++ b/docs/src/experiment_setup_guide.md
@@ -13,7 +13,7 @@ For the example experiment, `sphere_held_suarez_rhoe_equilmoist`, this is done b
 `sbatch experiments/sphere_held_suarez_rhoe_equilmoist/generate_observations.sbatch`.
 This script runs the model, passes the output through the observation map, and saves the result.
 Once the observations have been processed and saved, the actual calibration pipeline can be run via
-`bash experiments/pipeline.sh sphere_held_suarez_rhoe_equilmoist 8`.
+`bash pipeline.sh sphere_held_suarez_rhoe_equilmoist -n 10 -c 8`.
 
 !!! note
     The command line interface for `pipeline.sh` will change. For now, the first entry is the experiment id and the second is the number of tasks to use per ensemble member.
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md
index de10741c..abdbc700 100644
--- a/docs/src/quickstart.md
+++ b/docs/src/quickstart.md
@@ -12,19 +12,19 @@ By default, it runs 10 ensemble members for 3 iterations.
 To run this experiment:
 1. Log onto the Caltech HPC
 2. Clone CalibrateAtmos.jl and `cd` into the repository.
-3. Run: `bash experiments/pipeline.sh sphere_held_suarez_rhoe_equilmoist 8`. This will run the `sphere_held_suarez_rhoe_equilmoist` experiment with 8 tasks per ensemble member.
+3. Run: `bash pipeline.sh -n 10 -c 8 sphere_held_suarez_rhoe_equilmoist`. This will run the `sphere_held_suarez_rhoe_equilmoist` experiment with 10 tasks per ensemble member.
 
 ## Local Machine
 
-To run an experiment on your local machine, you can use the `experiments/pipeline.jl` script. This is recommended for more lightweight experiments, such as the `surface_fluxes_perfect_model` experiment, which uses the [SurfaceFluxes.jl](https://github.com/CliMA/SurfaceFluxes.jl) package to generate a physical model that calculates the Monin Obukhov turbulent surface fluxes based on idealized atmospheric and surface conditions. Since this is a "perfect model" example, the same model is used to generate synthetic observations using its default parameters and a small amount of noise. These synthetic observations are considered to be the ground truth, which is used to assess the model ensembles' performance when parameters are drawn from the prior parameter distributions. To run this experiment, you can use the following command from terminal to run an interactive run:
+To run an experiment on your local machine, you can use the `pipeline.jl` script. This is recommended for more lightweight experiments, such as the `surface_fluxes_perfect_model` experiment, which uses the [SurfaceFluxes.jl](https://github.com/CliMA/SurfaceFluxes.jl) package to generate a physical model that calculates the Monin Obukhov turbulent surface fluxes based on idealized atmospheric and surface conditions. Since this is a "perfect model" example, the same model is used to generate synthetic observations using its default parameters and a small amount of noise. These synthetic observations are considered to be the ground truth, which is used to assess the model ensembles' performance when parameters are drawn from the prior parameter distributions. To run this experiment, you can use the following command from terminal to run an interactive run:
 ```bash
-julia -i experiments/pipeline.jl surface_fluxes_perfect_model
+julia -i pipeline.jl surface_fluxes_perfect_model
 ```
 
 This pipeline mirrors the pipeline of the bash scripts, and the same example can be run on the HPC cluster if needed:
 
 ```bash
-bash experiments/pipeline.sh surface_fluxes_perfect_model 8
+bash pipeline.sh -n 8 surface_fluxes_perfect_model
 ```
 
 The experiments (such as `surface_fluxes_perfect_model`) can be equally defined within the component model repos (in this case, `SurfaceFluxes.jl`), so that the internals of `CalibrateAtmos.jl` do not explicitly depend on component models.
diff --git a/experiments/pipeline.sh b/experiments/pipeline.sh
deleted file mode 100755
index 9f714f7b..00000000
--- a/experiments/pipeline.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-# Configure the environment
-export MODULEPATH=/groups/esm/modules:$MODULEPATH
-module load climacommon/2024_02_27
-
-# Parse command line
-experiment_id=${1?Error: no experiment ID given}
-tasks_per_model_run=${2?Error: no tasks per model run given}
-
-# Get ensemble size, number of iterations, and output dir from EKP config file
-ensemble_size=$(grep "ensemble_size:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-n_iterations=$(grep "n_iterations:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-output=$(grep "output_dir:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-
-mkdir $output
-
-echo "Running experiment $experiment_id with $tasks_per_model_run tasks per model run"
-init_id=$(sbatch --parsable \
-    --output=$output/log.out \
-    --open-mode=append \
-    --partition=expansion \
-    experiments/initialize.sbatch $experiment_id)
-echo "Initialization job_id: $init_id"
-echo ""
-
-# Loop over iterations
-dependency="afterok:$init_id"
-for i in $(seq 0 $((n_iterations - 1)))
-do
-    echo "Scheduling iteration $i"
-    format_i=$(printf "iteration_%03d" "$i")
-
-    ensemble_array_id=$(
-        sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
-        --job=model-$i \
-        --output=/dev/null \
-        --ntasks=$tasks_per_model_run \
-        --array=1-$ensemble_size \
-        --partition=expansion \
-        experiments/model_run.sbatch $experiment_id $i)
-
-    dependency=afterany:$ensemble_array_id
-    echo "Iteration $i job id: $ensemble_array_id"
-
-    update_id=$(
-        sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
-        --job=update-$i \
-        --output=$output/log.out \
-        --open-mode=append \
-        --partition=expansion \
-        experiments/update.sbatch $experiment_id $i)
-
-    dependency=afterany:$update_id
-    echo "Update $i job id: $update_id"
-    echo ""
-done
diff --git a/experiments/pipeline.jl b/pipeline.jl
similarity index 100%
rename from experiments/pipeline.jl
rename to pipeline.jl
diff --git a/pipeline.sh b/pipeline.sh
new file mode 100755
index 00000000..2464febe
--- /dev/null
+++ b/pipeline.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+source slurm/parse_commandline.sh
+if [ ! -d $output ] ; then
+    mkdir $output
+fi
+
+# Initialize the project and setup calibration
+init_id=$(sbatch --parsable \
+    --output=$logfile \
+    --partition=$partition \
+    slurm/initialize.sbatch $experiment_id)
+echo "Initialization job_id: $init_id"
+echo ""
+
+# Loop over iterations
+dependency="afterok:$init_id"
+for i in $(seq 0 $((n_iterations - 1)))
+do
+    echo "Scheduling iteration $i"
+    format_i=$(printf "iteration_%03d" "$i")
+
+    ensemble_array_id=$(
+        sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
+        --job=model-$i \
+        --output=/dev/null \
+        --array=1-$ensemble_size \
+        --time=$slurm_time \
+        --ntasks=$slurm_ntasks \
+        --partition=$partition \
+        --cpus-per-task=$slurm_cpus_per_task \
+        --gpus-per-task=$slurm_gpus_per_task \
+        slurm/model_run.sbatch $experiment_id $i
+    )
+
+    dependency=afterany:$ensemble_array_id
+    echo "Iteration $i job id: $ensemble_array_id"
+
+    update_id=$(
+        sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
+        --job=update-$i \
+        --output=$logfile \
+        --open-mode=append \
+        --partition=$partition \
+        slurm/update.sbatch $experiment_id $i)
+
+    dependency=afterany:$update_id
+    echo "Update $i job id: $update_id"
+    echo ""
+done
diff --git a/experiments/initialize.sbatch b/slurm/initialize.sbatch
similarity index 88%
rename from experiments/initialize.sbatch
rename to slurm/initialize.sbatch
index 1331c442..c30e612a 100644
--- a/experiments/initialize.sbatch
+++ b/slurm/initialize.sbatch
@@ -1,10 +1,11 @@
 #!/bin/sh
 #SBATCH --time=00:30:00
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=1
+#SBATCH --cpus-per-task=8
 #SBATCH --job init_calibration
 
 experiment_id=$1
+export JULIA_NUM_PRECOMPILE_TASKS=8
 
 echo "Initializing calibration for experiment: $experiment_id"
 julia --color=no --project=experiments/$experiment_id -e 'using Pkg; Pkg.instantiate(;verbose=true)'
diff --git a/experiments/model_run.sbatch b/slurm/model_run.sbatch
similarity index 90%
rename from experiments/model_run.sbatch
rename to slurm/model_run.sbatch
index 03481b64..a8b6b64a 100644
--- a/experiments/model_run.sbatch
+++ b/slurm/model_run.sbatch
@@ -1,7 +1,4 @@
 #!/bin/bash
-#SBATCH --time=2:00:00
-#SBATCH --cpus-per-task=8
-#SBATCH --mem-per-cpu=8G
 
 # Extract command-line arguments
 experiment_id=$1
diff --git a/slurm/parse_commandline.sh b/slurm/parse_commandline.sh
new file mode 100644
index 00000000..fc5a8d55
--- /dev/null
+++ b/slurm/parse_commandline.sh
@@ -0,0 +1,89 @@
+export MODULEPATH=/groups/esm/modules:$MODULEPATH
+module load climacommon/2024_03_18
+
+# Default arguments
+slurm_time="2:00:00"
+slurm_ntasks="1"
+slurm_cpus_per_task="1"
+slurm_gpus_per_task="0"
+
+help_message="Usage:
+    ./pipeline.sh [options] experiment_id
+
+Options:
+    -t, --time=HH:MM:SS: Set max wallclock time (default: 2:00:00).
+    -n, --ntasks: Set number of tasks to launch (default: 1).
+    -c, --cpus_per_task: Set CPU cores per task (mutually exclusive with -g, default: 1).
+    -g, --gpus_per_task: Set GPUs per task (mutually exclusive with -c, default: 0).
+    -h, --help: Display this help message.
+
+Arguments:
+    experiment_id: A unique identifier for your experiment (required).
+
+Notes:
+    Cannot specify both CPU and GPU resources.
+    Script exits with error on missing arguments or invalid options."
+
+# Parse arguments using getopt
+VALID_ARGS=$(getopt -o h,t:,n:,c:,g: --long help,time:,ntasks:,cpus_per_task:,gpus_per_task: -- "$@")
+if [[ $? -ne 0 ]]; then
+    exit 1;
+fi
+
+eval set -- "$VALID_ARGS"
+
+# Process arguments
+while [ : ]; do
+  case "$1" in
+    -t | --time)
+        slurm_time="$2"
+        shift 2
+        ;;
+    -n | --ntasks)
+        slurm_ntasks="$2"
+        shift 2
+        ;;
+    -c | --cpus_per_task)
+        slurm_cpus_per_task="$2"
+        shift 2
+        ;;
+    -g | --gpus_per_task)
+        slurm_gpus_per_task="$2"
+        shift 2
+        ;;
+    -h | --help)
+        printf "%s\n" "$help_message"
+        exit 0
+        ;;
+    --) shift; break ;;  # End of options
+  esac
+done
+
+experiment_id="$1"
+if [ -z $experiment_id ] ; then
+    echo "Error: No experiment ID provided."
+    exit 1
+fi
+
+# Get values from EKP config file
+ensemble_size=$(grep "ensemble_size:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
+n_iterations=$(grep "n_iterations:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
+output=$(grep "output_dir:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
+logfile=$output/experiment_log.out
+
+# Set partition
+if [[ $slurm_gpus_per_task -gt 0 ]]; then
+    partition=gpu
+else
+    partition=expansion
+fi
+
+# Output slurm configuration
+echo "Running experiment: $experiment_id"
+indent=" └ "
+printf "Slurm configuration (per ensemble member):\n"
+printf "%sTime limit: %s\n" "$indent" "$slurm_time"
+printf "%sTasks: %s\n" "$indent" "$slurm_ntasks"
+printf "%sCPUs per task: %s\n" "$indent" "$slurm_cpus_per_task"
+printf "%sGPUs per task: %s\n" "$indent" "$slurm_gpus_per_task"
+echo ""
diff --git a/experiments/update.sbatch b/slurm/update.sbatch
similarity index 94%
rename from experiments/update.sbatch
rename to slurm/update.sbatch
index 068428a4..f9fb5a2f 100644
--- a/experiments/update.sbatch
+++ b/slurm/update.sbatch
@@ -19,3 +19,4 @@ julia --color=no --project=experiments/$experiment_id -e '
     JLD2.save_object(joinpath(iter_path, "observation_map.jld2"), G_ensemble)
     CalibrateAtmos.update_ensemble(experiment_id, i)
 '
+echo "Update step for iteration $i complete"

From b358df5598896b23bb3b001d76061d1c34f4c387 Mon Sep 17 00:00:00 2001
From: nefrathenrici
Date: Mon, 18 Mar 2024 15:29:24 -0700
Subject: [PATCH 2/6] minor fixes

---
 pipeline.sh                | 3 +++
 slurm/parse_commandline.sh | 9 +--------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/pipeline.sh b/pipeline.sh
index 2464febe..e623a9c6 100755
--- a/pipeline.sh
+++ b/pipeline.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+export MODULEPATH=/groups/esm/modules:$MODULEPATH
+module load climacommon/2024_03_18
+
 source slurm/parse_commandline.sh
 if [ ! -d $output ] ; then
     mkdir $output
 fi
diff --git a/slurm/parse_commandline.sh b/slurm/parse_commandline.sh
index fc5a8d55..1fd7d40f 100644
--- a/slurm/parse_commandline.sh
+++ b/slurm/parse_commandline.sh
@@ -1,6 +1,3 @@
-export MODULEPATH=/groups/esm/modules:$MODULEPATH
-module load climacommon/2024_03_18
-
 # Default arguments
 slurm_time="2:00:00"
 slurm_ntasks="1"
@@ -18,11 +15,7 @@ Options:
     -h, --help: Display this help message.
 
 Arguments:
-    experiment_id: A unique identifier for your experiment (required).
-
-Notes:
-    Cannot specify both CPU and GPU resources.
-    Script exits with error on missing arguments or invalid options."
+    experiment_id: A unique identifier for your experiment (required)."
 
 # Parse arguments using getopt
 VALID_ARGS=$(getopt -o h,t:,n:,c:,g: --long help,time:,ntasks:,cpus_per_task:,gpus_per_task: -- "$@")

From bf538b9410f59a3b82c25c1bdfcc0f3b591adb45 Mon Sep 17 00:00:00 2001
From: nefrathenrici
Date: Mon, 18 Mar 2024 16:16:43 -0700
Subject: [PATCH 3/6] recursively create output dir

---
 pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline.sh b/pipeline.sh
index e623a9c6..34239640 100755
--- a/pipeline.sh
+++ b/pipeline.sh
@@ -4,7 +4,7 @@ module load climacommon/2024_03_18
 
 source slurm/parse_commandline.sh
 if [ ! -d $output ] ; then
-    mkdir $output
+    mkdir -p $output
 fi
 
 # Initialize the project and setup calibration

From 898b7af8d699570c9ebdcc70391bbcc55013ca4a Mon Sep 17 00:00:00 2001
From: nefrathenrici
Date: Mon, 18 Mar 2024 16:19:56 -0700
Subject: [PATCH 4/6] more tweaks

---
 pipeline.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipeline.sh b/pipeline.sh
index 34239640..d3c6fff9 100755
--- a/pipeline.sh
+++ b/pipeline.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+set -euo pipefail
 export MODULEPATH=/groups/esm/modules:$MODULEPATH
 module load climacommon/2024_03_18
 

From 7eba23fccd1aeb17a7d081c166d1ce0b98a29c85 Mon Sep 17 00:00:00 2001
From: nefrathenrici
Date: Tue, 19 Mar 2024 14:10:06 -0700
Subject: [PATCH 5/6] more more tweaks

---
 pipeline.sh                | 7 +++----
 slurm/parse_commandline.sh | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pipeline.sh b/pipeline.sh
index d3c6fff9..e0178198 100755
--- a/pipeline.sh
+++ b/pipeline.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 set -euo pipefail
 export MODULEPATH=/groups/esm/modules:$MODULEPATH
+module purge
 module load climacommon/2024_03_18
 
 source slurm/parse_commandline.sh
@@ -13,8 +14,7 @@ init_id=$(sbatch --parsable \
     --output=$logfile \
     --partition=$partition \
     slurm/initialize.sbatch $experiment_id)
-echo "Initialization job_id: $init_id"
-echo ""
+echo -e "Initialization job_id: $init_id\n"
 
 # Loop over iterations
 dependency="afterok:$init_id"
@@ -48,6 +48,5 @@ do
         slurm/update.sbatch $experiment_id $i)
 
     dependency=afterany:$update_id
-    echo "Update $i job id: $update_id"
-    echo ""
+    echo -e "Update $i job id: $update_id\n"
 done
diff --git a/slurm/parse_commandline.sh b/slurm/parse_commandline.sh
index 1fd7d40f..bddf0834 100644
--- a/slurm/parse_commandline.sh
+++ b/slurm/parse_commandline.sh
@@ -73,7 +73,7 @@ fi
 
 # Output slurm configuration
 echo "Running experiment: $experiment_id"
-indent=" └ "
+indent=" └ "
 printf "Slurm configuration (per ensemble member):\n"
 printf "%sTime limit: %s\n" "$indent" "$slurm_time"
 printf "%sTasks: %s\n" "$indent" "$slurm_ntasks"

From c941577a504aeb9cebe91068f50b333242d853c6 Mon Sep 17 00:00:00 2001
From: nefrathenrici
Date: Tue, 19 Mar 2024 15:58:54 -0700
Subject: [PATCH 6/6] final twea

---
 pipeline.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pipeline.sh b/pipeline.sh
index e0178198..a13e3a59 100755
--- a/pipeline.sh
+++ b/pipeline.sh
@@ -33,8 +33,7 @@ do
         --partition=$partition \
         --cpus-per-task=$slurm_cpus_per_task \
        --gpus-per-task=$slurm_gpus_per_task \
-        slurm/model_run.sbatch $experiment_id $i
-    )
+        slurm/model_run.sbatch $experiment_id $i)
 
     dependency=afterany:$ensemble_array_id
    echo "Iteration $i job id: $ensemble_array_id"
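
Illustrative usage of the CLI added in the patches above (a sketch based on the flags
defined in slurm/parse_commandline.sh; the experiment ids and resource values below are
examples, not prescribed defaults):

    # CPU run: 1 task with 8 cores per task, scheduled on the expansion partition
    bash pipeline.sh -n 1 -c 8 sphere_held_suarez_rhoe_equilmoist

    # GPU run: one GPU per task, which switches the partition to gpu
    bash pipeline.sh --gpus_per_task=1 --time=04:00:00 sphere_held_suarez_rhoe_equilmoist

    # Print the full option list
    bash pipeline.sh --help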