Add initial CLI w/ GPU support

CliMA · Mar 18, 2024 · 128fa59 · 128fa59
1 parent e16d189
commit 128fa59
Show file tree

Hide file tree

Showing 5 changed files with 108 additions and 30 deletions.
diff --git a/experiments/initialize.sbatch b/experiments/initialize.sbatch
@@ -1,10 +1,11 @@
 #!/bin/sh
 #SBATCH --time=00:30:00
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=1
+#SBATCH --cpus-per-task=8
 #SBATCH --job init_calibration
 
 experiment_id=$1
+export JULIA_NUM_PRECOMPILE_TASKS=8  # must be exported to reach the julia child process; matches --cpus-per-task
 
 echo "Initializing calibration for experiment: $experiment_id"
 julia --color=no --project=experiments/$experiment_id -e 'using Pkg; Pkg.instantiate(;verbose=true)'

diff --git a/experiments/model_run.sbatch b/experiments/model_run.sbatch
@@ -1,7 +1,4 @@
 #!/bin/bash
-#SBATCH --time=2:00:00
-#SBATCH --cpus-per-task=8
-#SBATCH --mem-per-cpu=8G
 
 # Extract command-line arguments
 experiment_id=$1

diff --git a/experiments/pipeline.sh b/experiments/pipeline.sh
@@ -1,24 +1,13 @@
 #!/bin/bash
-# Configure the environment
-export MODULEPATH=/groups/esm/modules:$MODULEPATH
-module load climacommon/2024_02_27
+source experiments/utils/parse_commandline.sh
+if [ ! -d "$output" ] ; then
+    mkdir -p "$output"  # -p: output_dir read from the config may be a nested path
+fi
 
-# Parse command line
-experiment_id=${1?Error: no experiment ID given}
-tasks_per_model_run=${2?Error: no tasks per model run given}
-
-# Get ensemble size, number of iterations, and output dir from EKP config file
-ensemble_size=$(grep "ensemble_size:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-n_iterations=$(grep "n_iterations:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-output=$(grep "output_dir:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-
-mkdir $output
-
-echo "Running experiment $experiment_id with $tasks_per_model_run tasks per model run"
+# Initialize the project and set up the calibration (log file and partition come from parse_commandline.sh)
 init_id=$(sbatch --parsable \
-                 --output=$output/log.out \
-                 --open-mode=append \
-                 --partition=expansion \
+                 --output="$logfile" \
+                 --partition="$partition" \
                  experiments/initialize.sbatch $experiment_id)
 echo "Initialization job_id: $init_id"
 echo ""
@@ -32,22 +21,26 @@ do
 
     ensemble_array_id=$(
         sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
-               --job=model-$i \
-               --output=/dev/null \
-               --ntasks=$tasks_per_model_run \
-               --array=1-$ensemble_size \
-               --partition=expansion \
-               experiments/model_run.sbatch $experiment_id $i)
+               --job="model-$i" \
+               --output=/dev/null \
+               --array="1-$ensemble_size" \
+               --time="$slurm_time" \
+               --ntasks="$slurm_ntasks" \
+               --partition="$partition" \
+               --cpus-per-task="$slurm_cpus_per_task" \
+               --gpus-per-task="$slurm_gpus_per_task" \
+               experiments/model_run.sbatch "$experiment_id" "$i"
+    )
 
     dependency=afterany:$ensemble_array_id
     echo "Iteration $i job id: $ensemble_array_id"
 
     update_id=$(
         sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
                --job=update-$i \
-               --output=$output/log.out \
+               --output="$logfile" \
                --open-mode=append \
-               --partition=expansion \
+               --partition="$partition" \
                experiments/update.sbatch $experiment_id $i)
 
     dependency=afterany:$update_id

diff --git a/experiments/update.sbatch b/experiments/update.sbatch
@@ -19,3 +19,4 @@ julia --color=no --project=experiments/$experiment_id -e '
     JLD2.save_object(joinpath(iter_path, "observation_map.jld2"), G_ensemble)
     CalibrateAtmos.update_ensemble(experiment_id, i)
 '
+echo "Update step for iteration $i complete"
diff --git a/experiments/utils/parse_commandline.sh b/experiments/utils/parse_commandline.sh
@@ -0,0 +1,86 @@
+# Sourced by pipeline.sh: parses command-line options and reads settings from the EKP config.
+# Default arguments
+slurm_time="2:00:00"
+slurm_ntasks="1"
+slurm_cpus_per_task="1"
+slurm_gpus_per_task="0"
+
+help_message="Usage:
+    ./pipeline.sh [options] experiment_id
+
+Options:
+    -t, --time=HH:MM:SS: Set max wallclock time (default: 2:00:00).
+    -n, --ntasks:        Set number of tasks to launch (default: 1).
+    -c, --cpus_per_task: Set CPU cores per task (mutually exclusive with -g, default: 1).
+    -g, --gpus_per_task: Set GPUs per task (mutually exclusive with -c, default: 0).
+    -h, --help:          Display this help message.
+
+Arguments:
+    experiment_id:   A unique identifier for your experiment (required).
+
+Notes:
+    Cannot specify both CPU and GPU resources.
+    Script exits with error on missing arguments or invalid options."
+
+# Parse arguments using getopt; exit if it rejects an option (short options take no separators)
+VALID_ARGS=$(getopt -o ht:n:c:g: --long help,time:,ntasks:,cpus_per_task:,gpus_per_task: -- "$@") || exit 1
+
+eval set -- "$VALID_ARGS"
+
+# Process arguments
+while :; do
+  case "$1" in
+    -t | --time)
+        slurm_time="$2"
+        shift 2
+        ;;
+    -n | --ntasks)
+        slurm_ntasks="$2"
+        shift 2
+        ;;
+    -c | --cpus_per_task)
+        slurm_cpus_per_task="$2"
+        shift 2
+        ;;
+    -g | --gpus_per_task)
+        slurm_gpus_per_task="$2"
+        shift 2
+        ;;
+    -h | --help)
+        printf "%s\n" "$help_message"
+        exit 0
+        ;;
+    --) shift; break ;;  # End of options
+    *) echo "Error: unhandled option '$1'." >&2; exit 1 ;;  # Never loop forever on an unmatched token
+  esac
+done
+
+# First remaining positional argument is the experiment ID
+experiment_id="$1"
+if [ -z "$experiment_id" ] ; then
+    echo "Error: No experiment ID provided." >&2
+    exit 1
+fi
+
+# Get values from EKP config file
+ensemble_size=$(grep "ensemble_size:" "experiments/$experiment_id/ekp_config.yml" | awk '{print $2}')
+n_iterations=$(grep "n_iterations:" "experiments/$experiment_id/ekp_config.yml" | awk '{print $2}')
+output=$(grep "output_dir:" "experiments/$experiment_id/ekp_config.yml" | awk '{print $2}')
+logfile="$output/experiment_log.out"
+
+# Set partition
+if [[ $slurm_gpus_per_task -gt 0 ]]; then
+    partition=gpu
+else
+    partition=expansion
+fi
+
+# Output slurm configuration
+echo "Running experiment: $experiment_id"
+indent="  └ "
+printf "Slurm configuration (per ensemble member):\n"
+printf "%sTime limit: %s\n" "$indent" "$slurm_time"
+printf "%sTasks: %s\n" "$indent" "$slurm_ntasks"
+printf "%sCPUs per task: %s\n" "$indent" "$slurm_cpus_per_task"
+printf "%sGPUs per task: %s\n" "$indent" "$slurm_gpus_per_task"
+echo ""