Make the calibration pipeline's Slurm resources configurable from the command line.

* initialize.sbatch: request 8 CPUs and export JULIA_NUM_PRECOMPILE_TASKS so
  the julia child process actually sees it (a plain shell assignment is not
  inherited by child processes).
* model_run.sbatch: drop the hard-coded #SBATCH limits. pipeline.sh now passes
  --time and --cpus-per-task on the sbatch command line; --mem-per-cpu is no
  longer set anywhere in this patch.
* pipeline.sh: source experiments/utils/parse_commandline.sh for option
  parsing, create the output directory with quoting and mkdir -p, and take the
  log file and partition from the parsed settings.
* update.sbatch: print a completion message after the update step.
* utils/parse_commandline.sh (new): getopt-based option parsing; the help text
  reports the actual --cpus_per_task default (1).

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Indentation and the position of blank context lines are reconstructed from the
hunk line counts and should be confirmed against the tree before applying
(git apply --ignore-whitespace may be needed). The index hashes are carried
over unchanged and no longer match the edited post-images.

diff --git a/experiments/initialize.sbatch b/experiments/initialize.sbatch
index 1331c442..c30e612a 100644
--- a/experiments/initialize.sbatch
+++ b/experiments/initialize.sbatch
@@ -1,10 +1,11 @@
 #!/bin/sh
 #SBATCH --time=00:30:00
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=1
+#SBATCH --cpus-per-task=8
 #SBATCH --job init_calibration
 
 experiment_id=$1
+export JULIA_NUM_PRECOMPILE_TASKS=8
 
 echo "Initializing calibration for experiment: $experiment_id"
 julia --color=no --project=experiments/$experiment_id -e 'using Pkg; Pkg.instantiate(;verbose=true)'
diff --git a/experiments/model_run.sbatch b/experiments/model_run.sbatch
index 03481b64..a8b6b64a 100644
--- a/experiments/model_run.sbatch
+++ b/experiments/model_run.sbatch
@@ -1,7 +1,4 @@
 #!/bin/bash
-#SBATCH --time=2:00:00
-#SBATCH --cpus-per-task=8
-#SBATCH --mem-per-cpu=8G
 
 # Extract command-line arguments
 experiment_id=$1
diff --git a/experiments/pipeline.sh b/experiments/pipeline.sh
index 9f714f7b..a891654f 100755
--- a/experiments/pipeline.sh
+++ b/experiments/pipeline.sh
@@ -1,24 +1,13 @@
 #!/bin/bash
-# Configure the environment
-export MODULEPATH=/groups/esm/modules:$MODULEPATH
-module load climacommon/2024_02_27
+source experiments/utils/parse_commandline.sh
+if [ ! -d "$output" ] ; then
+    mkdir -p "$output"
+fi
 
-# Parse command line
-experiment_id=${1?Error: no experiment ID given}
-tasks_per_model_run=${2?Error: no tasks per model run given}
-
-# Get ensemble size, number of iterations, and output dir from EKP config file
-ensemble_size=$(grep "ensemble_size:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-n_iterations=$(grep "n_iterations:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-output=$(grep "output_dir:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
-
-mkdir $output
-
-echo "Running experiment $experiment_id with $tasks_per_model_run tasks per model run"
+# Initialize the project and setup calibration
 init_id=$(sbatch --parsable \
-    --output=$output/log.out \
-    --open-mode=append \
-    --partition=expansion \
+    --output=$logfile \
+    --partition=$partition \
     experiments/initialize.sbatch $experiment_id)
 echo "Initialization job_id: $init_id"
 echo ""
@@ -32,12 +21,16 @@ do
 
     ensemble_array_id=$(
     sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
-        --job=model-$i \
-        --output=/dev/null \
-        --ntasks=$tasks_per_model_run \
-        --array=1-$ensemble_size \
-        --partition=expansion \
-        experiments/model_run.sbatch $experiment_id $i)
+            --job=model-$i \
+            --output=/dev/null \
+            --array=1-$ensemble_size \
+            --time=$slurm_time \
+            --ntasks=$slurm_ntasks \
+            --partition=$partition \
+            --cpus-per-task=$slurm_cpus_per_task \
+            --gpus-per-task=$slurm_gpus_per_task \
+            experiments/model_run.sbatch $experiment_id $i
+    )
 
     dependency=afterany:$ensemble_array_id
     echo "Iteration $i job id: $ensemble_array_id"
@@ -45,9 +38,9 @@ do
     update_id=$(
     sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
         --job=update-$i \
-        --output=$output/log.out \
+        --output=$logfile \
         --open-mode=append \
-        --partition=expansion \
+        --partition=$partition \
         experiments/update.sbatch $experiment_id $i)
 
     dependency=afterany:$update_id
diff --git a/experiments/update.sbatch b/experiments/update.sbatch
index 068428a4..f9fb5a2f 100644
--- a/experiments/update.sbatch
+++ b/experiments/update.sbatch
@@ -19,3 +19,4 @@ julia --color=no --project=experiments/$experiment_id -e '
     JLD2.save_object(joinpath(iter_path, "observation_map.jld2"), G_ensemble)
     CalibrateAtmos.update_ensemble(experiment_id, i)
 '
+echo "Update step for iteration $i complete"
diff --git a/experiments/utils/parse_commandline.sh b/experiments/utils/parse_commandline.sh
new file mode 100644
index 00000000..c60a90af
--- /dev/null
+++ b/experiments/utils/parse_commandline.sh
@@ -0,0 +1,92 @@
+# Parse the pipeline.sh command line and derive the Slurm settings.
+# Meant to be sourced: "$@" below is the sourcing script's argument list,
+# and every exit terminates that script.
+# Sets: slurm_time, slurm_ntasks, slurm_cpus_per_task, slurm_gpus_per_task,
+#       experiment_id, ensemble_size, n_iterations, output, logfile, partition.
+
+# Default arguments
+slurm_time="2:00:00"
+slurm_ntasks="1"
+slurm_cpus_per_task="1"
+slurm_gpus_per_task="0"
+
+help_message="Usage:
+    ./pipeline.sh [options] experiment_id
+
+Options:
+    -t, --time=HH:MM:SS: Set max wallclock time (default: 2:00:00).
+    -n, --ntasks: Set number of tasks to launch (default: 1).
+    -c, --cpus_per_task: Set CPU cores per task (mutually exclusive with -g, default: 1).
+    -g, --gpus_per_task: Set GPUs per task (mutually exclusive with -c, default: 0).
+    -h, --help: Display this help message.
+
+Arguments:
+    experiment_id: A unique identifier for your experiment (required).
+
+Notes:
+    Cannot specify both CPU and GPU resources.
+    Script exits with error on missing arguments or invalid options."
+
+# Parse arguments using getopt
+VALID_ARGS=$(getopt -o h,t:,n:,c:,g: --long help,time:,ntasks:,cpus_per_task:,gpus_per_task: -- "$@")
+if [[ $? -ne 0 ]]; then
+    exit 1;
+fi
+
+eval set -- "$VALID_ARGS"
+
+# Process arguments
+while true; do
+  case "$1" in
+    -t | --time)
+        slurm_time="$2"
+        shift 2
+        ;;
+    -n | --ntasks)
+        slurm_ntasks="$2"
+        shift 2
+        ;;
+    -c | --cpus_per_task)
+        slurm_cpus_per_task="$2"
+        shift 2
+        ;;
+    -g | --gpus_per_task)
+        slurm_gpus_per_task="$2"
+        shift 2
+        ;;
+    -h | --help)
+        printf "%s\n" "$help_message"
+        exit 0
+        ;;
+    --) shift; break ;;  # End of options
+  esac
+done
+
+experiment_id="$1"
+if [ -z "$experiment_id" ] ; then
+    echo "Error: No experiment ID provided." >&2
+    exit 1
+fi
+
+# Get values from EKP config file
+ensemble_size=$(grep "ensemble_size:" "experiments/$experiment_id/ekp_config.yml" | awk '{print $2}')
+n_iterations=$(grep "n_iterations:" "experiments/$experiment_id/ekp_config.yml" | awk '{print $2}')
+output=$(grep "output_dir:" "experiments/$experiment_id/ekp_config.yml" | awk '{print $2}')
+logfile=$output/experiment_log.out
+
+# Set partition
+if [[ $slurm_gpus_per_task -gt 0 ]]; then
+    partition=gpu
+else
+    partition=expansion
+fi
+
+# Output slurm configuration
+echo "Running experiment: $experiment_id"
+indent=" └ "
+printf "Slurm configuration (per ensemble member):\n"
+printf "%sTime limit: %s\n" "$indent" "$slurm_time"
+printf "%sTasks: %s\n" "$indent" "$slurm_ntasks"
+printf "%sCPUs per task: %s\n" "$indent" "$slurm_cpus_per_task"
+printf "%sGPUs per task: %s\n" "$indent" "$slurm_gpus_per_task"
+echo ""