Skip to content

Commit

Permalink
Add initial CLI w/ GPU support
Browse files Browse the repository at this point in the history
  • Loading branch information
nefrathenrici committed Mar 18, 2024
1 parent e16d189 commit 128fa59
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 30 deletions.
3 changes: 2 additions & 1 deletion experiments/initialize.sbatch
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#!/bin/sh
#SBATCH --time=00:30:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --job init_calibration

# Instantiate the Julia project of a single experiment.
# Arguments:
#   $1 - experiment ID, i.e. the directory name under experiments/
experiment_id=$1

# Must be exported: a plain assignment stays local to this shell and would
# never reach the julia child process. Kept equal to --cpus-per-task above.
export JULIA_NUM_PRECOMPILE_TASKS=8

echo "Initializing calibration for experiment: $experiment_id"
julia --color=no --project="experiments/$experiment_id" -e 'using Pkg; Pkg.instantiate(;verbose=true)'
Expand Down
3 changes: 0 additions & 3 deletions experiments/model_run.sbatch
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
#!/bin/bash
#SBATCH --time=2:00:00
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=8G

# Extract command-line arguments
experiment_id=$1
Expand Down
45 changes: 19 additions & 26 deletions experiments/pipeline.sh
Original file line number Diff line number Diff line change
@@ -1,24 +1,13 @@
#!/bin/bash
# Configure the environment
export MODULEPATH=/groups/esm/modules:$MODULEPATH
module load climacommon/2024_02_27
source experiments/utils/parse_commandline.sh
# Make sure the output directory exists before Slurm is asked to write log
# files into it. `-p` also creates missing parent directories and succeeds
# when the directory is already there; an empty $output is skipped, as before.
if [[ -n "$output" ]]; then
  mkdir -p -- "$output"
fi

# Parse command line
experiment_id=${1?Error: no experiment ID given}
tasks_per_model_run=${2?Error: no tasks per model run given}

# Get ensemble size, number of iterations, and output dir from EKP config file
ensemble_size=$(grep "ensemble_size:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
n_iterations=$(grep "n_iterations:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')
output=$(grep "output_dir:" experiments/$experiment_id/ekp_config.yml | awk '{print $2}')

mkdir $output

echo "Running experiment $experiment_id with $tasks_per_model_run tasks per model run"
# Initialize the project and setup calibration
init_id=$(sbatch --parsable \
--output=$output/log.out \
--open-mode=append \
--partition=expansion \
--output=$logfile \
--partition=$partition \
experiments/initialize.sbatch $experiment_id)
echo "Initialization job_id: $init_id"
echo ""
Expand All @@ -32,22 +21,26 @@ do

ensemble_array_id=$(
sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
--job=model-$i \
--output=/dev/null \
--ntasks=$tasks_per_model_run \
--array=1-$ensemble_size \
--partition=expansion \
experiments/model_run.sbatch $experiment_id $i)
--job=model-$i \
--output=/dev/null \
--array=1-$ensemble_size \
--time=$slurm_time \
--ntasks=$slurm_ntasks \
--partition=$partition \
--cpus-per-task=$slurm_cpus_per_task \
--gpus-per-task=$slurm_gpus_per_task \
experiments/model_run.sbatch $experiment_id $i
)

dependency=afterany:$ensemble_array_id
echo "Iteration $i job id: $ensemble_array_id"

update_id=$(
sbatch --dependency=$dependency --kill-on-invalid-dep=yes --parsable \
--job=update-$i \
--output=$output/log.out \
--output=$logfile \
--open-mode=append \
--partition=expansion \
--partition=$partition \
experiments/update.sbatch $experiment_id $i)

dependency=afterany:$update_id
Expand Down
1 change: 1 addition & 0 deletions experiments/update.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ julia --color=no --project=experiments/$experiment_id -e '
JLD2.save_object(joinpath(iter_path, "observation_map.jld2"), G_ensemble)
CalibrateAtmos.update_ensemble(experiment_id, i)
'
echo "Update step for iteration $i complete"
86 changes: 86 additions & 0 deletions experiments/utils/parse_commandline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Parse the pipeline command line and derive the Slurm / EKP settings.
#
# This file is meant to be sourced (pipeline.sh does so), so the variables
# below are set in the caller's shell and every `exit` here also terminates
# the sourcing script.
#
# Sets: slurm_time, slurm_ntasks, slurm_cpus_per_task, slurm_gpus_per_task,
#       experiment_id, ensemble_size, n_iterations, output, logfile, partition

# Default arguments
slurm_time="2:00:00"
slurm_ntasks="1"
slurm_cpus_per_task="1"
slurm_gpus_per_task="0"

# The defaults are interpolated from the variables above so the help text
# cannot drift away from the values that are actually used.
# NOTE(review): the text states that -c and -g are mutually exclusive, but
# nothing in this file enforces that -- confirm the intended behaviour.
help_message="Usage:
./pipeline.sh [options] experiment_id
Options:
-t, --time=HH:MM:SS: Set max wallclock time (default: $slurm_time).
-n, --ntasks: Set number of tasks to launch (default: $slurm_ntasks).
-c, --cpus_per_task: Set CPU cores per task (mutually exclusive with -g, default: $slurm_cpus_per_task).
-g, --gpus_per_task: Set GPUs per task (mutually exclusive with -c, default: $slurm_gpus_per_task).
-h, --help: Display this help message.
Arguments:
experiment_id: A unique identifier for your experiment (required).
Notes:
Cannot specify both CPU and GPU resources.
Script exits with error on missing arguments or invalid options."

# Parse arguments using getopt. Short options are listed without commas:
# in the short optstring a comma would itself be read as an option character.
if ! VALID_ARGS=$(getopt -o ht:n:c:g: --long help,time:,ntasks:,cpus_per_task:,gpus_per_task: -- "$@"); then
  exit 1
fi

# getopt emits a shell-quoted, normalised argument list; re-load it as the
# positional parameters.
eval set -- "$VALID_ARGS"

# Process arguments
while true; do
  case "$1" in
    -t | --time)
      slurm_time="$2"
      shift 2
      ;;
    -n | --ntasks)
      slurm_ntasks="$2"
      shift 2
      ;;
    -c | --cpus_per_task)
      slurm_cpus_per_task="$2"
      shift 2
      ;;
    -g | --gpus_per_task)
      slurm_gpus_per_task="$2"
      shift 2
      ;;
    -h | --help)
      printf "%s\n" "$help_message"
      exit 0
      ;;
    --) # End of options
      shift
      break
      ;;
    *) # Without this arm an unhandled token would loop forever
      printf 'Error: unexpected argument: %s\n' "$1" >&2
      exit 1
      ;;
  esac
done

experiment_id="${1:-}"
if [[ -z "$experiment_id" ]]; then
  echo "Error: No experiment ID provided." >&2
  exit 1
fi

# Get values from EKP config file
ekp_config="experiments/$experiment_id/ekp_config.yml"
if [[ ! -r "$ekp_config" ]]; then
  printf 'Error: cannot read EKP config file: %s\n' "$ekp_config" >&2
  exit 1
fi

# Match the key as the exact first field and stop at the first hit, so that
# commented-out lines or keys that merely end in the same text are ignored.
ensemble_size=$(awk '$1 == "ensemble_size:" { print $2; exit }' "$ekp_config")
n_iterations=$(awk '$1 == "n_iterations:" { print $2; exit }' "$ekp_config")
output=$(awk '$1 == "output_dir:" { print $2; exit }' "$ekp_config")

if [[ -z "$ensemble_size" || -z "$n_iterations" || -z "$output" ]]; then
  printf 'Error: %s must define ensemble_size, n_iterations and output_dir.\n' "$ekp_config" >&2
  exit 1
fi
logfile=$output/experiment_log.out

# Set partition
if [[ "$slurm_gpus_per_task" -gt 0 ]]; then
  partition=gpu
else
  partition=expansion
fi

# Output slurm configuration
echo "Running experiment: $experiment_id"
indent=""
printf "Slurm configuration (per ensemble member):\n"
printf "%sTime limit: %s\n" "$indent" "$slurm_time"
printf "%sTasks: %s\n" "$indent" "$slurm_ntasks"
printf "%sCPUs per task: %s\n" "$indent" "$slurm_cpus_per_task"
printf "%sGPUs per task: %s\n" "$indent" "$slurm_gpus_per_task"
echo ""

0 comments on commit 128fa59

Please sign in to comment.