Commit

init
add error up climacommons
test error
test
wait
split cmds
split cmds
split to diff steps
gp fix
no {
rm init
one ln cmd
clean
options
now sol passes
test to fail on BK
revs
try
try
pip fix
try
no bl ln
try
try
sep script
sep script
sep script
sep script
try
try
try
revert fail test
exit 0
try
try
fail test
print status
status wait time fix; test to fail
try to fail, turn off depot
test to pass
LenkaNovak committed Apr 2, 2024
1 parent 4e528c2 commit 6e5e843
Showing 7 changed files with 112 additions and 34 deletions.
14 changes: 8 additions & 6 deletions .buildkite/pipeline.yml
@@ -1,6 +1,6 @@
agents:
queue: new-central
slurm_time: 24:00:00
slurm_time: 4:00:00
modules: climacommon/2024_03_18

env:
@@ -344,8 +344,6 @@ steps:
slurm_ntasks: 2
slurm_mem: 16GB

- label: "batch script"
command: "sbatch test/mpi_tests/local_checks.sh"

# short high-res performance test
- label: "Unthreaded AMIP FINE" # also reported by longruns with a flame graph
@@ -431,9 +429,13 @@ steps:
slurm_mem: 20GB
slurm_gpus: 1



- wait
- group: "Bash scripts"
steps:
- label: "Submit and Monitor sbatch Job on Caltech HPC"
# check that (1) the script can be successfully submitted and (2) it runs successfully
command: "test/mpi_tests/test_sbatch_script.sh"
agents:
slurm_ntasks: 1

- wait
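The new group delegates the pass/fail decision to the wrapper script's exit status, which is what Buildkite uses to mark the step. Below is a minimal sketch of a local dry run of that step, assuming a Slurm-equipped login node and the repository root as the working directory (the checkout path is hypothetical):

```bash
# Preview what Buildkite will see when it runs the new step.
# Assumes: sbatch/scontrol on PATH, repository root as cwd.
cd /path/to/ClimaCoupler.jl   # hypothetical checkout location
bash test/mpi_tests/test_sbatch_script.sh
echo "exit status: $?"        # 0 means the Buildkite step passes
```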

2 changes: 1 addition & 1 deletion .gitignore
@@ -50,7 +50,7 @@ experiments/AMIP/output/*
*.so

# internal tests
testdel.jl
slurm-*.out

# ignore vscode artifacts
*.vscode
19 changes: 19 additions & 0 deletions config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml
@@ -0,0 +1,19 @@
anim: false
apply_limiter: false
dt: "400secs"
dt_cpl: 400
dt_save_restart: "10days"
dt_save_to_sol: "1days"
energy_check: false
h_elem: 6
hourly_checkpoint: true
hourly_checkpoint_dt: 1
job_id: "coarse_single_ft64_hourly_checkpoints_restart"
mode_name: "amip"
moist: "equil"
mono_surface: false
precip_model: "0M"
rad: "gray"
run_name: "coarse_single_ft64_hourly_checkpoints_restart"
t_end: "800secs"
vert_diff: "true"
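The keys in this new config mirror the driver's CLI flags one-to-one (compare cli_options.jl below), with hourly_checkpoint_dt: 1 shortening the default 480-hour checkpoint interval. A hedged sketch of running the driver against it, with the invocation and path taken from local_checks.sh further down:

```bash
# Launch the coupled driver with this config; YAML keys stand in for
# the equivalent CLI flags (e.g. --hourly_checkpoint_dt 1).
export RUN_NAME=coarse_single_ft64_hourly_checkpoints_restart
julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl \
  --config_file config/model_configs/${RUN_NAME}.yml
```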
4 changes: 4 additions & 0 deletions experiments/AMIP/cli_options.jl
@@ -38,6 +38,10 @@ function argparse_settings()
help = "Boolean flag indicating whether to checkpoint at intervals of 1 hour or multiple hours"
arg_type = Bool
default = false
"--hourly_checkpoint_dt"
help = "Time interval between checkpoints, in hours (default: 480 hours = 20 days)"
arg_type = Int
default = 480
"--coupler_output_dir"
help = "Directory to save output files. Note that TempestRemap fails if interactive and paths are too long."
arg_type = String
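The new default of 480 hours corresponds to the 20 days noted in the help string (480 / 24 = 20). A hedged command-line override, using flag names from this diff (the run name is a placeholder):

```bash
# Checkpoint every simulated hour instead of the 480-hour (20-day) default.
julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl \
  --hourly_checkpoint true --hourly_checkpoint_dt 1 --run_name my_test_run
```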
10 changes: 8 additions & 2 deletions experiments/AMIP/coupler_driver.jl
@@ -131,6 +131,7 @@ saveat = Float64(time_to_seconds(config_dict["dt_save_to_sol"]))
date0 = date = DateTime(config_dict["start_date"], dateformat"yyyymmdd")
mono_surface = config_dict["mono_surface"]
hourly_checkpoint = config_dict["hourly_checkpoint"]
hourly_checkpoint_dt = config_dict["hourly_checkpoint_dt"]
restart_dir = config_dict["restart_dir"]
restart_t = Int(config_dict["restart_t"])
evolving_ocean = config_dict["evolving_ocean"]
@@ -526,8 +527,13 @@ The currently implemented callbacks are:
being approximated from wind speed). It is updated at the same frequency as the atmospheric radiation.
NB: Eventually, we will call all of radiation from the coupler, in addition to the albedo calculation.
=#
checkpoint_cb =
HourlyCallback(dt = FT(480), func = checkpoint_sims, ref_date = [dates.date[1]], active = hourly_checkpoint) # 20 days

checkpoint_cb = HourlyCallback(
dt = hourly_checkpoint_dt,
func = checkpoint_sims,
ref_date = [dates.date[1]],
active = hourly_checkpoint,
) # default: 480 hours = 20 days
update_firstdayofmonth!_cb =
MonthlyCallback(dt = FT(1), func = update_firstdayofmonth!, ref_date = [dates.date1[1]], active = true)
dt_water_albedo = parse(FT, filter(x -> !occursin(x, "hours"), dt_rad))
65 changes: 40 additions & 25 deletions test/mpi_tests/local_checks.sh
@@ -1,29 +1,25 @@
#!/bin/bash
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --job-name=mpi_restart_test
#SBATCH --reservation=clima
#SBATCH --mem=32GB
#SBATCH --ntasks=2
#SBATCH --job-name=mpi_amip
#SBATCH --time=24:00:00
#SBATCH --mem-per-cpu=16G
#SBATCH --partition=expansion

# TODO: this needs to be updated (+ implement better tests that are caught on Buildkite) #667

export MODULEPATH="/groups/esm/modules:$MODULEPATH"
module purge
module load julia/1.10.1
export JULIA_MPI_BINARY=system
export JULIA_NUM_THREADS=${SLURM_CPUS_PER_TASK:=1}
export CLIMACORE_DISTRIBUTED="MPI"
export JULIA_HDF5_PATH=""
module load climacommon/2024_03_18

export RUN_NAME=amip_restart_mpi_test
export CC_PATH=$(pwd)/ # adjust this to the path of your ClimaCoupler.jl directory
export RUN_NAME=coarse_single_ft64_hourly_checkpoints_restart
export CONFIG_FILE=${CC_PATH}config/model_configs/${RUN_NAME}.yml
export RESTART_DIR=experiments/AMIP/output/amip/${RUN_NAME}_artifacts/
export RESTART_T=200

julia -e 'using Pkg; Pkg.add("MPIPreferences"); using MPIPreferences; use_system_binary()'
julia --project -e 'using Pkg; Pkg.instantiate()'
julia --project -e 'using Pkg; Pkg.build("MPI")'
julia --project -e 'using Pkg; Pkg.build("HDF5")'
julia --project -e 'using Pkg; Pkg.API.precompile()'
export OPENBLAS_NUM_THREADS=1
export JULIA_NVTX_CALLBACKS=gc
export OMPI_MCA_opal_warn_on_missing_libcuda=0
export JULIA_MAX_NUM_PRECOMPILE_FILES=100
export JULIA_CPU_TARGET='broadwell;skylake;icelake;cascadelake;epyc'
export SLURM_KILL_BAD_EXIT=1

julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'
julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.precompile()'
@@ -34,10 +30,29 @@ julia --project=artifacts -e 'using Pkg; Pkg.precompile()'
julia --project=artifacts -e 'using Pkg; Pkg.status()'
julia --project=artifacts artifacts/download_artifacts.jl

# run spin up
# - specify `--hourly_checkpoint true` to save monthly checkpoints of all model prognostic states
mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --start_date 19790101 --hourly_checkpoint true --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64
srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_FILE

# restart from simulation time of 400 seconds
export RESTART_T=400

# set up the new config file with an amended end time (t_end), so the restarted run produces more checkpoints
export RESTART_CONFIG_FILE=${CONFIG_FILE::-4}_tmp.yml
cp $CONFIG_FILE $RESTART_CONFIG_FILE
sed -i 's/t_end: \"800secs\"/t_end: \"3600secs\"/g' $RESTART_CONFIG_FILE

# rerun the model
srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $RESTART_CONFIG_FILE --restart_dir $RESTART_DIR --restart_t $RESTART_T

# throw an error if fewer than 5 checkpoint files are found
if [ $(ls -1 $RESTART_DIR/checkpoint | wc -l) -lt 5 ]; then
echo "Error: RESTART_DIR does not contain enough files"
exit 1
else
echo "Successful: RESTART_DIR contains $(ls -1 $RESTART_DIR/checkpoint | wc -l) files"
exit 0
fi

# init using a restart
# - specify the directory of the `checkpoint/` folder (i.e., `--restart_dir`) and time (in secs; `--restart_t`) of the restart file
mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --restart_dir $RESTART_DIR --restart_t $RESTART_T --start_date 19790102 --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64
# Troubleshooting
# - ensure you're using the latest module file of climacommon and set MODULEPATH to the correct location
# - ensure you're using the latest version of ClimaCoupler.jl
# - did you cd to your version of ClimaCoupler.jl?
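After the job finishes, the checkpoint-count assertion above can be reproduced by hand; the directory comes from the RESTART_DIR export earlier in this script and the threshold of 5 from the check above:

```bash
# Count the checkpoint files the test asserts on (it expects >= 5).
export RESTART_DIR=experiments/AMIP/output/amip/coarse_single_ft64_hourly_checkpoints_restart_artifacts/
ls -1 "$RESTART_DIR/checkpoint" | wc -l
```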
32 changes: 32 additions & 0 deletions test/mpi_tests/test_sbatch_script.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# This script submits a job to the Slurm scheduler and waits for it to finish. It
# reports the job status every 30 seconds until the job completes. If the job
# fails or is terminated, the script prints an error message and exits with a
# non-zero status code. This is used by Buildkite to determine whether the job
# truly succeeded or failed.

# Submit the sbatch script and capture its job ID
JOB_ID=$(sbatch test/mpi_tests/local_checks.sh | awk '{print $4}')
echo "Submitted job with ID: $JOB_ID, output log: slurm-$JOB_ID.out"
START_TIME=$(date +%s)
# Loop until the job finishes
while true; do
# Check the status of the job
STATUS=$(scontrol show job $JOB_ID | grep -oP 'JobState=\K\S+')
sleep 30
ELAPSED_TIME=$(( $(date +%s) - $START_TIME ))
# If scontrol reports PENDING or RUNNING (or nothing yet), wait and keep checking
if [ "$STATUS" == "" ] || [ "$STATUS" == "PENDING" ] || [ "$STATUS" == "RUNNING" ]; then
echo "Job is still running... Elapsed time: $ELAPSED_TIME seconds."
# If the job status is COMPLETED, print a success message and exit
elif [ "$STATUS" == "COMPLETED" ]; then
echo "Job completed successfully."
exit 0
# Any other status means the job failed or was terminated: print an error and exit
else
echo "Error: Job failed or terminated. See slurm-$JOB_ID.out for more information."
cat "slurm-$JOB_ID.out"
exit 1
fi
done
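One caveat with polling scontrol: Slurm drops finished jobs from scontrol's view after MinJobAge, at which point STATUS comes back empty and the loop above keeps waiting. A hedged alternative for jobs that have already finished is sacct, assuming job accounting is enabled on the cluster:

```bash
# Query the terminal state from Slurm's accounting database; this keeps
# working after the job has aged out of `scontrol show job`.
STATUS=$(sacct -j "$JOB_ID" --format=State --noheader | head -n 1 | xargs)
echo "final state: $STATUS"   # e.g. COMPLETED, FAILED, TIMEOUT
```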
