From 6e5e843321fa64ab71a4092211925a2ffd289769 Mon Sep 17 00:00:00 2001 From: lenka Date: Thu, 14 Mar 2024 11:30:05 -0700 Subject: [PATCH] init add error up climacommons test error test wait split cmds split cmds split to diff steps gp fix no { rm init one ln cmd clean options now sol passes test to fail on BK revs try try pip fix try no bl ln try try sep script sep script sep script sep script try try try revert fail test exit 0 try try fail test print status status wait time fix; test to fail try to fail, turn off depot test to pass --- .buildkite/pipeline.yml | 14 ++-- .gitignore | 2 +- ...single_ft64_hourly_checkpoints_restart.yml | 19 ++++++ experiments/AMIP/cli_options.jl | 4 ++ experiments/AMIP/coupler_driver.jl | 10 ++- test/mpi_tests/local_checks.sh | 65 ++++++++++++------- test/mpi_tests/test_sbatch_script.sh | 32 +++++++++ 7 files changed, 112 insertions(+), 34 deletions(-) create mode 100644 config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml create mode 100755 test/mpi_tests/test_sbatch_script.sh diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index bfbc6b4fb..01358dc04 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,6 +1,6 @@ agents: queue: new-central - slurm_time: 24:00:00 + slurm_time: 4:00:00 modules: climacommon/2024_03_18 env: @@ -344,8 +344,6 @@ steps: slurm_ntasks: 2 slurm_mem: 16GB - - label: "batch script" - command: "sbatch test/mpi_tests/local_checks.sh" # short high-res performance test - label: "Unthreaded AMIP FINE" # also reported by longruns with a flame graph @@ -431,9 +429,13 @@ steps: slurm_mem: 20GB slurm_gpus: 1 - - - - wait + - group: "Bash scripts" + steps: + - label: "Submit and Monitor sbatch Job on Caltech HPC" + # check that (1) the script can be successfully submitted, (2) it runs successfully + command: "test/mpi_tests/test_sbatch_script.sh" + agents: + slurm_ntasks: 1 - wait diff --git a/.gitignore b/.gitignore index 2b1054497..c225ec897 100644 --- a/.gitignore +++ 
b/.gitignore @@ -50,7 +50,7 @@ experiments/AMIP/output/* *.so # internal tests -testdel.jl +slurm-*.out # ignore vscode artifacts *.vscode diff --git a/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml b/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml new file mode 100644 index 000000000..a99b64cea --- /dev/null +++ b/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml @@ -0,0 +1,19 @@ +anim: false +apply_limiter: false +dt: "400secs" +dt_cpl: 400 +dt_save_restart: "10days" +dt_save_to_sol: "1days" +energy_check: false +h_elem: 6 +hourly_checkpoint: true +hourly_checkpoint_dt: 1 +job_id: "coarse_single_ft64_hourly_checkpoints_restart" +mode_name: "amip" +moist: "equil" +mono_surface: false +precip_model: "0M" +rad: "gray" +run_name: "coarse_single_ft64_hourly_checkpoints_restart" +t_end: "800secs" +vert_diff: "true" diff --git a/experiments/AMIP/cli_options.jl b/experiments/AMIP/cli_options.jl index 6e69b5f1e..9bab51e06 100644 --- a/experiments/AMIP/cli_options.jl +++ b/experiments/AMIP/cli_options.jl @@ -38,6 +38,10 @@ function argparse_settings() help = "Boolean flag indicating whether to checkpoint at intervals of 1 hour or multiple hours" arg_type = Bool default = false + "--hourly_checkpoint_dt" + help = "Time interval for hourly checkpointing in hours (20 days by default)" + arg_type = Int + default = 480 "--coupler_output_dir" help = "Directory to save output files. Note that TempestRemap fails if interactive and paths are too long." 
arg_type = String diff --git a/experiments/AMIP/coupler_driver.jl b/experiments/AMIP/coupler_driver.jl index fa47c2636..fc0b75bf4 100644 --- a/experiments/AMIP/coupler_driver.jl +++ b/experiments/AMIP/coupler_driver.jl @@ -131,6 +131,7 @@ saveat = Float64(time_to_seconds(config_dict["dt_save_to_sol"])) date0 = date = DateTime(config_dict["start_date"], dateformat"yyyymmdd") mono_surface = config_dict["mono_surface"] hourly_checkpoint = config_dict["hourly_checkpoint"] +hourly_checkpoint_dt = config_dict["hourly_checkpoint_dt"] restart_dir = config_dict["restart_dir"] restart_t = Int(config_dict["restart_t"]) evolving_ocean = config_dict["evolving_ocean"] @@ -526,8 +527,13 @@ The currently implemented callbacks are: being approximated from wind speed). It is updated at the same frequency as the atmospheric radiation. NB: Eventually, we will call all of radiation from the coupler, in addition to the albedo calculation. =# -checkpoint_cb = - HourlyCallback(dt = FT(480), func = checkpoint_sims, ref_date = [dates.date[1]], active = hourly_checkpoint) # 20 days + +checkpoint_cb = HourlyCallback( + dt = hourly_checkpoint_dt, + func = checkpoint_sims, + ref_date = [dates.date[1]], + active = hourly_checkpoint, +) # 20 days update_firstdayofmonth!_cb = MonthlyCallback(dt = FT(1), func = update_firstdayofmonth!, ref_date = [dates.date1[1]], active = true) dt_water_albedo = parse(FT, filter(x -> !occursin(x, "hours"), dt_rad)) diff --git a/test/mpi_tests/local_checks.sh b/test/mpi_tests/local_checks.sh index 08213273f..cb4f49c4f 100644 --- a/test/mpi_tests/local_checks.sh +++ b/test/mpi_tests/local_checks.sh @@ -1,29 +1,25 @@ #!/bin/bash -#SBATCH --time=24:00:00 -#SBATCH --nodes=1 -#SBATCH --job-name=mpi_restart_test -#SBATCH --reservation=clima -#SBATCH --mem=32GB #SBATCH --ntasks=2 +#SBATCH --job-name=mpi_amip +#SBATCH --time=24:00:00 +#SBATCH --mem-per-cpu=16G +#SBATCH --partition=expansion -# TODO: this needs to be updated (+ implement better tests that are caught on 
Buildkite) #667 - +export MODULEPATH="/groups/esm/modules:$MODULEPATH" module purge -module load julia/1.10.1 -export JULIA_MPI_BINARY=system -export JULIA_NUM_THREADS=${SLURM_CPUS_PER_TASK:=1} -export CLIMACORE_DISTRIBUTED="MPI" -export JULIA_HDF5_PATH="" +module load climacommon/2024_03_18 -export RUN_NAME=amip_restart_mpi_test +export CC_PATH=$(pwd)/ # adjust this to the path of your ClimaCoupler.jl directory +export RUN_NAME=coarse_single_ft64_hourly_checkpoints_restart +export CONFIG_FILE=${CC_PATH}config/model_configs/${RUN_NAME}.yml export RESTART_DIR=experiments/AMIP/output/amip/${RUN_NAME}_artifacts/ -export RESTART_T=200 -julia -e 'using Pkg; Pkg.add("MPIPreferences"); using MPIPreferences; use_system_binary()' -julia --project -e 'using Pkg; Pkg.instantiate()' -julia --project -e 'using Pkg; Pkg.build("MPI")' -julia --project -e 'using Pkg; Pkg.build("HDF5")' -julia --project -e 'using Pkg; Pkg.API.precompile()' +export OPENBLAS_NUM_THREADS=1 +export JULIA_NVTX_CALLBACKS=gc +export OMPI_MCA_opal_warn_on_missing_libcuda=0 +export JULIA_MAX_NUM_PRECOMPILE_FILES=100 +export JULIA_CPU_TARGET='broadwell;skylake;icelake;cascadelake;epyc' +export SLURM_KILL_BAD_EXIT=1 julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.instantiate(;verbose=true)' julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.precompile()' @@ -34,10 +30,29 @@ julia --project=artifacts -e 'using Pkg; Pkg.precompile()' julia --project=artifacts -e 'using Pkg; Pkg.status()' julia --project=artifacts artifacts/download_artifacts.jl -# run spin up -# - specify `--hourly_checkpoint true` to save monthly checkpoints of all model prognostic states -mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --start_date 19790101 --hourly_checkpoint true --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M 
--z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64 +srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_FILE + +# restart from simulation time of 400 seconds +export RESTART_T=400 + +# set up the new config file with amended checkpointing frequency +export RESTART_CONFIG_FILE=${CONFIG_FILE::-4}_tmp.yml +cp $CONFIG_FILE $RESTART_CONFIG_FILE +sed -i 's/t_end: \"800secs\"/t_end: \"3600secs\"/g' $RESTART_CONFIG_FILE + +# rerun the model +srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $RESTART_CONFIG_FILE --restart_dir $RESTART_DIR --restart_t $RESTART_T + +# throw an error if no restart checkpoint files are found +if [ $(ls -1 $RESTART_DIR/checkpoint | wc -l) -lt 5 ]; then + echo "Error: RESTART_DIR does not contain enough files" + exit 1 +else + echo "Successful: RESTART_DIR contains $(ls -1 $RESTART_DIR/checkpoint | wc -l) files" + exit 0 +fi -# init using a restart -# - specify the directory of the `checkpoint/` folder (i.e., `--restart_dir`) and time (in secs; `--restart_t`) of the restart file -mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --restart_dir $RESTART_DIR --restart_t $RESTART_T --start_date 19790102 --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64 +# Troubleshooting: 
+# - ensure you're using the latest module file of climacommon and set MODULEPATH to the correct location +# - ensure you're using the latest version of ClimaCoupler.jl +# - did you cd to your version of ClimaCoupler.jl? diff --git a/test/mpi_tests/test_sbatch_script.sh b/test/mpi_tests/test_sbatch_script.sh new file mode 100755 index 000000000..83f1a9ac7 --- /dev/null +++ b/test/mpi_tests/test_sbatch_script.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# This script submits a job to the Slurm scheduler and waits for it to finish. It +# reports the job status every 30 seconds until the job completes. If the job +# fails or is terminated, the script prints an error message and exits with a +# non-zero status code. This is used by Buildkite to determine whether the job +# truly succeeded or failed. + +# Submit the sbatch script and capture its job ID +JOB_ID=$(sbatch test/mpi_tests/local_checks.sh | awk '{print $4}') +echo "Submitted job with ID: $JOB_ID, output log: slurm-$JOB_ID.out" +START_TIME=$(date +%s) +# Loop until the job finishes +while true; do + # Check the status of the job + STATUS=$(scontrol show job $JOB_ID | grep -oP 'JobState=\K\S+') + sleep 30 + ELAPSED_TIME=$(( $(date +%s) - $START_TIME )) + # If the job status is empty (job transitioning), 'PENDING', or 'RUNNING', wait and continue checking + if [ "$STATUS" == "" ] || [ "$STATUS" == "PENDING" ] || [ "$STATUS" == "RUNNING" ]; then + echo "Job is still running... Elapsed time: $ELAPSED_TIME seconds." + # If the job status is 'COMPLETED', print success message and exit + elif [ "$STATUS" == "COMPLETED" ]; then + echo "Job completed successfully." + exit 0 + # If the job status is anything else, print error message and exit + else + echo "Error: Job failed or terminated. See slurm-$JOB_ID.out for more information." + cat "slurm-$JOB_ID.out" + exit 1 + fi +done