diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index bfbc6b4fb0..27bfe62bd4 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -344,8 +344,17 @@ steps: slurm_ntasks: 2 slurm_mem: 16GB - - label: "batch script" - command: "sbatch test/mpi_tests/local_checks.sh" + # - label: "batch script" + # command: + # - | + # sed -n '10,37' test/mpi_tests/local_checks.sh | bash && + # sed -n '38,49' test/mpi_tests/local_checks.sh | bash && + # sed -n '50,$$p' test/mpi_tests/local_checks.sh | bash + # env: + # CLIMACORE_DISTRIBUTED: "MPI" + # agents: + # slurm_ntasks: 2 + # slurm_mem: 16GB # short high-res performance test - label: "Unthreaded AMIP FINE" # also reported by longruns with a flame graph @@ -431,10 +440,36 @@ steps: slurm_mem: 20GB slurm_gpus: 1 + - group: "Bash scripts" + steps: + - label: "batch script - part 1: first srun" + command: + - "sed -n '8,34p' test/mpi_tests/local_checks.sh | bash" + env: + CLIMACORE_DISTRIBUTED: "MPI" + agents: + slurm_ntasks: 2 + slurm_mem: 16GB - wait + - label: "batch script - part 2: second srun" + command: + - "sed -n '8,33p; 36,46p' test/mpi_tests/local_checks.sh | bash" + env: + CLIMACORE_DISTRIBUTED: "MPI" + agents: + slurm_ntasks: 2 + slurm_mem: 16GB + + - wait + + - label: "batch script - part 3: check for output" + command: "sed -n '8,24p; 47,$$p' test/mpi_tests/local_checks.sh | bash" + agents: + slurm_mem: 16GB + - wait # plot job performance history diff --git a/.gitignore b/.gitignore index 2b10544978..c225ec897f 100644 --- a/.gitignore +++ b/.gitignore @@ -50,7 +50,7 @@ experiments/AMIP/output/* *.so # internal tests -testdel.jl +slurm-*.out # ignore vscode artifacts *.vscode diff --git a/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml b/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml new file mode 100644 index 0000000000..a99b64cea8 --- /dev/null +++ b/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml @@ -0,0 +1,19 @@ 
+anim: false +apply_limiter: false +dt: "400secs" +dt_cpl: 400 +dt_save_restart: "10days" +dt_save_to_sol: "1days" +energy_check: false +h_elem: 6 +hourly_checkpoint: true +hourly_checkpoint_dt: 1 +job_id: "coarse_single_ft64_hourly_checkpoints_restart" +mode_name: "amip" +moist: "equil" +mono_surface: false +precip_model: "0M" +rad: "gray" +run_name: "coarse_single_ft64_hourly_checkpoints_restart" +t_end: "800secs" +vert_diff: "true" diff --git a/experiments/AMIP/cli_options.jl b/experiments/AMIP/cli_options.jl index 6e69b5f1ef..9bab51e062 100644 --- a/experiments/AMIP/cli_options.jl +++ b/experiments/AMIP/cli_options.jl @@ -38,6 +38,10 @@ function argparse_settings() help = "Boolean flag indicating whether to checkpoint at intervals of 1 hour or multiple hours" arg_type = Bool default = false + "--hourly_checkpoint_dt" + help = "Time interval for hourly checkpointing in hours (20 days by default)" + arg_type = Int + default = 480 "--coupler_output_dir" help = "Directory to save output files. Note that TempestRemap fails if interactive and paths are too long." arg_type = String diff --git a/experiments/AMIP/coupler_driver.jl b/experiments/AMIP/coupler_driver.jl index fa47c2636c..fc0b75bf48 100644 --- a/experiments/AMIP/coupler_driver.jl +++ b/experiments/AMIP/coupler_driver.jl @@ -131,6 +131,7 @@ saveat = Float64(time_to_seconds(config_dict["dt_save_to_sol"])) date0 = date = DateTime(config_dict["start_date"], dateformat"yyyymmdd") mono_surface = config_dict["mono_surface"] hourly_checkpoint = config_dict["hourly_checkpoint"] +hourly_checkpoint_dt = config_dict["hourly_checkpoint_dt"] restart_dir = config_dict["restart_dir"] restart_t = Int(config_dict["restart_t"]) evolving_ocean = config_dict["evolving_ocean"] @@ -526,8 +527,13 @@ The currently implemented callbacks are: being approximated from wind speed). It is updated at the same frequency as the atmospheric radiation. 
NB: Eventually, we will call all of radiation from the coupler, in addition to the albedo calculation. =# -checkpoint_cb = - HourlyCallback(dt = FT(480), func = checkpoint_sims, ref_date = [dates.date[1]], active = hourly_checkpoint) # 20 days + +checkpoint_cb = HourlyCallback( + dt = hourly_checkpoint_dt, + func = checkpoint_sims, + ref_date = [dates.date[1]], + active = hourly_checkpoint, +) # interval in hours; the CLI default is 480 (20 days) update_firstdayofmonth!_cb = MonthlyCallback(dt = FT(1), func = update_firstdayofmonth!, ref_date = [dates.date1[1]], active = true) dt_water_albedo = parse(FT, filter(x -> !occursin(x, "hours"), dt_rad)) diff --git a/test/mpi_tests/local_checks.sh b/test/mpi_tests/local_checks.sh index 08213273fa..84e3656d36 100644 --- a/test/mpi_tests/local_checks.sh +++ b/test/mpi_tests/local_checks.sh @@ -1,29 +1,26 @@ #!/bin/bash -#SBATCH --time=24:00:00 -#SBATCH --nodes=1 -#SBATCH --job-name=mpi_restart_test -#SBATCH --reservation=clima -#SBATCH --mem=32GB #SBATCH --ntasks=2 +#SBATCH --job-name=mpi_amip +#SBATCH --time=24:00:00 +#SBATCH --mem-per-cpu=16G +#SBATCH --partition=expansion -# TODO: this needs to be updated (+ implement better tests that are caught on Buildkite) #667 - +export MODULE_PATH=/groups/esm/modules:$MODULE_PATH module purge -module load julia/1.10.1 -export JULIA_MPI_BINARY=system -export JULIA_NUM_THREADS=${SLURM_CPUS_PER_TASK:=1} -export CLIMACORE_DISTRIBUTED="MPI" -export JULIA_HDF5_PATH="" +module load climacommon/2024_03_18 -export RUN_NAME=amip_restart_mpi_test +export CC_PATH=$(pwd)/ +export RUN_NAME=coarse_single_ft64_hourly_checkpoints_restart +export CONFIG_FILE=${CC_PATH}config/model_configs/${RUN_NAME}.yml export RESTART_DIR=experiments/AMIP/output/amip/${RUN_NAME}_artifacts/ -export RESTART_T=200 -julia -e 'using Pkg; Pkg.add("MPIPreferences"); using MPIPreferences; use_system_binary()' -julia --project -e 'using Pkg; Pkg.instantiate()' -julia --project -e 'using Pkg; Pkg.build("MPI")' -julia --project -e 'using Pkg; Pkg.build("HDF5")' -julia 
--project -e 'using Pkg; Pkg.API.precompile()' +export OPENBLAS_NUM_THREADS=1 +export JULIA_NVTX_CALLBACKS=gc +export OMPI_MCA_opal_warn_on_missing_libcuda=0 +export JULIA_MAX_NUM_PRECOMPILE_FILES=100 +export JULIA_CPU_TARGET='broadwell;skylake;icelake;cascadelake;epyc' +export CLIMACORE_DISTRIBUTED="MPI" +export SLURM_KILL_BAD_EXIT=1 julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.instantiate(;verbose=true)' julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.precompile()' @@ -34,10 +31,28 @@ julia --project=artifacts -e 'using Pkg; Pkg.precompile()' julia --project=artifacts -e 'using Pkg; Pkg.status()' julia --project=artifacts artifacts/download_artifacts.jl -# run spin up -# - specify `--hourly_checkpoint true` to save monthly checkpoints of all model prognostic states -mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --start_date 19790101 --hourly_checkpoint true --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64 +srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_FILE + +# restart from simulation time of 400 seconds +export RESTART_T=400 + +# set up a temporary copy of the config file with an amended end time (t_end: 800secs -> 3600secs) +export RESTART_CONFIG_FILE=${CONFIG_FILE::-4}_tmp.yml +cp $CONFIG_FILE $RESTART_CONFIG_FILE +sed -i 's/t_end: \"800secs\"/t_end: \"3600secs\"/g' $RESTART_CONFIG_FILE + +# rerun the model +srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $RESTART_CONFIG_FILE --restart_dir $RESTART_DIR --restart_t $RESTART_T + +# throw an error if no 
restart checkpoint files are found +if [ $(ls -1 $RESTART_DIR/checkpoint | wc -l) -lt 5 ]; then + echo "RESTART_DIR does not contain enough files" + exit 1 +else + echo "RESTART_DIR contains $(ls -1 $RESTART_DIR/checkpoint | wc -l) files" +fi -# init using a restart -# - specify the directory of the `checkpoint/` folder (i.e., `--restart_dir`) and time (in secs; `--restart_t`) of the restart file -mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --restart_dir $RESTART_DIR --restart_t $RESTART_T --start_date 19790102 --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64 +# Troubleshooting? +# - ensure you're using the latest module file of climacommon +# - ensure you're using the latest version of ClimaCoupler.jl +# - did you cd to your version of ClimaCoupler.jl? \ No newline at end of file