Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
add error up climacommons

test error

test

wait

split cmds

split cmds

split to diff steps

gp fix

no {

rm init

one ln cmd

clean
  • Loading branch information
LenkaNovak committed Mar 28, 2024
1 parent 4e528c2 commit 57affed
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 30 deletions.
39 changes: 37 additions & 2 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -344,8 +344,17 @@ steps:
slurm_ntasks: 2
slurm_mem: 16GB

- label: "batch script"
command: "sbatch test/mpi_tests/local_checks.sh"
# - label: "batch script"
# command:
# - |
# sed -n '10,37' test/mpi_tests/local_checks.sh | bash &&
# sed -n '38,49' test/mpi_tests/local_checks.sh | bash &&
# sed -n '50,$$p' test/mpi_tests/local_checks.sh | bash
# env:
# CLIMACORE_DISTRIBUTED: "MPI"
# agents:
# slurm_ntasks: 2
# slurm_mem: 16GB

# short high-res performance test
- label: "Unthreaded AMIP FINE" # also reported by longruns with a flame graph
Expand Down Expand Up @@ -431,10 +440,36 @@ steps:
slurm_mem: 20GB
slurm_gpus: 1

- group: "Bash scripts"

steps:
- label: "batch script - part 1: first srun"
command:
- "sed -n '8,34p' test/mpi_tests/local_checks.sh | bash"
env:
CLIMACORE_DISTRIBUTED: "MPI"
agents:
slurm_ntasks: 2
slurm_mem: 16GB

- wait

- label: "batch script - part 2: second srun"
command:
- "sed -n '8,33p; 36,46p' test/mpi_tests/local_checks.sh | bash"
env:
CLIMACORE_DISTRIBUTED: "MPI"
agents:
slurm_ntasks: 2
slurm_mem: 16GB

- wait

- label: "batch script - part 3: check for output"
command: "sed -n '8,24p; 47,$$p' test/mpi_tests/local_checks.sh | bash"
agents:
slurm_mem: 16GB

- wait

# plot job performance history
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ experiments/AMIP/output/*
*.so

# internal tests
testdel.jl
slurm-*.out

# ignore vscode artifacts
*.vscode
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
anim: false
apply_limiter: false
dt: "400secs"
dt_cpl: 400
dt_save_restart: "10days"
dt_save_to_sol: "1days"
energy_check: false
h_elem: 6
hourly_checkpoint: true
hourly_checkpoint_dt: 1
job_id: "coarse_single_ft64_hourly_checkpoints_restart"
mode_name: "amip"
moist: "equil"
mono_surface: false
precip_model: "0M"
rad: "gray"
run_name: "coarse_single_ft64_hourly_checkpoints_restart"
t_end: "800secs"
vert_diff: "true"
4 changes: 4 additions & 0 deletions experiments/AMIP/cli_options.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ function argparse_settings()
help = "Boolean flag indicating whether to checkpoint at intervals of 1 hour or multiple hours"
arg_type = Bool
default = false
"--hourly_checkpoint_dt"
help = "Time interval for hourly checkpointing in hours (20 days by default)"
arg_type = Int
default = 480
"--coupler_output_dir"
help = "Directory to save output files. Note that TempestRemap fails if interactive and paths are too long."
arg_type = String
Expand Down
10 changes: 8 additions & 2 deletions experiments/AMIP/coupler_driver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ saveat = Float64(time_to_seconds(config_dict["dt_save_to_sol"]))
date0 = date = DateTime(config_dict["start_date"], dateformat"yyyymmdd")
mono_surface = config_dict["mono_surface"]
hourly_checkpoint = config_dict["hourly_checkpoint"]
hourly_checkpoint_dt = config_dict["hourly_checkpoint_dt"]
restart_dir = config_dict["restart_dir"]
restart_t = Int(config_dict["restart_t"])
evolving_ocean = config_dict["evolving_ocean"]
Expand Down Expand Up @@ -526,8 +527,13 @@ The currently implemented callbacks are:
being approximated from wind speed). It is updated at the same frequency as the atmospheric radiation.
NB: Eventually, we will call all of radiation from the coupler, in addition to the albedo calculation.
=#
checkpoint_cb =
HourlyCallback(dt = FT(480), func = checkpoint_sims, ref_date = [dates.date[1]], active = hourly_checkpoint) # 20 days

# Checkpointing callback. The interval `dt` is taken from the `hourly_checkpoint_dt` config entry
# (in hours; the CLI default is 480, i.e. 20 days) and the callback is enabled via `hourly_checkpoint`.
checkpoint_cb = HourlyCallback(
dt = hourly_checkpoint_dt,
func = checkpoint_sims,
ref_date = [dates.date[1]],
active = hourly_checkpoint,
) # dt in hours (no longer hard-coded to 20 days)
update_firstdayofmonth!_cb =
MonthlyCallback(dt = FT(1), func = update_firstdayofmonth!, ref_date = [dates.date1[1]], active = true)
dt_water_albedo = parse(FT, filter(x -> !occursin(x, "hours"), dt_rad))
Expand Down
65 changes: 40 additions & 25 deletions test/mpi_tests/local_checks.sh
Original file line number Diff line number Diff line change
@@ -1,29 +1,26 @@
#!/bin/bash
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --job-name=mpi_restart_test
#SBATCH --reservation=clima
#SBATCH --mem=32GB
#SBATCH --ntasks=2
#SBATCH --job-name=mpi_amip
#SBATCH --time=24:00:00
#SBATCH --mem-per-cpu=16G
#SBATCH --partition=expansion

# TODO: this needs to be updated (+ implement better tests that are caught on Buildkite) #667

export MODULE_PATH=/groups/esm/modules:$MODULE_PATH
module purge
module load julia/1.10.1
export JULIA_MPI_BINARY=system
export JULIA_NUM_THREADS=${SLURM_CPUS_PER_TASK:=1}
export CLIMACORE_DISTRIBUTED="MPI"
export JULIA_HDF5_PATH=""
module load climacommon/2024_03_18

export RUN_NAME=amip_restart_mpi_test
export CC_PATH=$(pwd)/
export RUN_NAME=coarse_single_ft64_hourly_checkpoints_restart
export CONFIG_FILE=${CC_PATH}config/model_configs/${RUN_NAME}.yml
export RESTART_DIR=experiments/AMIP/output/amip/${RUN_NAME}_artifacts/
export RESTART_T=200

julia -e 'using Pkg; Pkg.add("MPIPreferences"); using MPIPreferences; use_system_binary()'
julia --project -e 'using Pkg; Pkg.instantiate()'
julia --project -e 'using Pkg; Pkg.build("MPI")'
julia --project -e 'using Pkg; Pkg.build("HDF5")'
julia --project -e 'using Pkg; Pkg.API.precompile()'
export OPENBLAS_NUM_THREADS=1
export JULIA_NVTX_CALLBACKS=gc
export OMPI_MCA_opal_warn_on_missing_libcuda=0
export JULIA_MAX_NUM_PRECOMPILE_FILES=100
export JULIA_CPU_TARGET='broadwell;skylake;icelake;cascadelake;epyc'
export CLIMACORE_DISTRIBUTED="MPI"
export SLURM_KILL_BAD_EXIT=1

julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'
julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.precompile()'
Expand All @@ -34,10 +31,28 @@ julia --project=artifacts -e 'using Pkg; Pkg.precompile()'
julia --project=artifacts -e 'using Pkg; Pkg.status()'
julia --project=artifacts artifacts/download_artifacts.jl

# run spin up
# - specify `--hourly_checkpoint true` to save checkpoints of all model prognostic states at hourly (or multi-hour) intervals
mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --start_date 19790101 --hourly_checkpoint true --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64
srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_FILE

# restart from simulation time of 400 seconds
export RESTART_T=400

# set up a temporary copy of the config file with an amended end time (t_end: 800secs -> 3600secs)
export RESTART_CONFIG_FILE=${CONFIG_FILE::-4}_tmp.yml
cp $CONFIG_FILE $RESTART_CONFIG_FILE
sed -i 's/t_end: \"800secs\"/t_end: \"3600secs\"/g' $RESTART_CONFIG_FILE

# rerun the model
srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $RESTART_CONFIG_FILE --restart_dir $RESTART_DIR --restart_t $RESTART_T

# throw an error if fewer than 5 files are found in the restart checkpoint directory
if [ $(ls -1 $RESTART_DIR/checkpoint | wc -l) -lt 5 ]; then
echo "RESTART_DIR does not contain enough files"
exit 1
else
echo "RESTART_DIR contains $(ls -1 $RESTART_DIR/checkpoint | wc -l) files"
fi

# init using a restart
# - specify the directory of the `checkpoint/` folder (i.e., `--restart_dir`) and time (in secs; `--restart_t`) of the restart file
mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --restart_dir $RESTART_DIR --restart_t $RESTART_T --start_date 19790102 --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64
# Troubleshooting?
# - ensure you're using the latest module file of climacommon
# - ensure you're using the latest version of ClimaCoupler.jl
# - did you cd to your version of ClimaCoupler.jl?

0 comments on commit 57affed

Please sign in to comment.