Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

longruns test #469

Closed
wants to merge 18 commits into from
239 changes: 132 additions & 107 deletions .buildkite/longruns/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,19 @@ steps:
slurm_nodes: 1
slurm_mem_per_cpu: 16G

# DYAMOND AMIP: 1 day (convection resolving)

- label: "MPI AMIP SUPERFINE: dyamond_target"
key: "dyamond_target"
command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/dyamond_target.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/dyamond_target_artifacts/*"
env:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks_per_node: 16
slurm_nodes: 4
slurm_mem_per_cpu: 16G
# # DYAMOND AMIP: 1 day (convection resolving)

# - label: "MPI AMIP SUPERFINE: dyamond_target"
# key: "dyamond_target"
# command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/dyamond_target.yml"
# artifact_paths: "experiments/AMIP/modular/output/amip/dyamond_target_artifacts/*"
# env:
# CLIMACORE_DISTRIBUTED: "MPI"
# BUILD_HISTORY_HANDLE: ""
# agents:
# slurm_ntasks_per_node: 16
# slurm_nodes: 4
# slurm_mem_per_cpu: 16G

# mid-resolution AMIP: longrun (140 days)
- label: "MPI AMIP FINE: target longrun"
Expand All @@ -90,11 +90,10 @@ steps:
slurm_nodes: 4
slurm_mem_per_cpu: 16G

# mid-resolution AMIP: MPI performance scaling (10 days)
- label: "MPI AMIP FINE: n64"
key: "mpi_amip_fine_n64"
command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n64_shortrun.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/amip_n64_shortrun_artifacts/*"
- label: "MPI AMIP FINE: new target"
key: "new_target"
command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/new_target.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/new_target_artifacts/*"
env:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
Expand All @@ -103,100 +102,126 @@ steps:
slurm_nodes: 4
slurm_mem_per_cpu: 16G

- label: "MPI AMIP FINE: n32"
key: "mpi_amip_fine_n32"
command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n32_shortrun.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/amip_n32_shortrun_artifacts/*"
- label: "MPI AMIP FINE: august"
key: "august"
command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/august.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/august_artifacts/*"
env:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks_per_node: 8
slurm_ntasks_per_node: 16
slurm_nodes: 4
slurm_mem_per_cpu: 16G

- label: "MPI AMIP FINE: n8"
key: "mpi_amip_fine_n8"
command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n8_shortrun.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/amip_n8_shortrun_artifacts/*"
env:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks_per_node: 8
slurm_nodes: 1
slurm_mem_per_cpu: 16G

- label: "MPI AMIP FINE: n2" # 10d take 21h, so reducing to 1d
key: "mpi_amip_fine_n2"
command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n2_shortrun.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/amip_n2_shortrun_artifacts/*"
env:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks_per_node: 2
slurm_nodes: 1
slurm_mem_per_cpu: 16G

- label: "MPI AMIP FINE: n1" # also reported by longruns with a flame graph; 10d take 21h, so reducing to 1d
key: "mpi_amip_fine_n1"
command: "julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n1_shortrun.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/amip_n1_shortrun_artifacts/*"
env:
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks_per_node: 1
slurm_nodes: 1
slurm_mem_per_cpu: 16G

- label: "MPI AMIP FINE: n1 no couple" # sim time = Δt_cpl (~ benchmarking with standalone models)
key: "mpi_amip_fine_n1_nocouple"
command: "julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n1_shortrun_nocouple.yml"
artifact_paths: "experiments/AMIP/modular/output/amip/amip_n1_shortrun_nocouple_artifacts/*"
env:
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks_per_node: 1
slurm_nodes: 1
slurm_mem_per_cpu: 16G

# mpi_amip_fine_n1 flame graph report (NB: arguments passed from the ci pipeline.yml)
- label: ":rocket: performance: flame graph diff: perf_target_amip_n1_shortrun"
command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_target_amip_n1_shortrun.yml"
artifact_paths: "perf/output/perf_diff_target_amip_n1_shortrun/*"
agents:
slurm_ntasks_per_node: 1
slurm_nodes: 1
slurm_mem_per_cpu: 16G

- wait

# plot job performance history
- label: ":chart_with_downwards_trend: build history"
command:
- build_history main # name of branch to plot
artifact_paths:
- "build_history.html"

- wait

- label: ":envelope: Slack report: build_history"
command:
- slack-upload -c "#coupler-report" -f build_history.html -m html -n build_history -x "Overall job performance"

- label: ":envelope: Slack report: Slabplanet"
command:
- slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_energy_log_bucket.png -m png -n slab_coarse_log -x "Slabplanet energy conservation (log error)"
- slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_energy_bucket.png -m png -n slab_coarse -x "Slabplanet energy conservation"
- slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_water_log_bucket.png -m png -n slab_coarse_w_log -x "Slabplanet water conservation (log error)"
- slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_water_bucket.png -m png -n slab_coarse_w -x "Slabplanet water conservation"

- label: ":envelope: Slack report: target AMIP"
command:
- slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/amip/amip_longrun_target_artifacts/amip_paperplots.png -m png -n amip_fine -x "AMIP Target Longrun"

- label: ":envelope: Slack report: Flame Diff"
command:
- slack-upload -c "#coupler-report" -f perf/output/perf_diff_target_amip_n1_shortrun/flame_diff.html -m png -n amip_fine_flamegraphdiff -x "AMIP Longrun FlameGraphDiff"
- slack-upload -c "#coupler-report" -f perf/output/perf_diff_target_amip_n1_shortrun/flame_diff_self_count.html -m png -n amip_fine_flamegraphdiffself -x "AMIP Longrun FlameGraphDiffSelf"
# # mid-resolution AMIP: MPI performance scaling (10 days)
# - label: "MPI AMIP FINE: n64"
# key: "mpi_amip_fine_n64"
# command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n64_shortrun.yml"
# artifact_paths: "experiments/AMIP/modular/output/amip/amip_n64_shortrun_artifacts/*"
# env:
# CLIMACORE_DISTRIBUTED: "MPI"
# BUILD_HISTORY_HANDLE: ""
# agents:
# slurm_ntasks_per_node: 16
# slurm_nodes: 4
# slurm_mem_per_cpu: 16G

# - label: "MPI AMIP FINE: n32"
# key: "mpi_amip_fine_n32"
# command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n32_shortrun.yml"
# artifact_paths: "experiments/AMIP/modular/output/amip/amip_n32_shortrun_artifacts/*"
# env:
# CLIMACORE_DISTRIBUTED: "MPI"
# BUILD_HISTORY_HANDLE: ""
# agents:
# slurm_ntasks_per_node: 8
# slurm_nodes: 4
# slurm_mem_per_cpu: 16G

# - label: "MPI AMIP FINE: n8"
# key: "mpi_amip_fine_n8"
# command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n8_shortrun.yml"
# artifact_paths: "experiments/AMIP/modular/output/amip/amip_n8_shortrun_artifacts/*"
# env:
# CLIMACORE_DISTRIBUTED: "MPI"
# BUILD_HISTORY_HANDLE: ""
# agents:
# slurm_ntasks_per_node: 8
# slurm_nodes: 1
# slurm_mem_per_cpu: 16G

# - label: "MPI AMIP FINE: n2" # 10d take 21h, so reducing to 1d
# key: "mpi_amip_fine_n2"
# command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n2_shortrun.yml"
# artifact_paths: "experiments/AMIP/modular/output/amip/amip_n2_shortrun_artifacts/*"
# env:
# CLIMACORE_DISTRIBUTED: "MPI"
# BUILD_HISTORY_HANDLE: ""
# agents:
# slurm_ntasks_per_node: 2
# slurm_nodes: 1
# slurm_mem_per_cpu: 16G

# - label: "MPI AMIP FINE: n1" # also reported by longruns with a flame graph; 10d take 21h, so reducing to 1d
# key: "mpi_amip_fine_n1"
# command: "julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n1_shortrun.yml"
# artifact_paths: "experiments/AMIP/modular/output/amip/amip_n1_shortrun_artifacts/*"
# env:
# BUILD_HISTORY_HANDLE: ""
# agents:
# slurm_ntasks_per_node: 1
# slurm_nodes: 1
# slurm_mem_per_cpu: 16G

# - label: "MPI AMIP FINE: n1 no couple" # sim time = Δt_cpl (~ benchmarking with standalone models)
# key: "mpi_amip_fine_n1_nocouple"
# command: "julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n1_shortrun_nocouple.yml"
# artifact_paths: "experiments/AMIP/modular/output/amip/amip_n1_shortrun_nocouple_artifacts/*"
# env:
# BUILD_HISTORY_HANDLE: ""
# agents:
# slurm_ntasks_per_node: 1
# slurm_nodes: 1
# slurm_mem_per_cpu: 16G

# # mpi_amip_fine_n1 flame graph report (NB: arguments passed from the ci pipeline.yml)
# - label: ":rocket: performance: flame graph diff: perf_target_amip_n1_shortrun"
# command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_target_amip_n1_shortrun.yml"
# artifact_paths: "perf/output/perf_diff_target_amip_n1_shortrun/*"
# agents:
# slurm_ntasks_per_node: 1
# slurm_nodes: 1
# slurm_mem_per_cpu: 16G

# - wait

# # plot job performance history
# - label: ":chart_with_downwards_trend: build history"
# command:
# - build_history main # name of branch to plot
# artifact_paths:
# - "build_history.html"

# - wait

# - label: ":envelope: Slack report: build_history"
# command:
# - slack-upload -c "#coupler-report" -f build_history.html -m html -n build_history -x "Overall job performance"

# - label: ":envelope: Slack report: Slabplanet"
# command:
# - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_energy_log_bucket.png -m png -n slab_coarse_log -x "Slabplanet energy conservation (log error)"
# - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_energy_bucket.png -m png -n slab_coarse -x "Slabplanet energy conservation"
# - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_water_log_bucket.png -m png -n slab_coarse_w_log -x "Slabplanet water conservation (log error)"
# - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_water_bucket.png -m png -n slab_coarse_w -x "Slabplanet water conservation"

# - label: ":envelope: Slack report: target AMIP"
# command:
# - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/amip/amip_longrun_target_artifacts/amip_paperplots.png -m png -n amip_fine -x "AMIP Target Longrun"

# - label: ":envelope: Slack report: Flame Diff"
# command:
# - slack-upload -c "#coupler-report" -f perf/output/perf_diff_target_amip_n1_shortrun/flame_diff.html -m png -n amip_fine_flamegraphdiff -x "AMIP Longrun FlameGraphDiff"
# - slack-upload -c "#coupler-report" -f perf/output/perf_diff_target_amip_n1_shortrun/flame_diff_self_count.html -m png -n amip_fine_flamegraphdiffself -x "AMIP Longrun FlameGraphDiffSelf"
7 changes: 5 additions & 2 deletions config/longrun_configs/amip_longrun_target.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
run_name: "amip_longrun_target"
anim: true
dt_cpl: 150
dt_cpl: 100
energy_check: false
mode_name: "amip"
mono_surface: false
Expand All @@ -18,5 +18,8 @@ dt: "100secs"
t_end: "100days" # TODO this has been decreased from 140 days to avoid instability #460
job_id: "amip_longrun_target"
dt_save_to_sol: "5days"
dt_save_to_disk: "1days"
dt_save_to_disk: "5days"
hourly_checkpoint: true
apply_limiter: false
surface_setup: "PrescribedSurface"
start_date: "19790301"
22 changes: 22 additions & 0 deletions config/longrun_configs/amip_longrun_target_old.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Coupler run configuration: config/longrun_configs/amip_longrun_target_old.yml
# NOTE(review): run_name and job_id below are the same as in
# amip_longrun_target.yml; presumably both runs would write to the same
# output location if this file were ever launched. Confirm before using.

# Run identification
run_name: "amip_longrun_target"
job_id: "amip_longrun_target"
mode_name: "amip"

# Time stepping and run length
dt: "100secs"
dt_cpl: 150
# TODO: t_end was decreased from 140 days to avoid instability (#460)
t_end: "100days"

# Grid resolution
h_elem: 12
z_elem: 35
dz_bottom: 50

# Model options
moist: "equil"
rad: "clearsky"
precip_model: "0M"
vert_diff: "true"
kappa_4: 3e16
rayleigh_sponge: true
alpha_rayleigh_uh: 0
apply_limiter: false
mono_surface: false
energy_check: false

# Output
dt_save_to_sol: "5days"
dt_save_to_disk: "1days"
anim: true
31 changes: 31 additions & 0 deletions config/longrun_configs/august.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Coupler run configuration for the "MPI AMIP FINE: august" step of
# .buildkite/longruns/pipeline.yml, which passes this file to
# coupler_driver_modular.jl via --config_file.
#
# Each key appears exactly once: duplicate mapping keys are invalid YAML and
# are only tolerated by parsers that silently keep the last value.
run_name: "august"
coupled: true
start_date: "19790301"
# NOTE(review): monthly_checkpoint and hourly_checkpoint (further down) are
# both enabled; confirm that requesting both cadences is intended.
monthly_checkpoint: true
surface_setup: "PrescribedSurface"
dt_cpl: 100
energy_check: false
mode_name: "amip"
mono_surface: false
vert_diff: "true"
moist: "equil"
rad: "clearsky"
precip_model: "0M"
z_elem: 35
dz_bottom: 50
h_elem: 12
kappa_4: 4e16
rayleigh_sponge: true
alpha_rayleigh_uh: 0
dt: "100secs"
t_end: "400days"
# The pipeline step collects artifacts from ".../output/amip/august_artifacts/*";
# presumably that directory name is derived from job_id (confirm in the driver).
job_id: "august"
dt_save_to_sol: "10days"
dt_save_to_disk: "1days"
apply_limiter: false
FLOAT_TYPE: "Float64"
post_process: false
anim: true
# cli_options.jl: checkpoint at intervals of 1 hour or a multiple thereof.
hourly_checkpoint: true
18 changes: 18 additions & 0 deletions config/longrun_configs/new_target.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Coupler run configuration for the "MPI AMIP FINE: new target" step of
# .buildkite/longruns/pipeline.yml, which passes this file to
# coupler_driver_modular.jl via --config_file.

# Run identification
run_name: "new_target"
job_id: "new_target"
mode_name: "amip"

# Calendar, time stepping and run length
start_date: "19790301"
dt: "150secs"
dt_cpl: 150
# NOTE(review): this line carried a TODO copied from amip_longrun_target.yml
# ("decreased from 140 days to avoid instability", #460), which does not match
# a 200-day run. Confirm the intended run length.
t_end: "200days"

# Atmosphere model configuration files
atmos_config_file: "config/longrun_configs/longrun_aquaplanet_rhoe_equilmoist_nz63_0M_55km_rs35km_clearsky_tvinsolation.yml"
atmos_toml_file: "toml/longrun_aquaplanet_rhoe_equilmoist_nz63_0M_55km_rs35km_clearsky.toml"

# Surface and flux options
surface_setup: "PrescribedSurface"
turb_flux_partition: "CombinedStateFluxes"
mono_surface: false

# Other switches
apply_limiter: false
energy_check: false

# Output and checkpointing
dt_save_to_sol: "5days"
dt_save_to_disk: "5days"
# cli_options.jl: checkpoint at intervals of 1 hour or a multiple thereof.
hourly_checkpoint: true
anim: true
2 changes: 1 addition & 1 deletion config/model_configs/interactive_debug.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ dt_save_restart: "5days"
precip_model: "0M"
run_name: "interactive_debug_run"
job_id: "interactive_debug_run"
monthly_checkpoint: true
hourly_checkpoint: true
4 changes: 4 additions & 0 deletions experiments/AMIP/modular/cli_options.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ function argparse_settings()
help = "Boolean flag indicating whether to checkpoint monthly"
arg_type = Bool
default = false
"--hourly_checkpoint" # TODO generalize to any frequency
help = "Boolean flag indicating whether to checkpoint intervals of 1 hour or multiple thereof"
arg_type = Bool
default = false
"--restart_dir"
help = "Directory containing restart files"
arg_type = String
Expand Down
Loading
Loading