Skip to content

Commit

Permalink
add job array
Browse files Browse the repository at this point in the history
  • Loading branch information
Nathanael Efrat-Henrici committed Oct 27, 2023
1 parent d999e75 commit a26f63a
Show file tree
Hide file tree
Showing 9 changed files with 165 additions and 658 deletions.
672 changes: 27 additions & 645 deletions examples/Manifest.toml

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions examples/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
CLIMAParameters = "6eacf6c3-8458-43b9-ae03-caf5306d3d53"
CalibrateAtmos = "4347a170-ebd6-470c-89d3-5c705c0cacc2"
ClimaAtmos = "b2c96348-7fb7-4fe0-8da9-78d88439e717"
ClimaComms = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
ClimaCore = "d414da3d-4745-48bb-8d80-42e94e092884"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
EnsembleKalmanProcesses = "aa8a2aa5-91d8-4396-bcef-d4f2ec43552d"
Expand All @@ -12,3 +13,22 @@ YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"

[compat]
ClimaAtmos = "0.16"

[extras]
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"

[preferences.CUDA_Runtime_jll]
version = "local"

[preferences.HDF5_jll]
libhdf5_hl_path = "libhdf5_hl"
libhdf5_path = "libhdf5"

[preferences.MPIPreferences]
_format = "1.0"
abi = "OpenMPI"
binary = "system"
libmpi = "libmpi"
mpiexec = "mpiexec"
51 changes: 51 additions & 0 deletions examples/driver-job-array.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
#SBATCH --time=24:00:00
#SBATCH --job-name=cal-job-array
#SBATCH --ntasks=40
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=8G
#SBATCH --output=calibration-job_array-%j.out
#SBATCH --mail-user=nat.henrici@gmail.com
#SBATCH --mail-type=BEGIN,END,FAIL

# Calibration driver that schedules every iteration through Slurm.
#
# Usage: sbatch examples/driver-job-array.sbatch <job_id>
#
# For each iteration this script submits
#   1. a job array running examples/model_run.sbatch, one array task per
#      ensemble member, and
#   2. a single examples/update.sbatch job that waits for that array.
# Consecutive submissions are chained with `afterany` dependencies so the
# iterations execute in order. The script must be launched from the repository
# root because all paths below are relative to it.

# Load required modules
module load julia/1.9.3 cuda/12.2 ucx/1.14.1_cuda-12.2 openmpi/4.1.5_cuda-12.2 hdf5/1.12.2-ompi415
export OMPI_MCA_opal_warn_on_missing_libcuda=0

# Check if a job ID is provided as a command-line argument
job_id=${1?Error: no job ID given}
echo "Running job: $job_id"

config_file="examples/$job_id/ekp_config.yml"

# Print the value of a top-level `key: value` entry of the EKP config file.
read_config_value() {
    awk -v key="$1:" '$1 == key {print $2; exit}' "$config_file"
}

# Submit a job with `sbatch --parsable` and print only its job id.
submit_job() {
    local raw_id
    raw_id=$(sbatch --parsable "$@") || return 1
    # --parsable prints "jobid;cluster" on multi-cluster setups; keep the id
    echo "${raw_id%%;*}"
}

# Instantiate the Julia project
julia --project=examples -e 'using Pkg; Pkg.instantiate(;verbose=true)' || {
    echo "Error: failed to instantiate the examples project" >&2
    exit 1
}

echo 'Initializing ensemble for calibration.'

# Nothing downstream can run without an initialized ensemble, so stop here
# instead of queueing jobs that are guaranteed to fail.
julia --project=examples -e "
import CalibrateAtmos
using Distributions
using EnsembleKalmanProcesses.ParameterDistributions
CalibrateAtmos.initialize(\"$job_id\")" || {
    echo "Error: ensemble initialization failed for job $job_id" >&2
    exit 1
}

# Get ensemble size and the number of iterations from configuration file
ensemble_size=$(read_config_value ensemble_size)
n_iterations=$(read_config_value n_iterations)

# Both values are spliced into sbatch/seq arguments, so require plain integers
for value in "$ensemble_size" "$n_iterations"
do
    case $value in
        '' | *[!0-9]*)
            echo "Error: could not read ensemble_size/n_iterations from $config_file" >&2
            exit 1
            ;;
    esac
done

# Loop over iterations
# tasks_per_task=$(( $ensemble_size / $SLURM_NTASKS ))
tasks_per_task=4
dep=""
# NOTE(review): `seq 0 N` yields N+1 iterations; confirm whether n_iterations
# is meant as a count (then the upper bound should be N-1) or as a last index.
for i in $(seq 0 "$n_iterations")
do
    echo "Scheduling Iteration $i"
    echo "Dep for model run iter $i: $dep"

    # The first iteration has nothing to wait for; only pass --dependency
    # when a previous update job exists.
    dep_args=()
    if [ -n "$dep" ]; then
        dep_args=(--dependency="$dep")
    fi

    ensemble_array_id=$(submit_job "${dep_args[@]}" --job-name="model-$i" --ntasks="$tasks_per_task" --array="1-$ensemble_size" examples/model_run.sbatch "$job_id" "$i") || {
        echo "Error: failed to submit model runs for iteration $i" >&2
        exit 1
    }
    dep="afterany:$ensemble_array_id"
    echo "Dep for update at iter $i: $dep"
    format_i=$(printf "iteration_%03d" "$i")

    update_id=$(submit_job --dependency="$dep" --job-name="update-$i" --output="output/$job_id/$format_i/update_log.out" examples/update.sbatch "$job_id" "$i") || {
        echo "Error: failed to submit update job for iteration $i" >&2
        exit 1
    }
    dep="afterany:$update_id"
    echo "ekp update job $i: $update_id"
done
24 changes: 15 additions & 9 deletions examples/driver.sbatch
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
#!/bin/bash
#SBATCH --time=24:00:00
#SBATCH -J calibrate
#SBATCH --mem=1G
#SBATCH --output=calibration.out
module load julia/1.9.3 cuda/11.8 ucx/1.14.1_cuda-11.8 openmpi/4.1.5_cuda-11.8 hdf5/1.12.2-ompi415 nsight-systems/2023.2.1
export JULIA_LOAD_PATH=":/examples/load_path"
#SBATCH -J cal-parallel
#SBATCH --cpus-per-task=8
#SBATCH --ntasks=4
#SBATCH --mem-per-cpu=8G
# #SBATCH --gpus=1
#SBATCH --output=calibration-parallel.out
# #SBATCH --mail-user=nefrathe@caltech.edu
#SBATCH --mail-type=END,FAIL

module load julia/1.9.3 cuda/12.2 ucx/1.14.1_cuda-12.2 openmpi/4.1.5_cuda-12.2 hdf5/1.12.2-ompi415
export JULIA_LOAD_PATH=":examples/load_path"

job_id=${1?Error: no job ID given}
echo "Running job: $job_id"

julia --project=examples -e 'using Pkg; Pkg.develop(path=".");
Pkg.instantiate(;verbose=true)'
julia --project=examples -e 'using Pkg; Pkg.instantiate(;verbose=true)'

# This is a bit messy - the inline Julia code could be moved into a script file
julia --project=examples -e "import CalibrateAtmos; using Distributions;
Expand All @@ -27,9 +32,10 @@ do

for i in $(seq 1 $ensemble_size)
do
srun --cpu-bind=cores -n 1 julia --project=examples -e \
echo "Running iteration $iteration, member $i"
srun --ntasks 4 julia --project=examples -e \
"import CalibrateAtmos;
atmos_config = CalibrateAtmos.get_atmos_config($i, $iteration; \"$job_id\");
atmos_config = CalibrateAtmos.get_atmos_config($i, $iteration, \"$job_id\");
CalibrateAtmos.run_forward_model(atmos_config)"
done

Expand Down
23 changes: 23 additions & 0 deletions examples/model_run.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
#SBATCH --time=24:00:00

# Run the forward model for one ensemble member of one calibration iteration.
#
# Usage: sbatch --array=1-<ensemble_size> examples/model_run.sbatch <job_id> <iteration>
#
# The ensemble member index is read from SLURM_ARRAY_TASK_ID, so this script
# has to be submitted as part of a job array.

# Extract command-line arguments; abort early when one is missing so a
# forgotten iteration is not silently formatted as iteration_000.
job_id=${1?Error: no job ID given}
iteration=${2?Error: no iteration given}
member_id=${SLURM_ARRAY_TASK_ID?Error: SLURM_ARRAY_TASK_ID is not set, submit with --array}

# Print job information
current_time=$(date +"%Y-%m-%d %H:%M:%S")
echo "Current timestamp: $current_time"
echo "Running job: $job_id"
echo "Iteration $iteration, Member $member_id"

# Zero-padded directory names used for this member's log path
format_i=$(printf "iteration_%03d" "$iteration")
member=$(printf "member_%03d" "$member_id")

# Run the forward model; the shell expands the variables before Julia sees the code
srun --output="output/$job_id/$format_i/$member/model_log.out" julia --project=examples -e "
using ClimaComms
println(ClimaComms.init(ClimaComms.context()))
import CalibrateAtmos
atmos_config = CalibrateAtmos.get_atmos_config($member_id, $iteration, \"$job_id\")
CalibrateAtmos.run_forward_model(atmos_config)
"
12 changes: 9 additions & 3 deletions examples/sphere_held_suarez_rhoe_equilmoist/atmos_config.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
dt: 200secs
t_end: 800days
t_end: 560days
ode_algo: ARS343
dt_save_to_disk: 5hours
dt_save_to_disk: 50days
moist: equil
forcing: held_suarez
precip_model: 0M
job_id: sphere_held_suarez_rhoe_equilmoist
output_dir: output/sphere_held_suarez_rhoe_equilmoist
restart_file: examples/sphere_held_suarez_rhoe_equilmoist/day200.0.hdf5
restart_file: examples/sphere_held_suarez_rhoe_equilmoist/day200.0.hdf5
output_default_diagnostics: false
diagnostics:
- reduction_time: average
short_name: ta
period: 60days
writer: nc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ function observation_map(iteration, job_id)
# Cut off first 120 days to get equilibrium, take second level slice
level_slice = 2
ta_second_height = ta[3:size(ta)[1], :, :, level_slice]
# Average across long and latitude
# Average over longitude and latitude
area_avg_ta_second_height =
longitudinal_avg(latitudinal_avg(ta_second_height))
G_ens[:, m] = [area_avg_ta_second_height[3]]
Expand Down
16 changes: 16 additions & 0 deletions examples/update.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/sh
#SBATCH --time=01:00:00

# Run the observation map and the ensemble update for one calibration iteration.
#
# Usage: sbatch examples/update.sbatch <job_id> <iteration>
#
# Loads examples/<job_id>/observation_map.jl, calls observation_map for the
# given iteration and then CalibrateAtmos.update_ensemble.

# Abort early when an argument is missing instead of passing broken Julia code
job_id=${1?Error: no job ID given}
i=${2?Error: no iteration given}

current_time=$(date +"%Y-%m-%d %H:%M:%S")
echo "Current timestamp: $current_time"
echo "Iteration $i"

# The Julia code is a single double-quoted string so the shell variables are
# expanded without being exposed to word splitting or globbing.
julia --project=examples -e "
import CalibrateAtmos
include(\"examples/$job_id/observation_map.jl\")
observation_map($i, \"$job_id\")
CalibrateAtmos.update_ensemble(\"$job_id\", $i)
"
3 changes: 3 additions & 0 deletions src/atmos_interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ function get_atmos_config(member, iteration, config_dict::AbstractDict)
config_dict["output_dir"] = member_path
parameter_path = joinpath(member_path, "parameters.toml")
config_dict["toml"] = [parameter_path]

# Turn off default diagnostics
config_dict["output_default_diagnostics"] = false

# Set restart file for initial equilibrium state
ENV["RESTART_FILE"] = config_dict["restart_file"]
Expand Down

0 comments on commit a26f63a

Please sign in to comment.