From 82b82057a74a3be4d87186c87b40a72c14e60ff9 Mon Sep 17 00:00:00 2001 From: ulmononian Date: Tue, 15 Oct 2024 18:58:54 +0000 Subject: [PATCH 1/2] Update/add several files to enable UFS-WM on Ursa. --- cmake/configure_ursa.intel.cmake | 1 + tests/default_vars.sh | 22 ++++++++++++ tests/detect_machine.sh | 5 +++ tests/fv3_conf/compile_slurm.IN_ursa | 23 ++++++++++++ tests/fv3_conf/fv3_slurm.IN_ursa | 54 ++++++++++++++++++++++++++++ tests/module-setup.sh | 7 ++++ tests/rt.sh | 24 +++++++++++++ tests/rt_utils.sh | 3 ++ tests/run_test.sh | 2 +- 9 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 cmake/configure_ursa.intel.cmake create mode 100644 tests/fv3_conf/compile_slurm.IN_ursa create mode 100644 tests/fv3_conf/fv3_slurm.IN_ursa diff --git a/cmake/configure_ursa.intel.cmake b/cmake/configure_ursa.intel.cmake new file mode 100644 index 0000000000..92b8ecb75e --- /dev/null +++ b/cmake/configure_ursa.intel.cmake @@ -0,0 +1 @@ +set(PARALLEL_NETCDF ON CACHE BOOL "Enable parallel NetCDF" FORCE) diff --git a/tests/default_vars.sh b/tests/default_vars.sh index 2aefd87186..874e8957d2 100644 --- a/tests/default_vars.sh +++ b/tests/default_vars.sh @@ -203,6 +203,28 @@ elif [[ ${MACHINE_ID} = hera ]]; then export WPG_cpl_atmw_gdas=24 export WAV_tasks_atmw_gdas=248 +elif [[ ${MACHINE_ID} = ursa ]]; then + + export TPN=192 + + export INPES_dflt=3 + export JNPES_dflt=8 + export INPES_thrd=3 + export JNPES_thrd=4 + export INPES_c384=6 + export JNPES_c384=8 + export THRD_c384=2 + export INPES_c768=8 + export JNPES_c768=16 + export THRD_c768=4 + + export THRD_cpl_atmw_gdas=2 + export INPES_cpl_atmw_gdas=6 + export JNPES_cpl_atmw_gdas=8 + export WPG_cpl_atmw_gdas=24 + export WAV_tasks_atmw_gdas=248 + + elif [[ ${MACHINE_ID} = linux ]]; then export TPN=40 diff --git a/tests/detect_machine.sh b/tests/detect_machine.sh index 0bd0535d8a..0c1fea46cd 100755 --- a/tests/detect_machine.sh +++ b/tests/detect_machine.sh @@ -27,6 +27,8 @@ case $(hostname -f) in hfe0[1-9]) MACHINE_ID=hera ;; ### hera01-09 hfe1[0-2]) MACHINE_ID=hera ;; ### hera10-12 hecflow01) MACHINE_ID=hera ;; ### heraecflow01 + + nfe91) MACHINE_ID=ursa ;; ### ursa s4-submit.ssec.wisc.edu) MACHINE_ID=s4 ;; ### s4 @@ -85,6 +87,9 @@ elif [[ -d /mnt/lfs1 ]]; then elif [[ -d /scratch1 ]]; then # We are on NOAA Hera MACHINE_ID=hera +elif [[ -d /collab1 ]]; then + # We are on NOAA Ursa + MACHINE_ID=ursa elif [[ -d /work ]]; then # We are on MSU Orion or Hercules mount=$(findmnt -n -o SOURCE /home) diff --git a/tests/fv3_conf/compile_slurm.IN_ursa b/tests/fv3_conf/compile_slurm.IN_ursa new file mode 100644 index 0000000000..733f694e83 --- /dev/null +++ b/tests/fv3_conf/compile_slurm.IN_ursa @@ -0,0 +1,23 @@ +#!/bin/bash +#SBATCH -e err +#SBATCH -o out +#SBATCH --account=@[ACCNR] +#SBATCH --qos=@[QUEUE] +#SBATCH --partition=to39-compute +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=30 +#SBATCH --job-name="@[JBNME]" + +set -eux +date_s_start=$(date +%s) +date_start=$(date) +echo -n "${date_s_start}," > job_timestamp.txt +echo "Compile started: ${date_start}" + +"@[PATHRT]/compile.sh" "@[MACHINE_ID]" "@[MAKE_OPT]" "@[COMPILE_ID]" "@[RT_COMPILER]" + +date_end=$(date) +echo "Compile ended: ${date_end}" +date_s_end=$(date +%s) +echo -n "${date_s_end}," >> job_timestamp.txt diff --git a/tests/fv3_conf/fv3_slurm.IN_ursa b/tests/fv3_conf/fv3_slurm.IN_ursa new file mode 100644 index 0000000000..dd19c3868c --- /dev/null +++ b/tests/fv3_conf/fv3_slurm.IN_ursa @@ -0,0 +1,54 @@ +#!/bin/bash +#SBATCH -e err +#SBATCH -o out +#SBATCH --account=@[ACCNR] +#SBATCH --qos=@[QUEUE] +### #SBATCH --ntasks=@[TASKS] +#SBATCH --nodes=@[NODES] +#SBATCH --partition=to39-compute +#SBATCH --ntasks-per-node=@[TPN] +#SBATCH --time=@[WLCLK] +#SBATCH --job-name="@[JBNME]" +### #SBATCH --exclusive + +set -eux +date_s_start=$(date +%s) +echo -n "${date_s_start}," > job_timestamp.txt + +set +x +export MACHINE_ID=ursa +source ./module-setup.sh +module use "${PWD}/modulefiles" +module load modules.fv3 +module list +set -x + +date_start=$(date) +echo "Model started: ${date_start}" + +export MPI_TYPE_DEPTH=20 +export OMP_STACKSIZE=512M +# shellcheck disable=SC2125 +export OMP_NUM_THREADS=@[THRD] +export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4 +export ESMF_RUNTIME_PROFILE=ON +export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY" +export PSM_RANKS_PER_CONTEXT=4 +export PSM_SHAREDCONTEXTS=1 + +# Avoid job errors because of filesystem synchronization delays +sync && sleep 1 + +# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run. +if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + +srun --label -n @[TASKS] ./fv3.exe + +date_end=$(date) +echo "Model ended: ${date_end}" +date_s_end=$(date +%s) +echo -n "${date_s_end}," >> job_timestamp.txt diff --git a/tests/module-setup.sh b/tests/module-setup.sh index cd606178f6..05578c350a 100755 --- a/tests/module-setup.sh +++ b/tests/module-setup.sh @@ -15,6 +15,13 @@ elif [[ ${MACHINE_ID} = hera ]] ; then fi module purge +elif [[ ${MACHINE_ID} = ursa ]] ; then + # We are on NOAA Ursa + if ( ! eval module help > /dev/null 2>&1 ) ; then + source /apps/lmod/lmod/init/bash + fi + module purge + elif [[ ${MACHINE_ID} = orion ]] ; then # We are on Orion if ( ! eval module help > /dev/null 2>&1 ) ; then diff --git a/tests/rt.sh b/tests/rt.sh index 8eb72b1571..518c8e51e8 100755 --- a/tests/rt.sh +++ b/tests/rt.sh @@ -762,6 +762,30 @@ case ${MACHINE_ID} in PTMP="${dprefix}/stmp2" SCHEDULER=slurm + ;; + ursa) + echo "rt.sh: Setting up ursa..." + if [[ "${ROCOTO:-false}" == true ]] ; then + module load rocoto + ROCOTO_SCHEDULER=slurm + fi + + # ecflow not yet available on ursa + #if [[ "${ECFLOW:-false}" == true ]] ; then + # module load ecflow/5.11.4 + #fi + + QUEUE="batch" + COMPILE_QUEUE="batch" + + PARTITION= + dprefix="/collab1/data/$USER" + DISKNM="/collab1/data/Cameron.Book/UFS-WM_RT" + STMP="${STMP:-${dprefix}/RT_BASELINE}" + PTMP="${PTMP:-${dprefix}/RT_RUNDIRS}" + + SCHEDULER=slurm + ;; orion) echo "rt.sh: Setting up orion..." diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index ecf4259421..f166a60dfe 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -303,6 +303,9 @@ rocoto_create_compile_task() { if [[ ${MACHINE_ID} == hera ]]; then BUILD_WALLTIME="01:00:00" fi + if [[ ${MACHINE_ID} == ursa ]]; then + BUILD_WALLTIME="01:00:00" + fi if [[ ${MACHINE_ID} == orion ]]; then BUILD_WALLTIME="01:00:00" fi diff --git a/tests/run_test.sh b/tests/run_test.sh index 64f7f007d2..3725aa8fa5 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -446,7 +446,7 @@ if [[ ${skip_check_results} == false ]]; then else if [[ ${i##*.} == nc* ]] ; then - if [[ " orion hercules hera wcoss2 acorn derecho gaea jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then + if [[ " orion hercules hera ursa wcoss2 acorn derecho gaea jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then printf "USING NCCMP.." >> "${RT_LOG}" printf "USING NCCMP.." if [[ ${CMP_DATAONLY} == false ]]; then From b32bbfa680fce5350f4bd98a5fd7cc8b59803499 Mon Sep 17 00:00:00 2001 From: Cameron Book <43379611+ulmononian@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:18:37 -0800 Subject: [PATCH 2/2] Update rt.sh --- tests/rt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rt.sh b/tests/rt.sh index 3276e14719..ff08474b0a 100755 --- a/tests/rt.sh +++ b/tests/rt.sh @@ -778,7 +778,7 @@ case ${MACHINE_ID} in COMPILE_QUEUE="batch" PARTITION= - dprefix="/collab1/data/$USER" + dprefix="/collab1/data/${USER}" DISKNM="/collab1/data/Cameron.Book/UFS-WM_RT" STMP="${STMP:-${dprefix}/RT_BASELINE}" PTMP="${PTMP:-${dprefix}/RT_RUNDIRS}"