Skip to content

Commit

Permalink
replace rrfs_v1beta_failing with tests of two different error conditi…
Browse files Browse the repository at this point in the history
…ons so ecflow can test it:

1. rrfs_v1beta_fail_to_copy = fails in run_test.sh before job_card
2. rrfs_v1beta_fail_to_run = fails in the job_card
  • Loading branch information
SamuelTrahanNOAA committed Jul 11, 2024
1 parent 69619cf commit f8b4b3a
Show file tree
Hide file tree
Showing 17 changed files with 193 additions and 12 deletions.
26 changes: 26 additions & 0 deletions tests/error-test.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@


# FIXME: THIS FILE SHOULD NOT BE MERGED TO DEVELOP


# This should succeed
COMPILE | rrfs | intel | -DAPP=ATM -DCCPP_SUITES=FV3_RAP,FV3_RAP_sfcdiff,FV3_HRRR,FV3_RRFS_v1beta,FV3_RRFS_v1nssl -D32BIT=ON | | fv3 |

# This should succeed
RUN | rrfs_v1nssl_nohailnoccn | | baseline |

# These variants of rrfs_v1beta should always fail, and prevent the workflow from completing.
RUN | rrfs_v1beta_fail_to_copy | | baseline |
RUN | rrfs_v1beta_fail_to_run | | baseline |

# Removing -DFASTER=ON here ensures results change, but the test still runs. The workflow jobs should
# complete for this compile and the test below, but the results should change.
COMPILE | atm_faster_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v17_p8,FV3_GFS_v15_thompson_mynn_lam3km -D32BIT=ON | | fv3 |
RUN | regional_control_faster | | baseline |

# The --invalid-argument ensures the compile job will fail. The workflow should not submit the test jobs for this compile job.
COMPILE | hafsw | intel | -DAPP=HAFSW --invalid-argument -DMOVING_NEST=ON -DCCPP_SUITES=FV3_HAFS_v1_gfdlmp_tedmf,FV3_HAFS_v1_gfdlmp_tedmf_nonsst,FV3_HAFS_v1_thompson_tedmf_gfdlsf,FV3_global_nest_v1 -D32BIT=ON | | fv3 |
RUN | hafs_regional_atm | | baseline |
RUN | hafs_regional_atm_thompson_gfdlsf | | baseline |
RUN | hafs_regional_atm_ocn | | baseline |
RUN | hafs_regional_atm_wav | | baseline |
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_qsub.IN_acorn
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4
export ESMF_RUNTIME_PROFILE=ON
export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = "WHEN_RUNNING" ]; then
  >&2 echo "The job should abort now, with exit status 1."
  >&2 echo "If error checking is working, the metascheduler should mark the job as failed."
  false
fi

mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_qsub.IN_derecho
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ export MPICH_COLL_OPT_OFF=1
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = "WHEN_RUNNING" ]; then
  >&2 echo "The job should abort now, with exit status 1."
  >&2 echo "If error checking is working, the metascheduler should mark the job as failed."
  false
fi

mpiexec -n @[UFS_TASKS] -ppn @[PPN] --hostfile $PBS_NODEFILE ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_qsub.IN_wcoss2
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4
export ESMF_RUNTIME_PROFILE=ON
export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# The comparison uses the POSIX "=" operator; "==" is not defined for the
# single-bracket test in POSIX sh and can make the check error out instead
# of matching.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ]; then
  echo "The job should abort now, with exit status 1." 1>&2
  echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
  false
fi

mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe

echo "Model ended: " `date`
Expand Down
8 changes: 8 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_expanse
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ echo "Model started: "`date`
export OMP_STACK_SIZE=512M
export OMP_NUM_THREADS=@[THRD]
export I_MPI_PMI_LIBRARY=/cm/shared/apps/slurm/current/lib64/libpmi.so

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# The comparison uses the POSIX "=" operator; "==" is not defined for the
# single-bracket test in POSIX sh and can make the check error out instead
# of matching.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ]; then
  echo "The job should abort now, with exit status 1." 1>&2
  echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
  false
fi

srun -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_gaea
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# The comparison uses the POSIX "=" operator; "==" is not defined for the
# single-bracket test in POSIX sh and can make the check error out instead
# of matching.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ]; then
  echo "The job should abort now, with exit status 1." 1>&2
  echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
  false
fi

srun --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_hera
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ export PSM_SHAREDCONTEXTS=1
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# The comparison uses the POSIX "=" operator; "==" is not defined for the
# single-bracket test in POSIX sh and can make the check error out instead
# of matching.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ]; then
  echo "The job should abort now, with exit status 1." 1>&2
  echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
  false
fi

# shellcheck disable=SC2102
srun --label -n @[TASKS] ./fv3.exe

Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_hercules
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ fi
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# The comparison uses the POSIX "=" operator; "==" is not defined for the
# single-bracket test in POSIX sh and can make the check error out instead
# of matching.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ]; then
  echo "The job should abort now, with exit status 1." 1>&2
  echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
  false
fi

srun --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_jet
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# The comparison uses the POSIX "=" operator; "==" is not defined for the
# single-bracket test in POSIX sh and can make the check error out instead
# of matching.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ]; then
  echo "The job should abort now, with exit status 1." 1>&2
  echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
  false
fi

srun --label -n @[TASKS] --cpus-per-task=@[THRD] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_noaacloud
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ export OMP_NUM_THREADS=1
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# The comparison uses the POSIX "=" operator; "==" is not defined for the
# single-bracket test in POSIX sh and can make the check error out instead
# of matching.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ]; then
  echo "The job should abort now, with exit status 1." 1>&2
  echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
  false
fi

srun --mpi=pmi2 --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_orion
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# The comparison uses the POSIX "=" operator; "==" is not defined for the
# single-bracket test in POSIX sh and can make the check error out instead
# of matching.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ]; then
  echo "The job should abort now, with exit status 1." 1>&2
  echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
  false
fi

srun --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_s4
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ export PSM_SHAREDCONTEXTS=1
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = "WHEN_RUNNING" ]; then
  >&2 echo "The job should abort now, with exit status 1."
  >&2 echo "If error checking is working, the metascheduler should mark the job as failed."
  false
fi

srun --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_stampede
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ export LD_BIND_NOW=1
# Avoid job errors because of filesystem synchronization delays
#sync && sleep 1

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_RUNNING, announce the intentional failure on stderr
# and run a failing command before the model is launched below.
# NOTE(review): the failing command only aborts the job if errexit (set -e)
# is active in this job card -- confirm.
if [ "${JOB_SHOULD_FAIL:-NO}" = "WHEN_RUNNING" ]; then
  >&2 echo "The job should abort now, with exit status 1."
  >&2 echo "If error checking is working, the metascheduler should mark the job as failed."
  false
fi

#mpirun -prepend-rank -np $SBATCH_NP ./fv3.exe
ibrun -n @[TASKS] ./fv3.exe

Expand Down
5 changes: 3 additions & 2 deletions tests/rt.conf
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,9 @@ RUN | rrfs_v1beta |
RUN | rrfs_v1nssl | | baseline |
RUN | rrfs_v1nssl_nohailnoccn | | baseline |

# This variant of rrfs_v1beta should always fail.
RUN | rrfs_v1beta_failing | | baseline |
# These variants of rrfs_v1beta should always fail.
RUN | rrfs_v1beta_fail_to_copy | | baseline |
RUN | rrfs_v1beta_fail_to_run | | baseline |

COMPILE | csawmg | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16_csawmg,FV3_GFS_v16_ras | - noaacloud | fv3 |
RUN | control_csawmg | - noaacloud | baseline |
Expand Down
15 changes: 7 additions & 8 deletions tests/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -365,19 +365,18 @@ elif [[ ${SCHEDULER} = 'slurm' ]]; then
fi
fi

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
# Deliberate-failure hook for testing workflow error handling: when
# JOB_SHOULD_FAIL is WHEN_COPYING, announce the intentional failure on stderr
# and run a failing command before the "Submit test job" section below.
# NOTE(review): the failing command only aborts this script if errexit
# (set -e) or an ERR trap is active -- confirm.
if [[ "${JOB_SHOULD_FAIL:-NO}" == "WHEN_COPYING" ]]; then
  >&2 echo "The job should abort now, with exit status 1."
  >&2 echo "If error checking is working, the metascheduler should mark the job as failed."
  false
fi

################################################################################
# Submit test job
################################################################################
export OMP_ENV=${OMP_ENV:-""}
if [[ ${SCHEDULER} = 'none' ]]; then

# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
if [[ "${JOB_SHOULD_FAIL:-NO}" == YES ]] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

ulimit -s unlimited
if [[ ${CI_TEST} = 'true' ]]; then
( eval "${OMP_ENV}" ;
Expand Down
70 changes: 70 additions & 0 deletions tests/tests/rrfs_v1beta_fail_to_copy
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@


# FIXME: THIS FILE SHOULD NOT BE MERGED TO DEVELOP


###############################################################################
#
# RRFS v1beta variant that always fails in run_test.sh, before the job card runs
#
###############################################################################

# This variant sets JOB_SHOULD_FAIL=WHEN_COPYING, which run_test.sh checks
# before its "Submit test job" section, so the failure happens before the
# job card runs rather than at model runtime.
export TEST_DESCR="Variant of RRFS_v1beta that always fails in run_test.sh before the job card runs"

# NOTE(review): presumably the baseline directory name associated with this
# test -- confirm against run_test.sh.
export CNTL_DIR=rrfs_v1beta_fail_to_copy

# NOTE(review): presumably the output files checked for this test; since this
# variant is meant to abort before the job is submitted, they may never be
# produced -- confirm.
export LIST_FILES="sfcf000.nc \
sfcf009.nc \
sfcf012.nc \
atmf000.nc \
atmf009.nc \
atmf012.nc \
GFSFLX.GrbF00 \
GFSFLX.GrbF09 \
GFSFLX.GrbF12 \
GFSPRS.GrbF00 \
GFSPRS.GrbF09 \
GFSPRS.GrbF12 \
RESTART/20210323.060000.coupler.res \
RESTART/20210323.060000.fv_core.res.nc \
RESTART/20210323.060000.fv_core.res.tile1.nc \
RESTART/20210323.060000.fv_core.res.tile2.nc \
RESTART/20210323.060000.fv_core.res.tile3.nc \
RESTART/20210323.060000.fv_core.res.tile4.nc \
RESTART/20210323.060000.fv_core.res.tile5.nc \
RESTART/20210323.060000.fv_core.res.tile6.nc \
RESTART/20210323.060000.fv_srf_wnd.res.tile1.nc \
RESTART/20210323.060000.fv_srf_wnd.res.tile2.nc \
RESTART/20210323.060000.fv_srf_wnd.res.tile3.nc \
RESTART/20210323.060000.fv_srf_wnd.res.tile4.nc \
RESTART/20210323.060000.fv_srf_wnd.res.tile5.nc \
RESTART/20210323.060000.fv_srf_wnd.res.tile6.nc \
RESTART/20210323.060000.fv_tracer.res.tile1.nc \
RESTART/20210323.060000.fv_tracer.res.tile2.nc \
RESTART/20210323.060000.fv_tracer.res.tile3.nc \
RESTART/20210323.060000.fv_tracer.res.tile4.nc \
RESTART/20210323.060000.fv_tracer.res.tile5.nc \
RESTART/20210323.060000.fv_tracer.res.tile6.nc \
RESTART/20210323.060000.phy_data.tile1.nc \
RESTART/20210323.060000.phy_data.tile2.nc \
RESTART/20210323.060000.phy_data.tile3.nc \
RESTART/20210323.060000.phy_data.tile4.nc \
RESTART/20210323.060000.phy_data.tile5.nc \
RESTART/20210323.060000.phy_data.tile6.nc \
RESTART/20210323.060000.sfc_data.tile1.nc \
RESTART/20210323.060000.sfc_data.tile2.nc \
RESTART/20210323.060000.sfc_data.tile3.nc \
RESTART/20210323.060000.sfc_data.tile4.nc \
RESTART/20210323.060000.sfc_data.tile5.nc \
RESTART/20210323.060000.sfc_data.tile6.nc"

# export_rrfs_v1 is defined outside this file.
# NOTE(review): presumably it loads the shared RRFS v1 defaults, which the
# two exports below then override -- confirm.
export_rrfs_v1
export RESTART_INTERVAL="6 -1"
export OUTPUT_FH='0 09 12'

# A special flag that tells run_test.sh to fail before the test job is submitted.
export JOB_SHOULD_FAIL=WHEN_COPYING

# On hera, orion, hercules and jet, set ZSTANDARD_LEVEL to 5. The right-hand
# side of =~ is quoted, so it is matched as a literal substring of the
# space-padded machine list rather than as a regular expression.
if [[ " hera orion hercules jet " =~ " ${MACHINE_ID} " ]] ; then
ZSTANDARD_LEVEL=5
fi
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

export TEST_DESCR="Variant of RRFS_v1beta that always fails at runtime"

export CNTL_DIR=rrfs_v1beta_failing
export CNTL_DIR=rrfs_v1beta_fail_to_run

export LIST_FILES="sfcf000.nc \
sfcf009.nc \
Expand Down Expand Up @@ -63,7 +63,7 @@ export RESTART_INTERVAL="6 -1"
export OUTPUT_FH='0 09 12'

# A special flag that tells the job to fail at runtime.
export JOB_SHOULD_FAIL=YES
export JOB_SHOULD_FAIL=WHEN_RUNNING

if [[ " hera orion hercules jet " =~ " ${MACHINE_ID} " ]] ; then
ZSTANDARD_LEVEL=5
Expand Down

0 comments on commit f8b4b3a

Please sign in to comment.