diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5068b961f7..b0b51922c5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -211,3 +211,4 @@ ush/python/pygfs/utils/marine_da_utils.py @guillaumevernieres @AndrewEichmann-NO # Specific workflow scripts workflow/generate_workflows.sh @DavidHuber-NOAA +workflow/build_compute.py @DavidHuber-NOAA @aerorahul diff --git a/.gitignore b/.gitignore index 49fb3f438a..f3cb1e1b3e 100644 --- a/.gitignore +++ b/.gitignore @@ -85,6 +85,9 @@ parm/wafs # Ignore sorc and logs folders from externals #-------------------------------------------- +sorc/build.xml +sorc/build.db +sorc/build_lock.db sorc/*log sorc/logs sorc/calc_analysis.fd diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile index 9e2381268d..b7a29e15b0 100644 --- a/ci/Jenkinsfile +++ b/ci/Jenkinsfile @@ -120,9 +120,7 @@ pipeline { def error_logs_message = "" dir("${HOMEgfs}/sorc") { try { - sh(script: './build_all.sh -kgu') // build the global-workflow executables for GFS variant (UFS-wx-model, WW3 pre/post executables) - sh(script: './build_ww3prepost.sh -w > ./logs/build_ww3prepost_gefs.log 2>&1') // build the WW3 pre/post processing executables for GEFS variant - sh(script: './build_ufs.sh -w -e gefs_model.x > ./logs/build_ufs_gefs.log 2>&1') // build the UFS-wx-model executable for GEFS variant + sh(script: './build_compute.sh all') // build the global-workflow executables } catch (Exception error_build) { echo "Failed to build global-workflow: ${error_build.getMessage()}" if ( fileExists("logs/error.logs") ) { diff --git a/docs/source/clone.rst b/docs/source/clone.rst index d3f81f2e47..ec0018157a 100644 --- a/docs/source/clone.rst +++ b/docs/source/clone.rst @@ -18,35 +18,39 @@ Clone the `global-workflow` and `cd` into the `sorc` directory: git clone --recursive https://github.com/NOAA-EMC/global-workflow cd global-workflow/sorc -For forecast-only (coupled or uncoupled) build of the components: +.. 
_build_examples: + +The build_all.sh script can be used to build all required components of the global workflow. The accepted argument is a list of systems to be built. This includes builds for GFS and GEFS forecast-only experiments, GSI and GDASApp-based DA for cycled GFS experiments. See `feature availability `__ to see which system(s) are available on each supported system. :: - ./build_all.sh + ./build_all.sh [gfs] [gefs] [gsi] [gdas] [all] -For cycled (w/ data assimilation) use the `-g` option during build: +For example, to run GFS experiments with GSI DA, execute: :: - ./build_all.sh -g + ./build_all.sh gfs gsi -For coupled cycling (include new UFSDA) use the `-gu` options during build: +This builds the GFS, UFS-utils, GFS-utils, WW3 with PDLIB (structured wave grids), UPP, GSI, GSI-monitor, and GSI-utils executables. -[Currently only available on Hera, Orion, and Hercules] +For coupled cycling (include new UFSDA) execute: :: - ./build_all.sh -gu + ./build_all.sh gfs gdas +This builds all of the same executables, except it builds the GDASApp instead of the GSI. -For building without PDLIB (unstructured grid) for the wave model, use the `-w` options during build: +To run GEFS (forecast-only) execute: :: - ./build_all.sh -w + ./build_all.sh gefs +This builds the GEFS, UFS-utils, GFS-utils, WW3 *without* PDLIB (unstructured wave grids), and UPP executables. -Build workflow components and link workflow artifacts such as executables, etc. 
+Once the building is complete, link workflow artifacts such as executables, configuration files, and scripts via :: @@ -107,40 +111,19 @@ Under the ``/sorc`` folder is a script to build all components called ``build_al :: - ./build_all.sh [-a UFS_app][-g][-h][-u][-v] + ./build_all.sh [-a UFS_app][-k][-h][-v] [list of system(s) to build] -a UFS_app: Build a specific UFS app instead of the default - -g: - Build GSI + -k: + Kill all builds immediately if one fails -h: Print this help message and exit - -j: - Specify maximum number of build jobs (n) - -u: - Build UFS-DA -v: Execute all build scripts with -v option to turn on verbose where supported -For forecast-only (coupled or uncoupled) build of the components: - -:: - - ./build_all.sh - -For cycled (w/ data assimilation) use the `-g` option during build: - -:: - - ./build_all.sh -g - -For coupled cycling (include new UFSDA) use the `-gu` options during build: - -[Currently only available on Hera, Orion, and Hercules] - -:: - - ./build_all.sh -gu + Lastly, pass to build_all.sh a list of systems to build. This includes `gfs`, `gefs`, `sfs` (not fully supported), `gsi`, `gdas`, and `all`. +For examples of how to use this script, see :ref:`build examples <build_examples>`. ^^^^^^^^^^^^^^^ Link components @@ -156,4 +139,3 @@ After running the checkout and build scripts run the link script: Where: ``-o``: Run in operations (NCO) mode. This creates copies instead of using symlinks and is generally only used by NCO during installation into production. 
- diff --git a/env/AWSPW.env b/env/AWSPW.env index e366128a1d..f365695f85 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -33,7 +33,29 @@ else exit 2 fi -if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then +if [[ "${step}" = "prep" ]] || [[ "${step}" = "prepbufr" ]]; then + + export POE="NO" + export BACK="NO" + export sys_tp="AWSPW" + export launcher_PREP="srun" + +elif [[ "${step}" = "prepsnowobs" ]]; then + + export APRUN_CALCFIMS="${APRUN_default}" + +elif [[ "${step}" = "prep_emissions" ]]; then + + export APRUN="${APRUN_default}" + +elif [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step}" = "wavepostsbs" ]] || [[ "${step}" = "wavepostbndpnt" ]] || [[ "${step}" = "wavepostbndpntbll" ]] || [[ "${step}" = "wavepostpnt" ]]; then + + export CFP_MP="YES" + if [[ "${step}" = "waveprep" ]]; then export MP_PULSE=0 ; fi + export wavempexec=${launcher} + export wave_mpmd=${mpmd_opt} + +elif [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then export launcher="srun --mpi=pmi2 -l" @@ -52,52 +74,16 @@ elif [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step} elif [[ "${step}" = "post" ]]; then - export NTHREADS_NP=${NTHREADS1} - export APRUN_NP="${APRUN_default}" - - export NTHREADS_DWN=${threads_per_task_dwn:-1} - [[ ${NTHREADS_DWN} -gt ${max_threads_per_task} ]] && export NTHREADS_DWN=${max_threads_per_task} - export APRUN_DWN="${launcher} -n ${ntasks_dwn}" - -elif [[ "${step}" = "atmos_products" ]]; then - - export USE_CFP="YES" # Use MPMD for downstream product generation on Hera + export NTHREADS_UPP=${NTHREADS1} + export APRUN_UPP="${APRUN_default} --cpus-per-task=${NTHREADS_UPP}" elif [[ "${step}" = "oceanice_products" ]]; then export NTHREADS_OCNICEPOST=${NTHREADS1} export APRUN_OCNICEPOST="${launcher} -n 1 --cpus-per-task=${NTHREADS_OCNICEPOST}" -elif [[ "${step}" = "ecen" ]]; then - - export NTHREADS_ECEN=${NTHREADSmax} - export APRUN_ECEN="${APRUN_default}" - - export 
NTHREADS_CHGRES=${threads_per_task_chgres:-12} - [[ ${NTHREADS_CHGRES} -gt ${max_tasks_per_node} ]] && export NTHREADS_CHGRES=${max_tasks_per_node} - export APRUN_CHGRES="time" - - export NTHREADS_CALCINC=${threads_per_task_calcinc:-1} - [[ ${NTHREADS_CALCINC} -gt ${max_threads_per_task} ]] && export NTHREADS_CALCINC=${max_threads_per_task} - export APRUN_CALCINC="${APRUN_default}" - -elif [[ "${step}" = "esfc" ]]; then - - export NTHREADS_ESFC=${NTHREADSmax} - export APRUN_ESFC="${APRUN_default}" - - export NTHREADS_CYCLE=${threads_per_task_cycle:-14} - [[ ${NTHREADS_CYCLE} -gt ${max_tasks_per_node} ]] && export NTHREADS_CYCLE=${max_tasks_per_node} - export APRUN_CYCLE="${APRUN_default}" - -elif [[ "${step}" = "epos" ]]; then - - export NTHREADS_EPOS=${NTHREADSmax} - export APRUN_EPOS="${APRUN_default}" - -elif [[ "${step}" = "fit2obs" ]]; then +elif [[ "${step}" = "atmos_products" ]]; then - export NTHREADS_FIT2OBS=${NTHREADS1} - export MPIRUN="${APRUN_default}" + export USE_CFP="YES" # Use MPMD for downstream product generation on AWS fi diff --git a/env/AZUREPW.env b/env/AZUREPW.env index c2faeb2bf6..b2b4063ff3 100755 --- a/env/AZUREPW.env +++ b/env/AZUREPW.env @@ -15,6 +15,7 @@ export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out" # Configure MPI environment export OMP_STACKSIZE=2048000 export NTHSTACK=1024000000 +export UCX_TLS=ud,sm,self ulimit -s unlimited ulimit -a @@ -50,6 +51,10 @@ elif [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step} export wavempexec=${launcher} export wave_mpmd=${mpmd_opt} +elif [[ "${step}" = "prep_emissions" ]]; then + + export APRUN="${APRUN_default}" + elif [[ "${step}" = "post" ]]; then export NTHREADS_NP=${NTHREADS1} @@ -71,7 +76,7 @@ elif [[ "${step}" = "oceanice_products" ]]; then elif [[ "${step}" = "ecen" ]]; then export NTHREADS_ECEN=${NTHREADSmax} - export APRUN_ECEN="${APRUN}" + export APRUN_ECEN="${APRUN_default}" export NTHREADS_CHGRES=${threads_per_task_chgres:-12} [[ ${NTHREADS_CHGRES} -gt 
${max_tasks_per_node} ]] && export NTHREADS_CHGRES=${max_tasks_per_node} @@ -79,25 +84,25 @@ elif [[ "${step}" = "ecen" ]]; then export NTHREADS_CALCINC=${threads_per_task_calcinc:-1} [[ ${NTHREADS_CALCINC} -gt ${max_threads_per_task} ]] && export NTHREADS_CALCINC=${max_threads_per_task} - export APRUN_CALCINC="${APRUN}" + export APRUN_CALCINC="${APRUN_default}" elif [[ "${step}" = "esfc" ]]; then export NTHREADS_ESFC=${NTHREADSmax} - export APRUN_ESFC="${APRUN}" + export APRUN_ESFC="${APRUN_default}" export NTHREADS_CYCLE=${threads_per_task_cycle:-14} [[ ${NTHREADS_CYCLE} -gt ${max_tasks_per_node} ]] && export NTHREADS_CYCLE=${max_tasks_per_node} - export APRUN_CYCLE="${APRUN}" + export APRUN_CYCLE="${APRUN_default}" elif [[ "${step}" = "epos" ]]; then export NTHREADS_EPOS=${NTHREADSmax} - export APRUN_EPOS="${APRUN}" + export APRUN_EPOS="${APRUN_default}" elif [[ "${step}" = "fit2obs" ]]; then export NTHREADS_FIT2OBS=${NTHREADS1} - export MPIRUN="${APRUN}" + export MPIRUN="${APRUN_default}" fi diff --git a/env/GOOGLEPW.env b/env/GOOGLEPW.env index c3b5ec806a..d84008d648 100755 --- a/env/GOOGLEPW.env +++ b/env/GOOGLEPW.env @@ -45,7 +45,7 @@ if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then elif [[ "${step}" = "prep_emissions" ]]; then - export APRUN + export APRUN="${APRUN_default}" elif [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step}" = "wavepostsbs" ]] || [[ "${step}" = "wavepostbndpnt" ]] || [[ "${step}" = "wavepostbndpntbll" ]] || [[ "${step}" = "wavepostpnt" ]]; then @@ -102,6 +102,6 @@ elif [[ "${step}" = "epos" ]]; then elif [[ "${step}" = "fit2obs" ]]; then export NTHREADS_FIT2OBS=${NTHREADS1} - export MPIRUN="${APRUN}" + export MPIRUN="${APRUN_default}" fi diff --git a/modulefiles/module_base.noaacloud.lua b/modulefiles/module_base.noaacloud.lua index 7997b618e4..3a7cc75d7a 100644 --- a/modulefiles/module_base.noaacloud.lua +++ b/modulefiles/module_base.noaacloud.lua @@ -5,8 +5,11 @@ Load environment to run GFS on 
noaacloud local spack_mod_path=(os.getenv("spack_mod_path") or "None") prepend_path("MODULEPATH", spack_mod_path) +load("gnu") load(pathJoin("stack-intel", (os.getenv("stack_intel_ver") or "None"))) load(pathJoin("stack-intel-oneapi-mpi", (os.getenv("stack_impi_ver") or "None"))) +unload("gnu") + load(pathJoin("python", (os.getenv("python_ver") or "None"))) load(pathJoin("jasper", (os.getenv("jasper_ver") or "None"))) diff --git a/modulefiles/module_gwci.noaacloud.lua b/modulefiles/module_gwci.noaacloud.lua index c3142cd60d..2ac284ef85 100644 --- a/modulefiles/module_gwci.noaacloud.lua +++ b/modulefiles/module_gwci.noaacloud.lua @@ -2,10 +2,10 @@ help([[ Load environment to run GFS workflow setup scripts on noaacloud ]]) -prepend_path("MODULEPATH", "/contrib/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core") +prepend_path("MODULEPATH", "/contrib/spack-stack-rocky8/spack-stack-1.6.0/envs/ue-env/install/modulefiles/Core") -load(pathJoin("stack-intel", os.getenv("2021.3.0"))) -load(pathJoin("stack-intel-oneapi-mpi", os.getenv("2021.3.0"))) +load(pathJoin("stack-intel", os.getenv("2021.10.0"))) +load(pathJoin("stack-intel-oneapi-mpi", os.getenv("2021.10.0"))) load(pathJoin("netcdf-c", os.getenv("4.9.2"))) load(pathJoin("netcdf-fortran", os.getenv("4.6.1"))) diff --git a/modulefiles/module_gwsetup.noaacloud.lua b/modulefiles/module_gwsetup.noaacloud.lua index f3845e8d72..e2aa4050a3 100644 --- a/modulefiles/module_gwsetup.noaacloud.lua +++ b/modulefiles/module_gwsetup.noaacloud.lua @@ -4,17 +4,18 @@ Load environment to run GFS workflow setup scripts on noaacloud load(pathJoin("rocoto")) -prepend_path("MODULEPATH", "/contrib/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core") +prepend_path("MODULEPATH", "/contrib/spack-stack-rocky8/spack-stack-1.6.0/envs/ue-intel/install/modulefiles/Core") -local stack_intel_ver=os.getenv("stack_intel_ver") or "2021.3.0" -local python_ver=os.getenv("python_ver") or "3.10.3" +load("gnu") +local 
stack_intel_ver=os.getenv("stack_intel_ver") or "2021.10.0" +local stack_mpi_ver=os.getenv("stack_mpi_ver") or "2021.10.0" load(pathJoin("stack-intel", stack_intel_ver)) -load(pathJoin("python", python_ver)) +load(pathJoin("stack-intel-oneapi-mpi", stack_mpi_ver)) +unload("gnu") + load("py-jinja2") load("py-pyyaml") load("py-numpy") -local git_ver=os.getenv("git_ver") or "1.8.3.1" -load(pathJoin("git", git_ver)) whatis("Description: GFS run setup environment") diff --git a/parm/archive/enkf.yaml.j2 b/parm/archive/enkf.yaml.j2 index 9f9ad296f8..12167198cb 100644 --- a/parm/archive/enkf.yaml.j2 +++ b/parm/archive/enkf.yaml.j2 @@ -3,6 +3,7 @@ enkf: target: "{{ ATARDIR }}/{{ cycle_YMDH }}/{{ RUN }}.tar" required: # Logs + {% if RUN == 'enkfgdas' %} {% for mem in range(1, nmem_ens + 1) %} - "logs/{{ cycle_YMDH }}/{{ RUN }}_fcst_mem{{ '%03d' % mem }}.log" {% endfor %} @@ -10,6 +11,7 @@ enkf: - "logs/{{ cycle_YMDH }}/{{ RUN }}_epos{{ '%03d' % (fhr - fhmin) }}.log" {% endfor %} - "logs/{{ cycle_YMDH }}/{{ RUN }}_echgres.log" + {% endif %} - "logs/{{ cycle_YMDH }}/{{ RUN }}_esfc.log" {% for grp in range(IAUFHRS | length) %} - "logs/{{ cycle_YMDH }}/{{ RUN }}_ecen{{ '%03d' % grp }}.log" @@ -37,6 +39,7 @@ enkf: {% endfor %} # Ensemble mean and spread + {% if RUN == 'enkfgdas' %} {% for fhr in range(3, fhmax + 1, 3) %} - "{{ COMIN_ATMOS_HISTORY_ENSSTAT | relpath(ROTDIR) }}/{{ head }}atmf{{ '%03d' % fhr }}.ensmean.nc" - "{{ COMIN_ATMOS_HISTORY_ENSSTAT | relpath(ROTDIR) }}/{{ head }}sfcf{{ '%03d' % fhr }}.ensmean.nc" @@ -44,6 +47,7 @@ enkf: - "{{ COMIN_ATMOS_HISTORY_ENSSTAT | relpath(ROTDIR) }}/{{ head }}atmf{{ '%03d' % fhr }}.ensspread.nc" {% endif %} {% endfor %} + {% endif %} # Ensemble mean state {% if not DO_JEDIATMENS %} diff --git a/parm/archive/enkf_grp.yaml.j2 b/parm/archive/enkf_grp.yaml.j2 index 933ca45caf..3b58bbb27d 100644 --- a/parm/archive/enkf_grp.yaml.j2 +++ b/parm/archive/enkf_grp.yaml.j2 @@ -10,12 +10,14 @@ enkf_grp: {% set COMIN_ATMOS_RESTART_MEM = 
COMIN_ATMOS_RESTART_MEM_list[imem] %} # Forecast data + {% if RUN == 'enkfgdas' %} {% for fhr in range(3, 10, 3) %} - "{{ COMIN_ATMOS_HISTORY_MEM | relpath(ROTDIR) }}/{{ head }}atmf{{ "%03d" % fhr }}.nc" {% endfor %} # Only store the 6-hour surface forecast - "{{ COMIN_ATMOS_HISTORY_MEM | relpath(ROTDIR) }}/{{ head }}sfcf006.nc" + {% endif %} # Store the individual member analysis data {% if not lobsdiag_forenkf %} diff --git a/parm/archive/enkf_restartb_grp.yaml.j2 b/parm/archive/enkf_restartb_grp.yaml.j2 index c7aaf6682e..50595a6bbf 100644 --- a/parm/archive/enkf_restartb_grp.yaml.j2 +++ b/parm/archive/enkf_restartb_grp.yaml.j2 @@ -22,6 +22,7 @@ enkf_restartb_grp: {% endfor %} # Now get the restart files. + {% if RUN == 'enkfgdas' %} {% for r_time in range(restart_interval, fhmax + 1, restart_interval) %} {% set r_timedelta = (r_time | string + "H") | to_timedelta %} {% set r_dt = current_cycle | add_to_datetime(r_timedelta) %} @@ -38,3 +39,4 @@ enkf_restartb_grp: - "{{ COMIN_ATMOS_RESTART_MEM | relpath(ROTDIR) }}/{{ r_prefix }}.fv_core.res.nc" {% endfor %} {% endfor %} + {% endif %} diff --git a/parm/config/gefs/config.resources b/parm/config/gefs/config.resources index e1b9a036de..68f81c1039 100644 --- a/parm/config/gefs/config.resources +++ b/parm/config/gefs/config.resources @@ -41,15 +41,15 @@ case ${machine} in ;; "AWSPW") export PARTITION_BATCH="compute" - max_tasks_per_node=36 + max_tasks_per_node=48 ;; "AZUREPW") export PARTITION_BATCH="compute" - max_tasks_per_node=24 + max_tasks_per_node=36 ;; "GOOGLEPW") export PARTITION_BATCH="compute" - max_tasks_per_node=32 + max_tasks_per_node=30 ;; *) echo "FATAL ERROR: Unknown machine encountered by ${BASH_SOURCE[0]}" diff --git a/parm/config/gefs/config.resources.AWSPW b/parm/config/gefs/config.resources.AWSPW index a735c7622d..f91460b6aa 100644 --- a/parm/config/gefs/config.resources.AWSPW +++ b/parm/config/gefs/config.resources.AWSPW @@ -9,3 +9,61 @@ unset memory for mem_var in $(env | grep '^memory_' | cut 
-d= -f1); do unset "${mem_var}" done + +step=$1 + +case ${step} in + "fcst" | "efcs") + export PARTITION_BATCH="compute" + max_tasks_per_node=48 + ;; + + "arch") + export PARTITION_BATCH="process" + max_tasks_per_node=24 + ;; + + "prep_emissions") + export PARTITION_BATCH="process" + max_tasks_per_node=24 + export ntasks=1 + export threads_per_task=1 + export tasks_per_node=$(( max_tasks_per_node / threads_per_task )) + ;; + + "waveinit") + export PARTITION_BATCH="process" + max_tasks_per_node=24 + export ntasks=12 + export threads_per_task=1 + export tasks_per_node=$(( max_tasks_per_node / threads_per_task )) + export NTASKS=${ntasks} + ;; + + "wavepostpnt") + export PARTITION_BATCH="compute" + max_tasks_per_node=48 + export ntasks=240 + export threads_per_task=1 + export tasks_per_node=$(( max_tasks_per_node / threads_per_task )) + export NTASKS=${ntasks} + ;; + + "wavepostsbs" | "wavepostbndpnt" | "wavepostbndpntbll") + export PARTITION_BATCH="process" + max_tasks_per_node=24 + export ntasks=24 + export threads_per_task=1 + export tasks_per_node=$(( max_tasks_per_node / threads_per_task )) + export NTASKS=${ntasks} + ;; + + *) + export PARTITION_BATCH="process" + max_tasks_per_node=24 + ;; + +esac + +export max_tasks_per_node + diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index 18ef4014b8..39610a7b1a 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -112,16 +112,16 @@ case ${machine} in ;; "AWSPW") export PARTITION_BATCH="compute" - npe_node_max=36 - max_tasks_per_node=36 + npe_node_max=48 + max_tasks_per_node=48 # TODO Supply a max mem/node value for AWS # shellcheck disable=SC2034 mem_node_max="" ;; "AZUREPW") export PARTITION_BATCH="compute" - npe_node_max=24 - max_tasks_per_node=24 + npe_node_max=36 + max_tasks_per_node=36 # TODO Supply a max mem/node value for AZURE # shellcheck disable=SC2034 mem_node_max="" diff --git a/parm/config/gfs/config.resources.AWSPW 
b/parm/config/gfs/config.resources.AWSPW index a735c7622d..22fe110670 100644 --- a/parm/config/gfs/config.resources.AWSPW +++ b/parm/config/gfs/config.resources.AWSPW @@ -9,3 +9,27 @@ unset memory for mem_var in $(env | grep '^memory_' | cut -d= -f1); do unset "${mem_var}" done + +step=$1 + +case ${step} in + "fcst" | "efcs") + export PARTITION_BATCH="compute" + max_tasks_per_node=48 + ;; + + "arch") + export PARTITION_BATCH="process" + max_tasks_per_node=24 + ;; + + + *) + export PARTITION_BATCH="process" + max_tasks_per_node=24 + ;; + +esac + +export max_tasks_per_node + diff --git a/scripts/exgdas_enkf_earc.py b/scripts/exgdas_enkf_earc.py index 535dd2ea37..107d541a41 100755 --- a/scripts/exgdas_enkf_earc.py +++ b/scripts/exgdas_enkf_earc.py @@ -28,7 +28,7 @@ def main(): 'DOHYBVAR', 'DOIAU_ENKF', 'IAU_OFFSET', 'DOIAU', 'DO_CA', 'DO_CALC_INCREMENT', 'assim_freq', 'ARCH_CYC', 'DO_JEDISNOWDA', 'ARCH_WARMICFREQ', 'ARCH_FCSTICFREQ', - 'IAUFHRS_ENKF', 'NET'] + 'IAUFHRS_ENKF', 'NET', 'NMEM_ENS_GFS'] archive_dict = AttrDict() for key in keys: diff --git a/sorc/build_all.sh b/sorc/build_all.sh index 0f0e634d49..f4618b948c 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -13,32 +13,23 @@ set +x #------------------------------------ function _usage() { cat << EOF -Builds all of the global-workflow components by calling the individual build - scripts in sequence. +Builds all of the global-workflow components by calling the individual build scripts in parallel. -Usage: ${BASH_SOURCE[0]} [-a UFS_app][-c build_config][-d][-f][-h][-j n][-v][-w][-y] +Usage: ${BASH_SOURCE[0]} [-a UFS_app][-c build_config][-d][-f][-h][-v] [gfs] [gefs] [sfs] [gsi] [gdas] [all] -a UFS_app: - Build a specific UFS app instead of the default + Build a specific UFS app instead of the default. This will be applied to all UFS (GFS, GEFS, SFS) builds. 
-d: Build in debug mode -f: - Build the UFS model using the -DFASTER=ON option - -g: - Build GSI + Build the UFS model(s) using the -DFASTER=ON option. -h: Print this help message and exit - -j: - Specify maximum number of build jobs (n) -k: Kill all builds if any build fails - -u: - Build UFS-DA -v: Execute all build scripts with -v option to turn on verbose where supported - -w: - Use structured wave grid - -y: - Use hydrostatic version of FV3 + + Specified systems (gfs, gefs, sfs, gsi, gdas) are non-exclusive, so they can be built together. EOF exit 1 } @@ -48,30 +39,21 @@ readonly HOMEgfs=$(cd "$(dirname "$(readlink -f -n "${BASH_SOURCE[0]}" )" )/.." cd "${HOMEgfs}/sorc" || exit 1 _build_ufs_opt="" -_build_ufsda="NO" -_build_gsi="NO" _build_debug="" _verbose_opt="" -_wave_opt="" -_hydro_opt="" _build_job_max=20 _quick_kill="NO" _ufs_exec="-e gfs_model.x" # Reset option counter in case this script is sourced OPTIND=1 -while getopts ":a:dfghj:kuvwy" option; do +while getopts ":a:dfhkv" option; do case "${option}" in a) _build_ufs_opt+="-a ${OPTARG} ";; f) _build_ufs_opt+="-f ";; d) _build_debug="-d" ;; - g) _build_gsi="YES" ;; h) _usage;; - j) _build_job_max="${OPTARG} ";; k) _quick_kill="YES" ;; - u) _build_ufsda="YES" ;; - v) _verbose_opt="-v";; - w) _wave_opt="-w"; _ufs_exec="-e gefs_model.x";; - y) _hydro_opt="-y"; _ufs_exec="-e sfs_model.x";; + v) _verbose_opt="-v" ;; :) echo "[${BASH_SOURCE[0]}]: ${option} requires an argument" _usage @@ -82,20 +64,91 @@ while getopts ":a:dfghj:kuvwy" option; do ;; esac done - shift $((OPTIND-1)) +# If no build system was specified, build for gfs forecast-only +if [[ $# -eq 0 ]]; then + selected_systems="gfs" +else + selected_systems="$*" +fi + +supported_systems=("gfs" "gefs" "sfs" "gsi" "gdas" "all") + +declare -A system_builds +system_builds=( + ["gfs"]="ufs_gfs gfs_utils ufs_utils upp ww3_gfs" + ["gefs"]="ufs_gefs gfs_utils ufs_utils upp ww3_gefs" + ["sfs"]="ufs_sfs gfs_utils ufs_utils upp ww3_gefs" + ["gsi"]="gsi_enkf 
gsi_monitor gsi_utils" + ["gdas"]="gdas gsi_monitor gsi_utils" + ["all"]="ufs_gfs gfs_utils ufs_utils upp ww3_gfs ufs_gefs ufs_sfs ww3_gefs gdas gsi_enkf gsi_monitor gsi_utils" +) + logs_dir="${HOMEgfs}/sorc/logs" if [[ ! -d "${logs_dir}" ]]; then echo "Creating logs folder" mkdir -p "${logs_dir}" || exit 1 fi -# Check final exec folder exists -if [[ ! -d "${HOMEgfs}/exec" ]]; then - echo "Creating ${HOMEgfs}/exec folder" - mkdir -p "${HOMEgfs}/exec" -fi +# Jobs per build ("min max") +declare -A build_jobs build_opts build_scripts +build_jobs=( + ["ufs_gfs"]=8 ["ufs_gefs"]=8 ["ufs_sfs"]=8 ["gdas"]=8 ["gsi_enkf"]=2 ["gfs_utils"]=1 ["ufs_utils"]=1 + ["ww3_gfs"]=1 ["ww3_gefs"]=1 ["gsi_utils"]=1 ["gsi_monitor"]=1 ["gfs_utils"]=1 ["upp"]=1 +) + +# Establish build options for each job +_gfs_exec="gfs_model.x" +_gefs_exec="gefs_model.x" +_sfs_exec="sfs_model.x" +build_opts=( + ["ufs_gfs"]="${wave_opt} ${_build_ufs_opt} ${_verbose_opt} ${_build_debug} -e ${_gfs_exec}" + ["ufs_gefs"]="${wave_opt} ${_build_ufs_opt} ${_verbose_opt} ${_build_debug} -e ${_gefs_exec}" + ["ufs_sfs"]="${wave_opt} ${_build_ufs_opt} ${_verbose_opt} ${_build_debug} -e ${_sfs_exec}" + ["upp"]="${_build_debug}" + ["ww3_gfs"]="${_verbose_opt} ${_build_debug}" + ["ww3_gefs"]="-w ${_verbose_opt} ${_build_debug}" + ["gdas"]="${_verbose_opt} ${_build_debug}" + ["ufs_utils"]="${_verbose_opt} ${_build_debug}" + ["gfs_utils"]="${_verbose_opt} ${_build_debug}" + ["gsi_utils"]="${_verbose_opt} ${_build_debug}" + ["gsi_enkf"]="${_verbose_opt} ${_build_debug}" + ["gsi_monitor"]="${_verbose_opt} ${_build_debug}" +) + +# Set the build script name for each build +build_scripts=( + ["ufs_gfs"]="build_ufs.sh" + ["ufs_gefs"]="build_ufs.sh" + ["ufs_sfs"]="build_ufs.sh" + ["gdas"]="build_gdas.sh" + ["gsi_enkf"]="build_gsi_enkf.sh" + ["gfs_utils"]="build_gfs_utils.sh" + ["ufs_utils"]="build_ufs_utils.sh" + ["ww3_gfs"]="build_ww3prepost.sh" + ["ww3_gefs"]="build_ww3prepost.sh" + ["gsi_utils"]="build_gsi_utils.sh" + 
["gsi_monitor"]="build_gsi_monitor.sh" + ["gfs_utils"]="build_gfs_utils.sh" + ["upp"]="build_upp.sh" +) + +# Check the requested systems to make sure we can build them +declare -A builds +system_count=0 +for system in ${selected_systems}; do + # shellcheck disable=SC2076 + if [[ " ${supported_systems[*]} " =~ " ${system} " ]]; then + (( system_count += 1 )) + for build in ${system_builds["${system}"]}; do + builds["${build}"]="yes" + done + else + echo "Unsupported build system: ${system}" + _usage + fi +done #------------------------------------ # GET MACHINE @@ -108,6 +161,9 @@ if [[ -z "${MACHINE_ID}" ]]; then exit 1 fi +# Create the log directory +mkdir -p "${HOMEgfs}/sorc/logs" + #------------------------------------ # SOURCE BUILD VERSION FILES #------------------------------------ @@ -123,87 +179,18 @@ ERRSCRIPT=${ERRSCRIPT:-'eval [[ $errs = 0 ]]'} # shellcheck disable= errs=0 -declare -A build_jobs -declare -A build_opts - #------------------------------------ # Check which builds to do and assign # of build jobs #------------------------------------ -# Mandatory builds, unless otherwise specified, for the UFS -big_jobs=0 -build_jobs["ufs"]=8 -big_jobs=$((big_jobs+1)) -build_opts["ufs"]="${_wave_opt} ${_hydro_opt} ${_verbose_opt} ${_build_ufs_opt} ${_build_debug} ${_ufs_exec}" - -build_jobs["upp"]=1 -build_opts["upp"]="${_build_debug}" - -build_jobs["ufs_utils"]=1 -build_opts["ufs_utils"]="${_verbose_opt} ${_build_debug}" - -build_jobs["gfs_utils"]=1 -build_opts["gfs_utils"]="${_verbose_opt} ${_build_debug}" - -build_jobs["ww3prepost"]=1 -build_opts["ww3prepost"]="${_wave_opt} ${_verbose_opt} ${_build_ufs_opt} ${_build_debug}" - -# Optional DA builds -if [[ "${_build_ufsda}" == "YES" ]]; then - if [[ "${MACHINE_ID}" != "orion" && "${MACHINE_ID}" != "hera" && "${MACHINE_ID}" != "hercules" && "${MACHINE_ID}" != "wcoss2" && "${MACHINE_ID}" != "noaacloud" && "${MACHINE_ID}" != "gaeac5" && "${MACHINE_ID}" != "gaeac6" ]]; then - echo "NOTE: The GDAS App is not 
supported on ${MACHINE_ID}. Disabling build." - else - build_jobs["gdas"]=8 - big_jobs=$((big_jobs+1)) - build_opts["gdas"]="${_verbose_opt} ${_build_debug}" - fi -fi -if [[ "${_build_gsi}" == "YES" ]]; then - build_jobs["gsi_enkf"]=2 - build_opts["gsi_enkf"]="${_verbose_opt} ${_build_debug}" -fi -if [[ "${_build_gsi}" == "YES" || "${_build_ufsda}" == "YES" ]] ; then - build_jobs["gsi_utils"]=1 - build_opts["gsi_utils"]="${_verbose_opt} ${_build_debug}" - build_jobs["gsi_monitor"]=1 - build_opts["gsi_monitor"]="${_verbose_opt} ${_build_debug}" -fi - -# Go through all builds and adjust CPU counts down if necessary -requested_cpus=0 -build_list="" -for build in "${!build_jobs[@]}"; do - if [[ -z "${build_list}" ]]; then - build_list="${build}" - else - build_list="${build_list}, ${build}" - fi - if [[ ${build_jobs[${build}]} -gt ${_build_job_max} ]]; then - build_jobs[${build}]=${_build_job_max} - fi - requested_cpus=$(( requested_cpus + build_jobs[${build}] )) -done - echo "Building ${build_list}" -# Go through all builds and adjust CPU counts up if possible -if [[ ${requested_cpus} -lt ${_build_job_max} && ${big_jobs} -gt 0 ]]; then - # Add cores to the gdas and ufs build jobs - extra_cores=$(( _build_job_max - requested_cpus )) - extra_cores=$(( extra_cores / big_jobs )) - for build in "${!build_jobs[@]}"; do - if [[ "${build}" == "gdas" || "${build}" == "ufs" ]]; then - build_jobs[${build}]=$(( build_jobs[${build}] + extra_cores )) - fi - done -fi - procs_in_use=0 declare -A build_ids check_builds() { - for chk_build in "${!build_jobs[@]}"; do + for chk_build in "${!builds[@]}"; do # Check if the build is complete and if so what the status was if [[ -n "${build_ids[${chk_build}]+0}" ]]; then if ! ps -p "${build_ids[${chk_build}]}" > /dev/null; then @@ -213,7 +200,7 @@ check_builds() echo "build_${chk_build}.sh failed! Exiting!" echo "Check logs/build_${chk_build}.log for details." 
echo "logs/build_${chk_build}.log" > "${HOMEgfs}/sorc/logs/error.logs" - for kill_build in "${!build_jobs[@]}"; do + for kill_build in "${!builds[@]}"; do if [[ -n "${build_ids[${kill_build}]+0}" ]]; then pkill -P "${build_ids[${kill_build}]}" fi @@ -228,15 +215,15 @@ check_builds() builds_started=0 # Now start looping through all of the jobs until everything is done -while [[ ${builds_started} -lt ${#build_jobs[@]} ]]; do - for build in "${!build_jobs[@]}"; do +while [[ ${builds_started} -lt ${#builds[@]} ]]; do + for build in "${!builds[@]}"; do # Has the job started? if [[ -n "${build_jobs[${build}]+0}" && -z "${build_ids[${build}]+0}" ]]; then # Do we have enough processors to run it? if [[ ${_build_job_max} -ge $(( build_jobs[build] + procs_in_use )) ]]; then # double-quoting build_opts here will not work since it is a string of options #shellcheck disable=SC2086 - "./build_${build}.sh" ${build_opts[${build}]:-} -j "${build_jobs[${build}]}" > \ + "./${build_scripts[${build}]}" ${build_opts[${build}]:-} -j "${build_jobs[${build}]}" > \ "${logs_dir}/build_${build}.log" 2>&1 & build_ids["${build}"]=$! echo "Starting build_${build}.sh" @@ -249,7 +236,7 @@ while [[ ${builds_started} -lt ${#build_jobs[@]} ]]; do # Also recalculate how many processors are in use to account for completed builds builds_started=0 procs_in_use=0 - for build in "${!build_jobs[@]}"; do + for build in "${!builds[@]}"; do # Has the build started? 
if [[ -n "${build_ids[${build}]+0}" ]]; then builds_started=$(( builds_started + 1)) @@ -275,7 +262,7 @@ done # Wait for all jobs to complete and check return statuses -while [[ "${#build_jobs[@]}" -gt 0 ]]; do +while [[ "${#builds[@]}" -gt 0 ]]; do # If requested, check if any build has failed and exit if so if [[ "${_quick_kill}" == "YES" ]]; then @@ -286,7 +273,7 @@ while [[ "${#build_jobs[@]}" -gt 0 ]]; do fi fi - for build in "${!build_jobs[@]}"; do + for build in "${!builds[@]}"; do # Test if each job is complete and if so, notify and remove from the array if [[ -n "${build_ids[${build}]+0}" ]]; then if ! ps -p "${build_ids[${build}]}" > /dev/null; then @@ -294,14 +281,14 @@ while [[ "${#build_jobs[@]}" -gt 0 ]]; do build_stat=$? errs=$((errs+build_stat)) if [[ ${build_stat} == 0 ]]; then - echo "build_${build}.sh completed successfully!" + echo "${build_scripts[${build}]} completed successfully!" else - echo "build_${build}.sh failed with status ${build_stat}!" + echo "${build_scripts[${build}]} failed with status ${build_stat}!" fi # Remove the completed build from the list of PIDs unset 'build_ids[${build}]' - unset 'build_jobs[${build}]' + unset 'builds[${build}]' fi fi done diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh new file mode 100755 index 0000000000..794b4fa350 --- /dev/null +++ b/sorc/build_compute.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash + +function _usage() { + cat << EOF +Builds all of the global-workflow components on compute nodes. + +Usage: ${BASH_SOURCE[0]} [-h][-v][-A ] [ gfs gefs sfs gsi gdas all] + -h: + Print this help message and exit + -v: + Verbose mode + -A: + HPC account to use for the compute-node builds + (default is \$HOMEgfs/ci/platforms/config.\$machine:\$HPC_ACCOUNT) + + Input arguments are the system(s) to build. + Valid options are + "gfs", "gefs", "sfs", "gsi", "gdas", or "all". 
+ (default is "gfs") +EOF + exit 1 +} +# This script launches compute-node builds of selected submodules +# Two positional arguments are accepted: + +set -eu + +rocoto_verbose_opt="" +verbose="NO" +build_xml="build.xml" +build_db="build.db" +build_lock_db="build_lock.db" + +OPTIND=1 +while getopts ":hA:v" option; do + case "${option}" in + h) _usage;; + A) export HPC_ACCOUNT="${OPTARG}" ;; + v) verbose="YES" && rocoto_verbose_opt="-v10";; + :) + echo "[${BASH_SOURCE[0]}]: ${option} requires an argument" + _usage + ;; + *) + echo "[${BASH_SOURCE[0]}]: Unrecognized option: ${option}" + _usage + ;; + esac +done +shift $((OPTIND-1)) + +# Set build system to gfs if not specified +if [[ $# -eq 0 ]]; then + systems="gfs" +else + systems=$* +fi + +if [[ "${verbose}" == "YES" ]]; then + set -x +fi + +# shellcheck disable=SC2155,SC2312 +HOMEgfs=$(cd "$(dirname "$(readlink -f -n "${BASH_SOURCE[0]}" )" )/.." && pwd -P) +cd "${HOMEgfs}/sorc" || exit 1 + +# Delete the rocoto XML and database if they exist +rm -f "${build_xml}" "${build_db}" "${build_lock_db}" + +echo "Sourcing global-workflow modules ..." +source "${HOMEgfs}/workflow/gw_setup.sh" + +echo "Generating build.xml for building global-workflow programs on compute nodes ..." +# Catch errors manually from here out +set +e +"${HOMEgfs}/workflow/build_compute.py" --yaml "${HOMEgfs}/workflow/build_opts.yaml" --systems "${systems}" +rc=$? +if (( rc != 0 )); then + echo "FATAL ERROR: ${BASH_SOURCE[0]} failed to create 'build.xml' with error code ${rc}" + exit 1 +fi + +echo "Launching builds in parallel on compute nodes ..." +runcmd="rocotorun -w ${build_xml} -d ${build_db} ${rocoto_verbose_opt}" + +finished=false +${runcmd} +echo "Running builds on compute nodes" +while [[ "${finished}" == "false" ]]; do + sleep 3m + ${runcmd} + state="$("${HOMEgfs}/ci/scripts/utils/rocotostat.py" -w "${build_xml}" -d "${build_db}")" + if [[ "${verbose_opt}" == "true" ]]; then + echo "Rocoto is in state ${state}" + else + echo -n "." 
+ fi + + if [[ "${state}" == "DONE" ]]; then + finished=true + elif [[ "${state}" == "RUNNING" ]]; then + finished=false + elif [[ "${state}" == "DEAD" ]]; then + echo "FATAL ERROR: ${BASH_SOURCE[0]} one or more builds failed!" + # TODO add capability to determine which build(s) failed + exit 2 + else + echo "FATAL ERROR: ${BASH_SOURCE[0]} rocoto failed with state '${state}'" + exit 3 + fi +done + +echo "All builds completed successfully!" + +exit 0 diff --git a/sorc/build_ufs.sh b/sorc/build_ufs.sh index 773c104be3..3b0b3ed638 100755 --- a/sorc/build_ufs.sh +++ b/sorc/build_ufs.sh @@ -12,7 +12,7 @@ EXEC_NAME="gfs_model.x" while getopts ":da:fj:e:vwy" option; do case "${option}" in - d) BUILD_TYPE="Debug";; + d) BUILD_TYPE="DEBUG";; a) APP="${OPTARG}";; f) FASTER="ON";; j) BUILD_JOBS="${OPTARG}";; diff --git a/sorc/build_upp.sh b/sorc/build_upp.sh index e217e171db..15e2dfb146 100755 --- a/sorc/build_upp.sh +++ b/sorc/build_upp.sh @@ -26,6 +26,31 @@ if [[ ! -d "../exec" ]]; then mkdir -p ../exec fi +# The UPP does not load a cmake module and the WCOSS2 compute nodes do not have cmake in PATH by default +# Add cmake to the default modules if the command isn't found +# TODO remove this workaround when issue NOAA-EMC/UPP#1106 is addressed. +if ! command -v cmake >& /dev/null; then + export COMPILER="intel" + if [[ -z ${HOMEgfs+x} ]]; then + # shellcheck disable=SC2155 + readonly HOMEgfs=$(cd "$(dirname "$(readlink -f -n "${BASH_SOURCE[0]}" )" )/.." && pwd -P) + fi + source "${HOMEgfs}/ush/detect_machine.sh" + if [[ "${MACHINE_ID}" == "wcoss2" ]]; then + set +x + module try-load cmake + + if module is-loaded cmake; then + LMOD_SYSTEM_DEFAULT_MODULES="${LMOD_SYSTEM_DEFAULT_MODULES} cmake" + echo "Added cmake to the default modules" + else + echo "FATAL ERROR Could not find cmake or a cmake module!" 
+ exit 2 + fi + set -x + fi +fi + cd ufs_model.fd/FV3/upp/tests # shellcheck disable=SC2086 BUILD_JOBS=${BUILD_JOBS:-8} ./compile_upp.sh ${_opts} diff --git a/ush/interp_atmos_master.sh b/ush/interp_atmos_master.sh index 4c4ee4b03c..3a3edc470b 100755 --- a/ush/interp_atmos_master.sh +++ b/ush/interp_atmos_master.sh @@ -53,7 +53,11 @@ export err=$?; err_chk # trim and mask for all grids for grid in "${grids[@]}"; do trim_rh "${output_file_prefix}_${grid}"; export err=$?; err_chk - mod_icec "${output_file_prefix}_${grid}"; export err=$?; err_chk + # shellcheck disable=SC2312 + var_count=$(${WGRIB2} "${output_file_prefix}_${grid}" -match "LAND|ICEC" |wc -l) + if [[ "${var_count}" -eq 2 ]]; then + mod_icec "${output_file_prefix}_${grid}"; export err=$?; err_chk + fi done exit 0 diff --git a/versions/build.noaacloud.ver b/versions/build.noaacloud.ver index fc288b76b5..b5fd272b4b 100644 --- a/versions/build.noaacloud.ver +++ b/versions/build.noaacloud.ver @@ -1,5 +1,5 @@ -export stack_intel_ver=2021.3.0 -export stack_impi_ver=2021.3.0 +export stack_intel_ver=2021.10.0 +export stack_impi_ver=2021.10.0 export spack_env=gsi-addon-env source "${HOMEgfs:-}/versions/spack.ver" -export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" +export spack_mod_path="/contrib/spack-stack-rocky8/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" diff --git a/versions/run.noaacloud.ver b/versions/run.noaacloud.ver index 1fc3779b2e..98ec2b36f9 100644 --- a/versions/run.noaacloud.ver +++ b/versions/run.noaacloud.ver @@ -1,8 +1,8 @@ -export stack_intel_ver=2021.3.0 -export stack_impi_ver=2021.3.0 +export stack_intel_ver=2021.10.0 +export stack_impi_ver=2021.10.0 export spack_env=gsi-addon-env source "${HOMEgfs:-}/versions/spack.ver" -export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" +export 
spack_mod_path="/contrib/spack-stack-rocky8/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" export cdo_ver=2.2.0 diff --git a/workflow/applications/gfs_cycled.py b/workflow/applications/gfs_cycled.py index 543d7a9d8c..5ecfddf276 100644 --- a/workflow/applications/gfs_cycled.py +++ b/workflow/applications/gfs_cycled.py @@ -317,7 +317,9 @@ def get_task_names(self): task_names[run].append('echgres') if 'gdas' in run else 0 task_names[run] += ['ediag'] if options['lobsdiag_forenkf'] else ['eomg'] task_names[run].append('esnowanl') if options['do_jedisnowda'] and 'gdas' in run else 0 + task_names[run].append('efcs') if 'gdas' in run else 0 + task_names[run].append('epos') if 'gdas' in run else 0 - task_names[run] += ['stage_ic', 'ecen', 'esfc', 'efcs', 'epos', 'earc', 'cleanup'] + task_names[run] += ['stage_ic', 'ecen', 'esfc', 'earc', 'cleanup'] return task_names diff --git a/workflow/build_compute.py b/workflow/build_compute.py new file mode 100755 index 0000000000..7787e9ad40 --- /dev/null +++ b/workflow/build_compute.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 + +""" +Entry point for setting up a compute-node build +""" + +import os +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter +from typing import Dict + +from wxflow import parse_yaml, AttrDict + +from hosts import Host +import rocoto.rocoto as rocoto + + +_here = os.path.dirname(__file__) +HOMEgfs = os.path.abspath(os.path.join(os.path.abspath(_here), '..')) + + +def input_args(*argv): + """ + Method to collect user arguments for `build_compute.py` + """ + + description = """ + Setup files and directories to start a compute build.
+ """ + + parser = ArgumentParser(description=description, + formatter_class=ArgumentDefaultsHelpFormatter) + + parser.add_argument('--yaml', help='Input YAML file', + type=str, required=False, default='build_opts.yaml') + parser.add_argument('--account', help='HPC account to use; default is host-dependent', required=False, default=os.getenv('HPC_ACCOUNT')) + parser.add_argument('--systems', help='System(s) to build (options: gfs, gefs, sfs, gsi, gdas, or all)', required=False, default='gfs') + + inputs = parser.parse_args(list(*argv) if len(argv) else None) + + return inputs + + +def get_task_spec(task_name: str, task_spec: Dict, host_spec: Dict) -> Dict: + """ + Generate a task specification dictionary for a given task. + + Parameters + ---------- + task_name: str + The name of the task. + task_spec: Dict + The specification of the task, containing command, walltime, and cores. + host_spec: Dict + The specification of the host, containing account, queue, partition, and native. + + Returns: + -------- + task_dict: Dict + A dictionary containing the task specification, including resources and other task-related information. 
+ """ + + task_dict = AttrDict() + task_dict.task_name = task_name + task_dict.cycledef = "build" + task_dict.maxtries = 1 + task_dict.command = f"cd {HOMEgfs}/sorc/; {task_spec.command}" + task_dict.job_name = task_name + task_dict.log = f"{HOMEgfs}/sorc/logs/{task_name}.log" + + task_dict.resources = AttrDict() + task_dict.resources.account = host_spec.account + task_dict.resources.queue = host_spec.queue + task_dict.resources.partition = host_spec.partition + task_dict.resources.walltime = task_spec.walltime + task_dict.resources.native = host_spec.native + task_dict.resources.memory = None + task_dict.resources.nodes = 1 + task_dict.resources.ntasks = task_spec.cores + task_dict.resources.ppn = task_spec.cores + task_dict.resources.threads = 1 + + return task_dict + + +def get_host_specs(host: Dict) -> Dict: + """Generate host specs for the build.xml file based on Host() info + + Parameters + ---------- + host : Dict + Host information returned by Host() + + Returns + ------- + specs: Dict + Consolidated compute specifics needed for the XML + """ + + native = None + partition = None + + if host.info.SCHEDULER in ['pbspro']: + native = '-l place=vscatter' + elif host.info.SCHEDULER in ['slurm']: + native = '--export=NONE' + if host.info.PARTITION_BATCH not in [""]: + partition = host.info.PARTITION_BATCH + + if host.info.RESERVATION not in [""]: + native += f' --reservation={host.info.RESERVATION}' + + if host.info.CLUSTERS not in [""]: + native += f' --clusters={host.info.CLUSTERS}' + + specs = AttrDict() + specs.scheduler = host.info.SCHEDULER + specs.account = host.info.ACCOUNT + specs.queue = host.info.QUEUE + specs.partition = partition + specs.native = native + + return specs + + +def main(*argv): + + user_inputs = input_args(*argv) + host_specs = get_host_specs(Host()) + + # Update the default host account if the user supplied one + if user_inputs.account is not None: + host_specs.account = user_inputs.account + + build_specs = 
AttrDict(parse_yaml(user_inputs.yaml)) + + systems = user_inputs.systems.split() if "all" not in user_inputs.systems else ["all"] + + # Determine systems to build + builds = set() + if systems[0] == "all": + builds = build_specs.build + else: + builds.update(build_specs.systems["common"]) + try: + for system in systems: + builds.update(build_specs.systems[system]) + except KeyError as e: + raise KeyError(f"{system} is not a valid global-workflow system!") from e + + # Build the task specs from the build specs and host specs + task_specs = AttrDict() + for task_name, task_spec in build_specs.build.items(): + if task_name in builds: + task_specs[task_name] = get_task_spec(task_name, task_spec, host_specs) + + # Start building the XML + strings = ['', + '', + f'', + f'\t{HOMEgfs}/sorc/logs/build.log', + '\t190001010000 190001010000 24:00:00', + '\n'] + xml_header = '\n'.join(strings) + xml_footer = '\n\n' + + task_list = [] + for _, task_spec in task_specs.items(): + task_list.append(rocoto.create_task(task_spec)) + xml_tasks = '\n'.join(task_list) + + xml = ''.join([xml_header, xml_tasks, xml_footer]) + xml_file = f"{HOMEgfs}/sorc/build.xml" + with open(xml_file, 'w') as fh: + fh.write(xml) + + +if __name__ == '__main__': + main() diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml new file mode 100644 index 0000000000..464701c2f3 --- /dev/null +++ b/workflow/build_opts.yaml @@ -0,0 +1,94 @@ +systems: + common: + - "ufs_utils" + - "gfs_utils" + - "upp" + gfs: + - "gfs_model" + - "gfs_ww3prepost" + gsi: + - "gsi_enkf" + - "gsi_utils" + - "gsi_monitor" + gdas: + - "gdas" + - "gsi_utils" + - "gsi_monitor" + gefs: + - "gefs_model" + - "gefs_ww3_prepost" + sfs: + - "sfs_model" + - "gefs_ww3_prepost" +build: + gfs_model: + command: "./build_ufs.sh -e gfs_model.x -j 12" + log: "build_ufs_gfs.log" + cores: 12 + walltime: "00:30:00" + + gfs_ww3prepost: + command: "./build_ww3prepost.sh -j 4" + log: "build_ww3prepost_gfs.log" + cores: 4 + walltime: "00:10:00" + + 
gefs_model: + command: "./build_ufs.sh -w -e gefs_model.x -j 12" + log: "build_ufs_gefs.log" + cores: 12 + walltime: "00:30:00" + + gefs_ww3_prepost: + command: "./build_ww3prepost.sh -w -j 4" + log: "build_ww3prepost_gefs.log" + cores: 4 + walltime: "00:10:00" + + sfs_model: + command: "./build_ufs.sh -y -e sfs_model.x -j 12" + log: "build_ufs_sfs.log" + cores: 12 + walltime: "00:30:00" + + upp: + command: "./build_upp.sh -j 8" + log: "build_upp.log" + cores: 8 + walltime: "00:10:00" + + gsi_enkf: + command: "./build_gsi_enkf.sh -j 8" + log: "build_gsi_enkf.log" + cores: 8 + walltime: "00:15:00" + + gsi_monitor: + command: "./build_gsi_monitor.sh -j 4" + log: "build_gsi_monitor.log" + cores: 4 + walltime: "00:10:00" + + gsi_utils: + command: "./build_gsi_utils.sh -j 6" + log: "build_gsi_utils.log" + cores: 6 + walltime: "00:10:00" + + ufs_utils: + command: "./build_ufs_utils.sh -j 8" + log: "build_ufs_utils.log" + cores: 8 + walltime: "00:10:00" + + gfs_utils: + command: "./build_gfs_utils.sh -j 6" + log: "build_gfs_utils.log" + cores: 6 + walltime: "00:10:00" + + gdas: + command: "./build_gdas.sh -j 12" + log: "build_gdas.log" + cores: 12 + walltime: "01:00:00" diff --git a/workflow/generate_workflows.sh b/workflow/generate_workflows.sh index c98fa3028a..a5615a8b0d 100755 --- a/workflow/generate_workflows.sh +++ b/workflow/generate_workflows.sh @@ -19,11 +19,6 @@ function _usage() { -b Run build_all.sh with default flags (build the UFS, UPP, UFS_Utils, and GFS-utils only - -B "build flags" - Run build_all.sh with the build specified flags. Refer to - build_all.sh -h for a list of valid flags. - NOTE: the list of build flags MUST be in quotes. - -u Update submodules before building and/or generating experiments. -y "list of YAMLs to run" @@ -37,13 +32,12 @@ function _usage() { -G Run all valid GFS cases in the specified YAML directory. If -b is specified, then "-g -u" (build the GSI and GDASApp) - will be passed to build_all.sh unless -B is also specified. 
+ will be passed to build_all.sh. Note that these builds are disabled on some systems, which will result in a warning from build_all.sh. -E Run all valid GEFS cases in the specified YAML directory. - If -b is specified, then "-w" will be passed to build_all.sh - unless -B is also specified. + If -b is specified, then "gefs" will be passed to build_all.sh. -S (Not yet supported!) Run all valid SFS cases in the specified YAML directory. @@ -91,7 +85,6 @@ HOMEgfs="" _specified_home=false _build=false _build_flags="" -_explicit_build_flags=false _update_submods=false declare -a _yaml_list=("C48_ATM") _specified_yaml_list=false @@ -126,7 +119,6 @@ while [[ $# -gt 0 && "$1" != "--" ]]; do fi ;; b) _build=true ;; - B) _build_flags="${OPTARG}" && _explicit_build_flags=true ;; u) _update_submods=true ;; y) # Start over with an empty _yaml_list declare -a _yaml_list=() @@ -231,18 +223,6 @@ else done fi -# Test if multiple "run_all" options were set -_count_run_alls=0 -[[ "${_run_all_gfs}" == "true" ]] && ((_count_run_alls+=1)) -[[ "${_run_all_gefs}" == "true" ]] && ((_count_run_alls+=1)) -[[ "${_run_all_sfs}" == "true" ]] && ((_count_run_alls+=1)) - -if (( _count_run_alls > 1 )) ; then - echo "Only one run all option (-G -E -S) may be specified" - echo "Rerun with just one option and/or with -h for usage examples" - exit 5 -fi - # If -S is specified, exit (for now). # TODO when SFS tests come online, enable this option. if [[ "${_run_all_sfs}" == "true" ]]; then @@ -277,7 +257,7 @@ function select_all_yamls() # Bash cannot return an array from a function and any edits are descoped at # the end of the function, so use a nameref instead.
- local -n _nameref_yaml_list='_yaml_list' + local -n _nameref_yaml_list="${2}" if [[ "${_specified_yaml_list}" == false ]]; then # Start over with an empty _yaml_list @@ -328,21 +308,20 @@ EOM # Check if running all GEFS cases if [[ "${_run_all_gefs}" == "true" ]]; then # Append -w to build_all.sh flags if -E was specified - if [[ "${_explicit_build_flags}" == "false" && "${_build}" == "true" ]]; then - _build_flags="-w" - fi + _build_flags="${_build_flags} gefs " - select_all_yamls "gefs" + declare -a _gefs_yaml_list + select_all_yamls "gefs" "_gefs_yaml_list" + _yaml_list=("${_yaml_list[@]}" "${_gefs_yaml_list[@]}") fi -# Check if running all SFS cases +# Check if running all GFS cases if [[ "${_run_all_gfs}" == "true" ]]; then - # Append -g -u to build_all.sh flags if -G was specified - if [[ "${_explicit_build_flags}" == "false" && "${_build}" == "true" ]]; then - _build_flags="-g -u" - fi + _build_flags="${_build_flags} gfs " - select_all_yamls "gfs" + declare -a _gfs_yaml_list + select_all_yamls "gfs" "_gfs_yaml_list" + _yaml_list=("${_yaml_list[@]}" "${_gfs_yaml_list[@]}") fi # Loading modules sometimes raises unassigned errors, so disable checks @@ -397,7 +376,7 @@ if [[ "${_build}" == "true" ]]; then printf "Building via build_all.sh %s\n\n" "${_build_flags}" # Let the output of build_all.sh go to stdout regardless of verbose options #shellcheck disable=SC2086,SC2248 - ${HOMEgfs}/sorc/build_all.sh ${_build_flags} ${_verbose_flag} + ${HOMEgfs}/sorc/build_all.sh ${_verbose_flag} ${_build_flags} fi # Link the workflow silently unless there's an error diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index b98c838faa..c80800725a 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -27,5 +27,5 @@ MAKE_ACFTBUFR: 'NO' DO_TRACKER: 'NO' DO_GENESIS: 'NO' DO_METP: 'NO' -SUPPORT_WAVES: 'NO' -SUPPORTED_RESOLUTIONS: ['C48', 'C96'] # TODO: Test and support all cubed-sphere resolutions. 
+SUPPORTED_RESOLUTIONS: ['C48', 'C96', 'C192', 'C384', 'C768'] # TODO: Test and support all cubed-sphere resolutions. +AERO_INPUTS_DIR: /contrib/global-workflow-shared-data/data/gocart_emissions diff --git a/workflow/hosts/azurepw.yaml b/workflow/hosts/azurepw.yaml index 4725e28962..d7c064dc60 100644 --- a/workflow/hosts/azurepw.yaml +++ b/workflow/hosts/azurepw.yaml @@ -24,5 +24,7 @@ LOCALARCH: 'NO' ATARDIR: '' # TODO: This will not yet work from AZURE. MAKE_NSSTBUFR: 'NO' MAKE_ACFTBUFR: 'NO' -SUPPORT_WAVES: 'NO' -SUPPORTED_RESOLUTIONS: ['C48', 'C96'] # TODO: Test and support all cubed-sphere resolutions. +DO_TRACKER: 'NO' +DO_GENESIS: 'NO' +DO_METP: 'NO' +SUPPORTED_RESOLUTIONS: ['C48', 'C96', 'C384', 'C768'] # TODO: Test and support all cubed-sphere resolutions. diff --git a/workflow/hosts/googlepw.yaml b/workflow/hosts/googlepw.yaml index 1b979b6bc9..8ba8e18e74 100644 --- a/workflow/hosts/googlepw.yaml +++ b/workflow/hosts/googlepw.yaml @@ -24,5 +24,7 @@ LOCALARCH: 'NO' ATARDIR: '' # TODO: This will not yet work from GOOGLE. MAKE_NSSTBUFR: 'NO' MAKE_ACFTBUFR: 'NO' -SUPPORT_WAVES: 'NO' -SUPPORTED_RESOLUTIONS: ['C48', 'C96'] # TODO: Test and support all cubed-sphere resolutions. +DO_TRACKER: 'NO' +DO_GENESIS: 'NO' +DO_METP: 'NO' +SUPPORTED_RESOLUTIONS: ['C48', 'C96', 'C384'] # TODO: Test and support all cubed-sphere resolutions. 
diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index 54870b79cc..9b6f712380 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -2896,7 +2896,10 @@ def _get_eposgroups(epos): def earc(self): deps = [] - dep_dict = {'type': 'metatask', 'name': f'{self.run}_epmn'} + if 'enkfgdas' in self.run: + dep_dict = {'type': 'metatask', 'name': f'{self.run}_epmn'} + else: + dep_dict = {'type': 'task', 'name': f'{self.run}_esfc'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep=deps) diff --git a/workflow/setup_expt.py b/workflow/setup_expt.py index 574dc0d91a..09bc1c90ac 100755 --- a/workflow/setup_expt.py +++ b/workflow/setup_expt.py @@ -372,7 +372,6 @@ def query_and_clean(dirname, force_clean=False): def validate_user_request(host, inputs): supp_res = host.info['SUPPORTED_RESOLUTIONS'] - supp_waves = host.info.get('SUPPORT_WAVES', 'YES') machine = host.machine for attr in ['resdetatmos', 'resensatmos']: try: @@ -382,9 +381,6 @@ def validate_user_request(host, inputs): if expt_res not in supp_res: raise NotImplementedError(f"Supported resolutions on {machine} are:\n{', '.join(supp_res)}") - if "W" in inputs.app and supp_waves == "NO": - raise NotImplementedError(f"Waves are not supported on {machine}") - def get_ocean_resolution(resdetatmos): """