# Patch notes for job_scripts/perlmutter/process.xrb:
#  - job_files is filled by ONE find matching "*.slurm" or "*.submit"; two adjacent $(...) separated by a space would be parsed as "VAR=value command" and try to run the *.submit match.
#  - the htar file list passes ${diag_files} (the "*diag.out" matches); ${probin_files} is no longer defined by this change.
diff --git a/job_scripts/perlmutter/process.xrb b/job_scripts/perlmutter/process.xrb index 46cb632..25972d9 100755 --- a/job_scripts/perlmutter/process.xrb +++ b/job_scripts/perlmutter/process.xrb @@ -17,7 +17,7 @@ work_dir=`pwd` HPSS_DIR=`basename $work_dir` # set HTAR command -HTAR=/usr/bin/htar +HTAR=htar # path to the ftime executable -- used for making a simple ftime.out file # listing the name of the plotfile and its simulation time @@ -229,12 +229,12 @@ function process_files datestr=$(date +"%Y%m%d_%H%M_%S") ftime_files=$(find . -maxdepth 1 -name "ftime.out" -print) inputs_files=$(find . -maxdepth 1 -name "inputs*" -print) -probin_files=$(find . -maxdepth 1 -name "probin*" -print) +diag_files=$(find . -maxdepth 1 -name "*diag.out" -print) model_files=$(find . -maxdepth 1 -name "*.hse.*" -print) -slurm_files=$(find . -maxdepth 1 -name "*.slurm" -print) +job_files=$(find . -maxdepth 1 \( -name "*.slurm" -o -name "*.submit" \) -print) process_files=$(find . -maxdepth 1 -name "process*" -print) -${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${model_files} ${ftime_files} ${inputs_files} ${probin_files} ${slurm_files} ${process_files} >> /dev/null +${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${model_files} ${ftime_files} ${inputs_files} ${diag_files} ${job_files} ${process_files} >> /dev/null # Loop, waiting for plt and chk directories to appear. diff --git a/sphinx_docs/source/nersc-hpss.rst b/sphinx_docs/source/nersc-hpss.rst index 1e1c15e..4a3dae1 100644 --- a/sphinx_docs/source/nersc-hpss.rst +++ b/sphinx_docs/source/nersc-hpss.rst @@ -67,21 +67,27 @@ The following describes how to use the scripts: overwriting the stored copy, especially if a purge took place. The same is done with checkpoint files. 
+Some additional notes: -Additionally, if the ``ftime`` executable is in your path -(``ftime.cpp`` lives in ``amrex/Tools/Plotfile/``), then -the script will create a file called ``ftime.out`` that lists the name -of the plotfile and the corresponding simulation time. - -Finally, right when the job is submitted, the script will tar up all -of the diagnostic files, ``ftime.out``, submission script, inputs and -probin, and archive them on HPSS. The .tar file is given a name that -contains the date-string to allow multiple archives to co-exist. When -``process.xrb`` is running, it creates a lockfile (called -``process.pid``) that ensures that only one instance of the script is -running at any one time. Sometimes if the machine crashes, the -``process.pid`` file will be left behind, in which case, the script -aborts. Just delete that if you know the script is not running. +* If the ``ftime`` executable is in your path (``ftime.cpp`` lives in + ``amrex/Tools/Plotfile/``), then the script will create a file + called ``ftime.out`` that lists the name of the plotfile and the + corresponding simulation time. + +* As soon as the job starts, the script will tar up all of the + diagnostic files, ``ftime.out``, the submission script, and the inputs, + and archive them on HPSS. The ``.tar`` file is given a name that contains + the date-string to allow multiple archives to co-exist. + +* When ``process.xrb`` is running, it creates a lockfile (called + ``process.pid``) that ensures that only one instance of the script + is running at any one time. + + .. warning:: + + If the job does not terminate normally, the + ``process.pid`` file may be left behind, in which case the script + aborts. Delete that file if you know the script is not running. Jobs in the xfer queue start up quickly. The best approach is to start one as you start your main job (or make it dependent on the main