From 3b1f326f5f173b59df0f0d7dee057e4965f1a50b Mon Sep 17 00:00:00 2001
From: "Eric T. Johnson"
Date: Fri, 6 Dec 2024 16:52:34 -0500
Subject: [PATCH] Use GNU parallel in Kronos archiving script

This now transfers at more than 2 TiB/hour, based on transfers of my 2D
flame_wave runs.
---
 job_scripts/hpss/kronos_process.sh | 180 ++++++++++++++---------------
 1 file changed, 85 insertions(+), 95 deletions(-)

diff --git a/job_scripts/hpss/kronos_process.sh b/job_scripts/hpss/kronos_process.sh
index cec5972..4b1570c 100755
--- a/job_scripts/hpss/kronos_process.sh
+++ b/job_scripts/hpss/kronos_process.sh
@@ -12,7 +12,7 @@ set -e

 jobidfile=process.jobid

-# set the prefix of the plotfiles and checkpoint files (passed to find(1) -name)
+# set the prefix of the plotfiles and checkpoint files (a fnmatch(3) pattern)
 plt_prefix='*plt'
 chk_prefix='*chk'

@@ -53,14 +53,19 @@ echo "$SLURM_JOB_ID" > "$jobidfile"

 # if our process is killed, remove the lock file first
 function cleanup() {
-    echo "process: received signal; removing $jobidfile"
+    echo "process: removing $jobidfile"
     command rm -f "$jobidfile"
     # remove the EXIT handler, since we only want to do this once
     trap - EXIT
     # don't exit, so we can finish the current operation:
     # $jobidfile is checked at the start of each loop iteration in process_files()
 }
-trap cleanup EXIT HUP INT QUIT TERM XCPU
+function cleanup_killed() {
+    echo "process: received signal; stopping"
+    cleanup
+}
+trap cleanup_killed HUP INT QUIT TERM XCPU
+trap cleanup EXIT

 # Number of seconds to sleep before checking again.
 N=60

@@ -100,78 +105,95 @@ fi

 # Subsequent invocations of this routine will skip over any plotfiles or
 # checkpoint files that have a corresponding .processed file.

-
-function process_files
+# this function does all the actual data transfer, and is run in parallel
+function process_single_file
 {
-    if [ ! -f "$jobidfile" ]; then
-        echo "process: $jobidfile has been removed, exiting"
+    local dir=$1
+    local job_slot=$2
+
+    local done_dir
+    # right-hand side is not quoted, as we want it to be treated as a pattern
+    if [[ $dir == ${plt_prefix}* ]]; then
+        done_dir=plotfiles
+    elif [[ $dir == ${chk_prefix}* ]]; then
+        done_dir=checkfiles
+    fi
+
+    if ! [[ -f "$jobidfile" ]]; then
+        echo "$job_slot | process: $jobidfile has been removed, exiting"
         exit
     fi
+    if [[ -d "${dir}" ]]; then

-    # plotfiles
+        # only work on the file if there is not a .processed file in the
+        # main directory or the plotfiles/ directory
+        if ! [[ -f "${dir}.processed" ]] && ! [[ -f "${done_dir}/${dir}.processed" ]]; then

-    # Take all but the final plt file -- we want to ensure they're completely
-    # written to disk. Strip out any tar files that are lying around as well
-    # as pltXXXXX.processed files. We restrict the find command to a depth of
-    # 1 to avoid catching any already-processed files in the plotfiles/
-    # directory
-    mapfile -t pltlist < <(
-        find . -maxdepth 1 -type d -name "${plt_prefix}"'?????' -print | sort
-        find . -maxdepth 1 -type d -name "${plt_prefix}"'??????' -print | sort
-        find . -maxdepth 1 -type d -name "${plt_prefix}"'???????' -print | sort
-    )
+            # do processing
+            printf '%2d | archiving %s to Kronos\n' "$job_slot" "$dir"

-    if (( ${#pltlist[@]} > 1 )); then
-        # Don't process the final plt file
-        unset 'pltlist[-1]'
+            # store the file on Kronos
+            if tar -cvf "${KRONOS_DIR}/${dir}.tar" "${dir}" > "${dir}.log"; then

-        for dir in "${pltlist[@]}"
-        do
-            if ! [[ -f "$jobidfile" ]]; then
[[ -f "$jobidfile" ]]; then - echo "process: $jobidfile has been removed, exiting" - exit - fi - if [[ -d "${dir}" ]]; then + # mark this file as processed so we skip it next time + date > "${dir}.processed" - # only work on the file if there is not a .processed file in the - # main directory or the plotfiles/ directory - if ! [[ -f "${dir}.processed" ]] && ! [[ -f "plotfiles/${dir}.processed" ]]; then + if [[ $done_dir == plotfiles ]]; then + # output the plotfile name and simulation time to ftime.out + # TODO: we should update this file in diag_files_${datestr}.tar + if command -v "${FTIME_EXE}" > /dev/null; then + "${FTIME_EXE}" "${dir}" >> ftime.out + fi + fi - # do processing - echo "archiving ${dir} to Kronos" + # store the log file along with the archive + mv "${dir}.log" "${KRONOS_DIR}" - # store the file on Kronos - if tar -cvf "${KRONOS_DIR}/${dir}.tar" "${dir}" > "${dir}.log"; then + # move the file into the transferred directory + mv "${dir}" "$done_dir" - # mark this file as processed so we skip it next time - date > "${dir}.processed" + # ..and the corresponding .processed file too. + mv "${dir}.processed" "$done_dir" - # output the plotfile name and simulation time to ftime.out - # TODO: we should update this file in diag_files_${datestr}.tar - if command -v "${FTIME_EXE}" > /dev/null; then - "${FTIME_EXE}" "${dir}" >> ftime.out - fi + #if [[ $done_dir == plotfiles ]]; then + # # and visualize it + # runtimevis.py "${done_dir}/${dir}" + #fi - # store the log file along with the archive - mv "${dir}.log" "${KRONOS_DIR}" + fi - # move the plotfile into the plotfiles directory - mv "${dir}" plotfiles/ + printf '%2d | done with %s\n' "$job_slot" "$dir" - # ..and the corresponding .processed file too. - mv "${dir}.processed" plotfiles/ + fi # end test of whether file already processed - # and visualize it - #runtimevis.py "plotfiles/${dir}" + fi # end test of whether file is a directory (as it should be) +} - fi +# these are needed for GNU parallel +export jobidfile plt_prefix chk_prefix FTIME_EXE KRONOS_DIR +export -f process_single_file - fi # end test of whether plotfile already processed +function process_files +{ + if [ ! -f "$jobidfile" ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi - fi # end test of whether plotfile is a directory (as it should be) + # plotfiles - done - fi + # Take all but the final plt file -- we want to ensure they're completely + # written to disk. Strip out any tar files that are lying around as well + # as pltXXXXX.processed files. We restrict the find command to a depth of + # 1 to avoid catching any already-processed files in the plotfiles/ + # directory + mapfile -t pltlist < <( + { + find . -maxdepth 1 -type d -name "${plt_prefix}"'?????' -print | sort; + find . -maxdepth 1 -type d -name "${plt_prefix}"'??????' -print | sort; + find . -maxdepth 1 -type d -name "${plt_prefix}"'???????' -print | sort; + } | head -n-1 # don't process the final plotfile + ) # checkpoint files @@ -182,49 +204,17 @@ function process_files # 1 to avoid catching any already-processed files in the checkfiles/ # directory mapfile -t chklist < <( - find . -maxdepth 2 -type f -path "${chk_prefix}"'?[05]000/Header' -printf '%h\n' | sort - find . -maxdepth 2 -type f -path "${chk_prefix}"'??[05]000/Header' -printf '%h\n' | sort - find . -maxdepth 2 -type f -path "${chk_prefix}"'???[05]000/Header' -printf '%h\n' | sort + { + find . -maxdepth 2 -type f -path "${chk_prefix}"'??000/Header' -printf '%h\n' | sort + find . 
+            find . -maxdepth 2 -type f -path "${chk_prefix}"'????000/Header' -printf '%h\n' | sort
+        } | head -n-1 # don't process the final checkpoint file
     )

-    if (( ${#chklist[@]} > 1 )); then
-        # Don't process the final chk file
-        unset 'chklist[-1]'
-
-        for dir in "${chklist[@]}"
-        do
-            if ! [[ -f "$jobidfile" ]]; then
-                echo "process: $jobidfile has been removed, exiting"
-                exit
-            fi
-            if [[ -d "${dir}" ]]; then
-
-                if ! [[ -f "${dir}.processed" ]] && ! [[ -f "checkfiles/${dir}.processed" ]]; then
-                    echo "archiving ${dir} to Kronos"
-
-                    # store the file on Kronos
-                    if tar -cvf "${KRONOS_DIR}/${dir}.tar" "${dir}" > "${dir}.log"; then
-
-                        # mark this file as processed so we skip it next time
-                        date > "${dir}.processed"
-
-                        # store the log file along with the archive
-                        mv "${dir}.log" "${KRONOS_DIR}"
-
-                        # move the checkpoint file into the checkfiles directory
-                        mv "${dir}" checkfiles/
-
-                        # ..and the corresponding .processed file too.
-                        mv "${dir}.processed" checkfiles/
-
-                    fi
-
-                fi
-
-            fi
-        done
-    fi
+    # do the archiving in parallel
+    # use --line-buffer so the start and finish lines are printed as they happen, not grouped per job
+    parallel --line-buffer -j 32 process_single_file '{}' '{%}' ::: "${pltlist[@]}" "${chklist[@]}"
 }
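
-- 

The patch leans on GNU parallel's ability to run an exported bash function
over a list of arguments: {} is replaced by one input and {%} by the job
slot number (1 up to the -j limit), which is how process_single_file tags
its output lines. A minimal, self-contained sketch of that pattern -- the
archive_one function and its inputs are hypothetical stand-ins, not part of
the script above:

    #!/usr/bin/env bash

    function archive_one {
        local item=$1       # filled from {}: one input argument
        local job_slot=$2   # filled from {%}: which of the -j slots ran this job
        printf '%2d | start %s\n' "$job_slot" "$item"
        sleep $(( RANDOM % 3 ))   # stand-in for the real tar + mv work
        printf '%2d | done  %s\n' "$job_slot" "$item"
    }
    # the function (and any variables it reads) must be exported so the
    # bash subshells that parallel spawns can see them
    export -f archive_one

    # --line-buffer prints each output line as soon as it is complete,
    # rather than holding a job's output back until the job finishes
    parallel --line-buffer -j 4 archive_one '{}' '{%}' ::: plt00100 plt00200 plt00300 chk01000

Run serially this would take the sum of the sleeps; with -j 4 all four
inputs are claimed at once and the start/done lines from different slots
interleave, which is the same effect the -j 32 invocation above has on the
tar transfers.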
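The signal-handling change works the same way in miniature: signals are
routed to cleanup_killed, which logs and delegates to cleanup without
exiting, so the transfer that is currently running can finish; a normal
exit still runs cleanup exactly once, because cleanup drops its own EXIT
trap. A stripped-down sketch of that two-trap structure, with a
hypothetical demo.jobid lock file standing in for process.jobid:

    #!/usr/bin/env bash

    lockfile=demo.jobid
    touch "$lockfile"

    function cleanup() {
        echo "removing $lockfile"
        rm -f "$lockfile"
        trap - EXIT    # drop the EXIT handler so this only runs once
    }
    function cleanup_killed() {
        echo "received signal; stopping"
        cleanup
        # deliberately no exit here: the work loop below notices the
        # missing lock file and stops on its own
    }
    trap cleanup_killed HUP INT QUIT TERM
    trap cleanup EXIT

    # stand-in for the real polling loop; ends early if the lock file
    # disappears after a signal
    for _ in {1..10}; do
        [[ -f "$lockfile" ]] || break
        sleep 1
    done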