Skip to content

Commit

Permalink
Update ngen_cal entrypoint.
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbartel committed Aug 2, 2023
1 parent 6f4c13f commit 0c0fcdb
Showing 1 changed file with 48 additions and 12 deletions.
60 changes: 48 additions & 12 deletions docker/main/ngen/ngen_cal_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ while [ ${#} -gt 0 ]; do
WORKER_INDEX="${2:?}"
shift
;;
--calibration-config-file)
CALIBRATION_CONFIG_BASENAME="${2:?}"
shift
;;
esac
shift
done
Expand All @@ -51,10 +55,18 @@ cd ${OUTPUT_DATASET_DIR:?Output dataset directory not defined}

start_calibration() {
# Start ngen calibration
echo "$(print_date) Starting serial ngen calibration"
if [ -n "${PARTITION_DATASET_DIR:-}" ]; then
echo "$(print_date) Starting ngen calibration with parallel ngen execution"
else
echo "$(print_date) Starting ngen calibration with serial ngen execution"
fi

# Find and use copy of config in output dataset
CALIBRATION_CONFIG_FILE=$(find ${OUTPUT_DATASET_DIR:?} -name "*.yaml" -maxdepth 1 | head -1)
if [ -n "${CALIBRATION_CONFIG_BASENAME:-}" ]; then
CALIBRATION_CONFIG_FILE=$(find ${OUTPUT_DATASET_DIR:?} -name "${CALIBRATION_CONFIG_BASENAME}" -maxdepth 1 | head -1)
else
CALIBRATION_CONFIG_FILE=$(find ${OUTPUT_DATASET_DIR:?} -name "*.yaml" -maxdepth 1 | head -1)
fi

if [ -z "${CALIBRATION_CONFIG_FILE}" ]; then
echo "Error: NGEN calibration yaml file not found" 2>&1
Expand All @@ -71,16 +83,40 @@ start_calibration() {
return ${NGEN_RETURN}
}

# Copy config files to output dataset for record keeping, but only from the "main" worker node
# We can allow worker index to not be supplied when executing serially, so apply default substitution
if [ ${WORKER_INDEX:-0} -eq 0 ]; then
# TODO: perform copy of configs to output dataset outside of image (in service) for better performance
cp -a ${CONFIG_DATASET_DIR:?Config dataset directory not defined}/. ${OUTPUT_DATASET_DIR:?}
if [ -n "${PARTITION_DATASET_DIR:-}" ]; then
# Also, when partition config present, copy that for record keeping
# We can allow worker index to not be supplied when executing serially
if [ "${WORKER_INDEX:-0}" = "0" ]; then
if [ "$(whoami)" = "${MPI_USER:?MPI user not defined}" ]; then
# This will only have an effect when running with multiple MPI nodes, so it's safe to have even in serial exec
trap close_remote_workers EXIT
# Have "main" (potentially only) worker copy config files to output dataset for record keeping
# TODO: perform copy of configs to output dataset outside of image (in service) for better performance
cp -a ${PARTITION_DATASET_DIR}/. ${OUTPUT_DATASET_DIR:?}
cp -a ${CONFIG_DATASET_DIR:?Config dataset directory not defined}/. ${OUTPUT_DATASET_DIR:?}
if [ -n "${PARTITION_DATASET_DIR:-}" ]; then
# Include partition config dataset too if appropriate
# TODO: perform copy of configs to output dataset outside of image (in service) for better performance
cp -a ${PARTITION_DATASET_DIR}/. ${OUTPUT_DATASET_DIR:?}
fi

# Run the same function to execute ngen_cal (its config will handle whether MPI is used internally)
start_calibration
else
# Start SSHD on the main worker if we have an MPI job
if [ -n "${PARTITION_DATASET_DIR:-}" ]; then
echo "$(print_date) Starting SSH daemon on main worker"
/usr/sbin/sshd -D &
_SSH_D_PID="$!"

trap cleanup_sshuser_exit EXIT
fi

# Make sure we run ngen/ngen-cal as our MPI_USER
echo "$(print_date) Running exec script as '${MPI_USER:?}'"
# Do this by just re-running this script with the same args, but as the other user
# The script will modify its behavior as needed depending on current user (see associated "if" for this "else")
_EXEC_STRING="${0} ${@}"
su ${MPI_USER:?} --session-command "${_EXEC_STRING}"
#time su ${MPI_USER:?} --session-command "${_EXEC_STRING}"
fi
else
run_secondary_mpi_ssh_worker_node
fi

start_calibration

0 comments on commit 0c0fcdb

Please sign in to comment.