From 6912d802bddcb21540e7a3e540815fd48f3360d5 Mon Sep 17 00:00:00 2001 From: Taylor McKinnon Date: Mon, 5 Feb 2024 15:08:56 -0800 Subject: [PATCH] add nvidia-driver image --- images/nvidia-driver/Dockerfile | 74 +++++ images/nvidia-driver/empty | 0 images/nvidia-driver/nvidia-driver | 474 +++++++++++++++++++++++++++++ 3 files changed, 548 insertions(+) create mode 100644 images/nvidia-driver/Dockerfile create mode 100644 images/nvidia-driver/empty create mode 100644 images/nvidia-driver/nvidia-driver diff --git a/images/nvidia-driver/Dockerfile b/images/nvidia-driver/Dockerfile new file mode 100644 index 0000000..e057613 --- /dev/null +++ b/images/nvidia-driver/Dockerfile @@ -0,0 +1,74 @@ +FROM nvidia/cuda:11.4.3-base-ubuntu18.04 + +ENV NVIDIA_VISIBLE_DEVICES=void + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +RUN dpkg --add-architecture i386 && \ + apt-get update && apt-get install -y --no-install-recommends \ + apt-transport-https \ + apt-utils \ + bc \ + binutils \ + build-essential \ + ca-certificates \ + curl \ + gnupg2 \ + jq \ + kmod \ + libc6:i386 \ + libelf-dev \ + libssl-dev \ + module-init-tools \ + software-properties-common && \ + rm -rf /var/lib/apt/lists/* + +RUN echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ bionic main" > /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ bionic-updates main" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ bionic-security main" >> /etc/apt/sources.list && \ + usermod -o -u 0 -g 0 _apt + +RUN curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \ + curl -fsSL -o /usr/local/bin/extract-vmlinux https://raw.githubusercontent.com/torvalds/linux/master/scripts/extract-vmlinux && \ + chmod +x /usr/local/bin/donkey /usr/local/bin/extract-vmlinux + +ARG BASE_URL=http://us.download.nvidia.com/XFree86/Linux-x86_64 +ARG DRIVER_VERSION=535.154.05 +ENV DRIVER_VERSION=$DRIVER_VERSION +ENV DEBIAN_FRONTEND=noninteractive + +# Install the userspace components and copy the kernel module sources. +RUN cd /tmp && \ + curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run && \ + sh NVIDIA-Linux-x86_64-$DRIVER_VERSION.run -x && \ + cd NVIDIA-Linux-x86_64-$DRIVER_VERSION* && \ + ./nvidia-installer --silent \ + --no-kernel-module \ + --install-compat32-libs \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-rpms \ + --no-backup \ + --no-check-for-alternate-installs \ + --no-libglx-indirect \ + --no-install-libglvnd \ + --x-prefix=/tmp/null \ + --x-module-path=/tmp/null \ + --x-library-path=/tmp/null \ + --x-sysconfig-path=/tmp/null && \ + mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ + mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest && \ + rm -rf /tmp/* + +COPY nvidia-driver /usr/local/bin + +WORKDIR /usr/src/nvidia-$DRIVER_VERSION + +ARG PUBLIC_KEY=empty +COPY ${PUBLIC_KEY} kernel/pubkey.x509 + +# Add NGC DL license from the CUDA image +RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE + +ENTRYPOINT ["nvidia-driver"] diff --git a/images/nvidia-driver/empty b/images/nvidia-driver/empty new file mode 100644 index 0000000..e69de29 diff --git a/images/nvidia-driver/nvidia-driver b/images/nvidia-driver/nvidia-driver new file mode 100644 index 0000000..15702bf --- /dev/null +++ b/images/nvidia-driver/nvidia-driver @@ -0,0 +1,474 @@ +#! /bin/bash +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. + +set -eu + +RUN_DIR=/run/nvidia +PID_FILE=${RUN_DIR}/${0##*/}.pid +DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"} +NVIDIA_BINUITLS_DIR=/opt/nvidia/binutils +NVIDIA_KMODS_DIR=/opt/nvidia/${DRIVER_VERSION} + +export COREOS_RELEASE_CHANNEL=stable +COREOS_RELEASE_BOARD=amd64-usr +COREOS_ALL_RELEASES="https://www.flatcar-linux.org/releases-json/releases-${COREOS_RELEASE_CHANNEL}.json" + +export DEBIAN_FRONTEND=noninteractive + +# The developement environment is required to access gcc and binutils +# binutils is particularly required even if we have precompiled kernel interfaces +# as the ld version needs to match the version used to build the linked kernel interfaces later +# Note that container images can only contain precompiled (but not linked) kernel modules +_install_development_env() { + + echo "Installing the Flatcar development environment on the filesystem $PWD..." + + # Get the flatcar development environment for a given kernel version. + # The environment is mounted on a loop device and then we chroot into + # the mount to build the NVIDIA kernel driver interface. + # The resulting module files and module dependencies are then archived. + local latest_release + latest_release=$(echo "${COREOS_RELEASE_VERSION}" | awk '{ print $1 }') + local dev_image="flatcar_developer_container.bin" + local dev_image_url="https://${COREOS_RELEASE_CHANNEL}.release.flatcar-linux.net/${COREOS_RELEASE_BOARD}/${latest_release}/${dev_image}.bz2" + + curl -Ls "${dev_image_url}" | bzip2 -dq > "${dev_image}" + local sector_size + sector_size=$(fdisk -l "${dev_image}" | grep "^Sector size" | awk '{ print $4 }') + local sector_start + sector_start=$(fdisk -l "${dev_image}" | grep "^${dev_image}*" | awk '{ print $2 }') + local offset_limit=$((sector_start * sector_size)) + + mkdir -p /mnt/coreos + local loop_dev="/dev/loop0" + _exec mount -o loop=${loop_dev},offset=${offset_limit} ${dev_image} /mnt/coreos + + # add more space to the device + local mount_path + mount_path=$(losetup --list --noheadings -O BACK-FILE "${loop_dev}") + dd if=/dev/zero bs=1MiB of="${mount_path}" conv=notrunc oflag=append count=3000 + losetup -c ${loop_dev} + resize2fs ${loop_dev} + + # Version.txt contains some pre-defined environment variables + # that we will use when building the kernel modules + curl -fOSsL https://${COREOS_RELEASE_CHANNEL}.release.flatcar-linux.net/${COREOS_RELEASE_BOARD}/${latest_release}/version.txt + cp version.txt /usr/src + + # Prepare the mount point for the chroot + cp --dereference /etc/resolv.conf /mnt/coreos/etc/ + _exec mount --types proc /proc /mnt/coreos/proc + _exec mount --rbind /sys /mnt/coreos/sys + _exec mount --make-rslave /mnt/coreos/sys + _exec mount --rbind /dev /mnt/coreos/dev + _exec mount --make-rslave /mnt/coreos/dev + mkdir -p /mnt/coreos/usr/src + _exec mount --rbind /usr/src /mnt/coreos/usr/src + + # Archive the binutils since we need the linker for re-linking the modules + if [ ! -e ${NVIDIA_BINUITLS_DIR} ]; then + mkdir -p ${NVIDIA_BINUITLS_DIR}/libs + mkdir -p ${NVIDIA_BINUITLS_DIR}/bin + fi + local binutils_ver=$(ls -d /mnt/coreos/usr/lib64/binutils/x86_64-cros-linux-gnu/*) + binutils_ver=${binutils_ver##*/} + cp -r /mnt/coreos/usr/x86_64-cros-linux-gnu/binutils-bin/${binutils_ver}/* ${NVIDIA_BINUITLS_DIR}/bin/ + cp -r /mnt/coreos/usr/lib64/binutils/x86_64-cros-linux-gnu/${binutils_ver}/* ${NVIDIA_BINUITLS_DIR}/libs/ + +} + +_cleanup_development_env() { + echo "Cleaning up the development environment..." + _exec umount -l /mnt/coreos + rm -rf /mnt/coreos + rm -f "/usr/src/nvidia-${DRIVER_VERSION}/flatcar_developer_container.bin" +} + +# Install the kernel modules header/builtin/order files and generate the kernel version string. +# The kernel sources are installed on the loop mount by following the official documentation at +# https://kinvolk.io/docs/flatcar-container-linux/latest/reference/developer-guides/kernel-modules/ +# We also ensure that the kernel version string (/proc/version) is written so it can be +# archived along with the precompiled kernel interfaces. +_install_prerequisites() ( + + rm -rf "/lib/modules/${KERNEL_VERSION}" + rm -rf /usr/src/linux* + + # Gather the tag for the release matching the current kernel version. + local kernel + kernel=$(echo "${KERNEL_VERSION}" | cut -d "-" -f1) + export COREOS_RELEASE_VERSION=$(curl -Ls "${COREOS_ALL_RELEASES}" | jq -r --arg kernel_ver "${kernel}" 'to_entries[] | select ((.value.major_software.kernel[0] == $kernel_ver) and (.key != "current")) | .key') + + _install_development_env + echo "Installing the Flatcar kernel sources into the development environment..." + + cat <<'EOF' | chroot /mnt/coreos /bin/bash +KERNEL_VERSION=$(cat /usr/src/linux/include/config/kernel.release || ls /lib/modules) +KERNEL_STRING=$(echo "${KERNEL_VERSION}" | cut -d "-" -f1) +echo "Installing kernel sources for kernel version ${KERNEL_VERSION}..." +source /etc/os-release +[ $(echo ${VERSION_ID//./ } | awk '{print $1}') -lt 2346 ] && \ + echo "Fixing path for flatcar kernel sources..." && \ + sed -i -e "s;http://builds.developer.core-os.net/boards/;https://storage.googleapis.com/flatcar-jenkins/boards/;g" /etc/portage/make.conf +export $(cat /usr/src/version.txt | xargs) +emerge-gitclone \ +&& export OVERLAY_VERSION="${COREOS_RELEASE_CHANNEL}-${COREOS_RELEASE_VERSION}" \ +&& export PORTAGE_VERSION="${COREOS_RELEASE_CHANNEL}-${COREOS_RELEASE_VERSION}" \ +&& git -C /var/lib/portage/coreos-overlay checkout "$OVERLAY_VERSION" \ +&& git -C /var/lib/portage/portage-stable checkout "$PORTAGE_VERSION" +emerge -gKq coreos-sources \ +|| echo "failed to download binaries, fallback build from source:" \ +&& emerge -q --jobs 4 --load-average 4 coreos-sources +# Ensure that the symbolic link points to the right kernel version +rm -f /usr/src/linux +ln -s /usr/src/linux-${KERNEL_STRING}-coreos /usr/src/linux +cp /lib/modules/${KERNEL_VERSION}/build/.config /usr/src/linux +make -C /usr/src/linux modules_prepare > /dev/null +cp /lib/modules/${KERNEL_VERSION}/build/Module.symvers /usr/src/linux/ +depmod ${KERNEL_VERSION} +mkdir -p /usr/src/linux/proc +cp /proc/version /usr/src/linux/proc/ + +echo "Compiling NVIDIA driver kernel modules with $(gcc --version | head -1)..." +cp /lib/modules/${KERNEL_VERSION}/build/scripts/module.lds /usr/src/nvidia-*/kernel +cd /usr/src/nvidia-*/kernel +export IGNORE_CC_MISMATCH=1 +export IGNORE_MISSING_MODULE_SYMVERS=1 +make -s -j SYSSRC=/lib/modules/${KERNEL_VERSION}/source nv-linux.o nv-modeset-linux.o > /dev/null +EOF + mkdir -p /lib/modules/${KERNEL_VERSION} + cp -r /mnt/coreos/lib/modules/${KERNEL_VERSION}/* /lib/modules/${KERNEL_VERSION}/ + depmod "${KERNEL_VERSION}" + + _cleanup_development_env +) + +# Check if the kernel version requires a new precompiled driver packages. +_kernel_requires_package() { + local proc_mount_arg="" + + echo "Checking NVIDIA driver packages..." + cd "/usr/src/nvidia-${DRIVER_VERSION}/kernel" + + if [ "${KERNEL_VERSION}" != "$(uname -r)" ]; then + proc_mount_arg="--proc-mount-point /usr/src/linux/proc" + fi + # Note that mkprecompiled does not handle empty double-quotes; + # so don't wrap variables in double-quotes + # The precompiled directory structure is laid out as one directory per KERNEL_VERSION + # and it includes the packed kernel interfaces and other files. So we use the following search pattern: + for pkg_name in $(ls -d -1 precompiled/*/nvidia-modules* 2> /dev/null); do + if ! ../mkprecompiled --match ${pkg_name} ${proc_mount_arg} > /dev/null; then + echo "Found NVIDIA driver package ${pkg_name##*/}" + return 1 + fi + done + return 0 +} + +# Compile the kernel modules, optionally sign them, and generate a precompiled package for use later. +_create_driver_package() ( + local pkg_name="nvidia-modules-${KERNEL_VERSION%%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}" + local nvidia_sign_args="" + local nvidia_modeset_sign_args="" + local nvidia_uvm_sign_args="" + + cd "/usr/src/nvidia-${DRIVER_VERSION}/kernel" + + if [ -n "${PRIVATE_KEY}" ]; then + echo "Signing NVIDIA driver kernel modules..." + donkey get "${PRIVATE_KEY}" sh -c "PATH=${PATH}:/usr/src/linux-headers-${KERNEL_VERSION}/scripts && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko" + nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign" + nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign" + nvidia_uvm_sign_args="--signed" + fi + + # Note that mkprecompiled does not handle empty double-quotes; + # so don't wrap variables in double-quotes + echo "Building NVIDIA driver package ${pkg_name}..." + ../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION} \ + --proc-mount-point /usr/src/linux/proc \ + --driver-version ${DRIVER_VERSION} \ + --kernel-interface nv-linux.o \ + --linked-module-name nvidia.ko \ + --core-object-name nvidia/nv-kernel.o_binary \ + ${nvidia_sign_args} \ + --target-directory . \ + --kernel-interface nv-modeset-linux.o \ + --linked-module-name nvidia-modeset.ko \ + --core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \ + ${nvidia_modeset_sign_args} \ + --target-directory . \ + --kernel-module nvidia-uvm.ko \ + ${nvidia_uvm_sign_args} \ + --target-directory . + # Archive the module files, so we can re-link later + # The archive is created one per kernel version, including the package created above + # and the corresponding module files. + # Finally, delete any linked kernel modules (.ko) as they cannot be part of the container image + local archive_dir=precompiled/${KERNEL_VERSION} + mkdir -p "${archive_dir}" + mv "${pkg_name}" "${archive_dir}" + local kmods="nvidia nvidia-uvm nvidia-modeset nvidia-drm" + for m in $kmods; do + cp ${m}.mod.o "${archive_dir}" + rm -f ${m}.ko + done + cp modules.order "${archive_dir}" + echo "Packaged precompiled driver into $PWD/${archive_dir}" +) + +# Load the kernel modules and start persistenced. +_load_driver() { + echo "Loading NVIDIA driver kernel modules..." + modprobe -d ${NVIDIA_KMODS_DIR} -a nvidia nvidia-uvm nvidia-modeset + + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode +} + +# Stop persistenced and unload the kernel modules if they are currently loaded. +_unload_driver() { + local rmmod_args=() + local nvidia_deps=0 + local nvidia_refs=0 + local nvidia_uvm_refs=0 + local nvidia_modeset_refs=0 + + echo "Stopping NVIDIA persistence daemon..." + if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then + local pid + pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 10); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ "$i" -eq 10 ]; then + echo "Could not stop NVIDIA persistence daemon" >&2 + return 1 + fi + fi + + echo "Unloading NVIDIA driver kernel modules..." + if [ -f /sys/module/nvidia_modeset/refcnt ]; then + nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt) + rmmod_args+=("nvidia-modeset") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_uvm/refcnt ]; then + nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt) + rmmod_args+=("nvidia-uvm") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia/refcnt ]; then + nvidia_refs=$(< /sys/module/nvidia/refcnt) + rmmod_args+=("nvidia") + fi + if [ "${nvidia_refs}" -gt "${nvidia_deps}" ] || [ "${nvidia_uvm_refs}" -gt 0 ] || \ + [ "${nvidia_modeset_refs}" -gt 0 ]; then + echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2 + return 1 + fi + + if [ ${#rmmod_args[@]} -gt 0 ]; then + rmmod "${rmmod_args[@]}" + fi + return 0 +} + +# Link and install the kernel modules from a precompiled package using the nvidia-installer. +_install_driver() { + local install_args=() + + echo "Installing NVIDIA driver kernel modules..." + local base_dir=/usr/src/nvidia-${DRIVER_VERSION}/kernel + cd "${base_dir}" + + if [ "${ACCEPT_LICENSE}" = "yes" ]; then + install_args+=("--accept-license") + fi + # Prepare the final destination of the kernel modules and a staging directory + # We will also setup some variables needed in the chroot + # The precompiled kernel interfaces are first unpacked, and then re-linked using + # the same linker that was used to build the precompiled modules + mkdir -p ${NVIDIA_KMODS_DIR}/lib/modules/${KERNEL_VERSION} + local staging_dir=${base_dir}/staging/${KERNEL_VERSION} + mkdir -p ${staging_dir} + + local pkg_name="nvidia-modules-${KERNEL_VERSION%%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}" + local archive_dir=${base_dir}/precompiled/${KERNEL_VERSION} + # Note that mkprecompiled does not handle empty double-quotes; + # so don't wrap variables in double-quotes + ../mkprecompiled --unpack ${archive_dir}/${pkg_name} -o ${staging_dir} + cp ${archive_dir}/*.mod.o ${staging_dir} + + # Re-link the modules by using the binutils we archived from the development + # environment. + local old_ld_path=$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=${NVIDIA_BINUITLS_DIR}/libs:$LD_LIBRARY_PATH + cd ${staging_dir} + cp ../../nvidia/nv-kernel.o_binary . + cp ../../nvidia-modeset/nv-modeset-kernel.o_binary . + echo "Re-linking NVIDIA driver kernel modules..." + ${NVIDIA_BINUITLS_DIR}/bin/ld -d -r -T ${base_dir}/module.lds -o nvidia.ko ./nv-linux.o ./nv-kernel.o_binary ./nvidia.mod.o + ${NVIDIA_BINUITLS_DIR}/bin/ld -d -r -T ${base_dir}/module.lds -o nvidia-modeset.ko ./nv-modeset-linux.o ./nv-modeset-kernel.o_binary nvidia-modeset.mod.o + export LD_LIBRARY_PATH=${old_ld_path} + + + # Copy the kernel modules to their final destination and generate the module dependencies + cp ${staging_dir}/*.ko ${NVIDIA_KMODS_DIR}/lib/modules/${KERNEL_VERSION} + cp ${archive_dir}/modules.order ${NVIDIA_KMODS_DIR}/lib/modules/${KERNEL_VERSION} + depmod -b ${NVIDIA_KMODS_DIR} ${KERNEL_VERSION} +} + +# Execute binaries by root owning them first +_exec() { + exec_bin_path=$(command -v "$1") + exec_user=$(stat -c "%u" "${exec_bin_path}") + exec_group=$(stat -c "%g" "${exec_bin_path}") + if [[ "${exec_user}" != "0" || "${exec_group}" != "0" ]]; then + chown 0:0 "${exec_bin_path}" + "$@" + chown "${exec_user}":"${exec_group}" "${exec_bin_path}" + else + "$@" + fi +} + +# Mount the driver rootfs into the run directory with the exception of sysfs. +_mount_rootfs() { + echo "Mounting NVIDIA driver rootfs..." + _exec mount --make-runbindable /sys + _exec mount --make-private /sys + mkdir -p ${RUN_DIR}/driver + _exec mount --rbind / ${RUN_DIR}/driver +} + +# Unmount the driver rootfs from the run directory. +_unmount_rootfs() { + echo "Unmounting NVIDIA driver rootfs..." + if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then + _exec umount -l -R ${RUN_DIR}/driver + fi +} + +_shutdown() { + if _unload_driver; then + _unmount_rootfs + rm -f "${PID_FILE}" + return 0 + fi + return 1 +} + +init() { + printf "\\n========== NVIDIA Software Installer ==========\\n" + printf "Starting installation of NVIDIA driver version %s for Linux kernel version %s\\n" "${DRIVER_VERSION}" "${KERNEL_VERSION}" + + exec 3> "${PID_FILE}" + if ! flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + _unload_driver || exit 1 + _unmount_rootfs + + if _kernel_requires_package; then + _install_prerequisites + _create_driver_package + fi + + _install_driver + _load_driver + _mount_rootfs + + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} + +update() { + exec 3>&2 + if exec 2> /dev/null 4< "${PID_FILE}"; then + if ! flock -n 4 && read -r pid <&4 && kill -0 "${pid}"; then + exec > >(tee -a "/proc/${pid}/fd/1") + exec 2> >(tee -a "/proc/${pid}/fd/2" >&3) + else + exec 2>&3 + fi + exec 4>&- + fi + exec 3>&- + + printf "\\n========== NVIDIA Software Updater ==========\\n" + printf "Starting update of NVIDIA driver version %s for Linux kernel version %s\\n" "${DRIVER_VERSION}" "${KERNEL_VERSION}" + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + + _install_prerequisites + + if _kernel_requires_package; then + _create_driver_package + fi + + echo "Done" + exit 0 +} + +usage() { + cat >&2 <