From 89718233fc843136f14f555e674dd32a1e8447d2 Mon Sep 17 00:00:00 2001 From: Siavash Ameli Date: Sat, 11 May 2024 14:52:26 -0700 Subject: [PATCH] fix driver conflict --- ...-docker-manylinux2014_x86_64_cuda_10.2.yml | 6 +- ...-docker-manylinux2014_x86_64_cuda_11.8.yml | 6 +- ...-docker-manylinux2014_x86_64_cuda_12.0.yml | 6 +- ...-docker-manylinux2014_x86_64_cuda_12.3.yml | 6 +- ...docker-manylinux_2_28_x86_64_cuda_12.3.yml | 6 +- README.rst | 69 +++++++++++++++++-- .../manylinux2014_x86_64_cuda_10.2/Dockerfile | 3 +- .../manylinux2014_x86_64_cuda_11.8/Dockerfile | 3 +- .../manylinux2014_x86_64_cuda_12.0/Dockerfile | 3 +- .../manylinux2014_x86_64_cuda_12.3/Dockerfile | 3 +- .../Dockerfile | 2 - .../Dockerfile | 2 - 12 files changed, 82 insertions(+), 33 deletions(-) diff --git a/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_10.2.yml b/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_10.2.yml index dfd495b..d4b5077 100644 --- a/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_10.2.yml +++ b/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_10.2.yml @@ -1,9 +1,9 @@ name: deploy-docker-manylinux2014_x86_64_cuda_10.2 on: - # push: - # branches: - # - main + push: + branches: + - main release: types: - published diff --git a/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_11.8.yml b/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_11.8.yml index 23da71c..409375a 100644 --- a/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_11.8.yml +++ b/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_11.8.yml @@ -1,9 +1,9 @@ name: deploy-docker-manylinux2014_x86_64_cuda_11.8 on: - # push: - # branches: - # - main + push: + branches: + - main release: types: - published diff --git a/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_12.0.yml b/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_12.0.yml index e690a24..d3f28e6 100644 --- 
a/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_12.0.yml +++ b/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_12.0.yml @@ -1,9 +1,9 @@ name: deploy-docker-manylinux2014_x86_64_cuda_12.0 on: - # push: - # branches: - # - main + push: + branches: + - main release: types: - published diff --git a/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_12.3.yml b/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_12.3.yml index b572223..18feff4 100644 --- a/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_12.3.yml +++ b/.github/workflows/deploy-docker-manylinux2014_x86_64_cuda_12.3.yml @@ -1,9 +1,9 @@ name: deploy-docker-manylinux2014_x86_64_cuda_12.3 on: - # push: - # branches: - # - main + push: + branches: + - main release: types: - published diff --git a/.github/workflows/deploy-docker-manylinux_2_28_x86_64_cuda_12.3.yml b/.github/workflows/deploy-docker-manylinux_2_28_x86_64_cuda_12.3.yml index 102121f..55314e0 100644 --- a/.github/workflows/deploy-docker-manylinux_2_28_x86_64_cuda_12.3.yml +++ b/.github/workflows/deploy-docker-manylinux_2_28_x86_64_cuda_12.3.yml @@ -1,9 +1,9 @@ name: deploy-docker-manylinux_2_28_x86_64_cuda_12.3 on: - # push: - # branches: - # - main + push: + branches: + - main release: types: - published diff --git a/README.rst b/README.rst index c84bb9e..df503d7 100644 --- a/README.rst +++ b/README.rst @@ -6,7 +6,7 @@ manylinux-cuda Download Images =============== -Obtain the docker images from Dockerhub for the following CUDA versions: +Obtain the docker images from Docker Hub for the following CUDA versions: X86_64 Architecture ------------------- @@ -75,13 +75,30 @@ To maintain a minimal Docker image size, only the essential compilers and librar * CUDA compiler: ``cuda-crt``, ``cuda-cuobjdump``, ``cuda-cuxxfilt``, ``cuda-nvcc``, ``cuda-nvprune``, ``cuda-nvvm``, ``cuda-cudart``, ``cuda-nvrtc``, ``cuda-opencl``, * CUDA libraries: ``libcublas``, ``libcufft``, ``libcufile``, ``libcurand``, 
``libcusolver``, ``libcusparse``, ``libnpp``, ``libnvjitlink``, ``libnvjpeg`` * CUDA development libraries: ``cuda-cccl``, ``cuda-cudart-devel``, ``cuda-driver-devel``, ``cuda-nvrtc-devel``, ``cuda-opencl-devel``, ``cuda-profiler-api``, ``libcublas-devel``, ``libcufft-devel``, ``libcufile-devel``, ``libcurand-devel``, ``libcusolver-devel``, ``libcusparse-devel``, ``libnpp-devel``, ``libnvjitlink-devel``, ``libnvjpeg-devel`` -* NVIDIA driver: ``nvidia-driver:latest-dkms`` (*see note below* :sup:`1`) If you need additional packages from CUDA toolkit to be included in the images, please feel free to create a `GitHub issue `__. -.. line-block:: +NVIDIA Driver +============= - :sup:`1. NVIDIA driver is not available on manylinux2014 on AARCH64 arch. To use NVIDIA driver on AARCH64 arch, use manylinux_2_xx.` +The Docker images do not include the NVIDIA driver to prevent incompatibility issues with the host system's native driver when used at runtime. + +For users who might need specific components of the NVIDIA driver, such as ``libcuda.so``, to compile their code, the driver can be installed within the container using the following commands based on your image's base distribution: + +* For ``manylinux_2_28`` images: + + :: + + dnf -y install epel-release + dnf -y module install nvidia-driver:latest-dkms + +* For ``manylinux2014`` images: + + :: + + yum install nvidia-driver-latest-dkms + +Note, however, that this step should generally be avoided unless strictly required, as it may lead to compatibility issues between the driver versions in the container and on the host system. If possible, it is recommended to rely on the host system's driver installation when running containers that require GPU access. 
Environment Variables ===================== @@ -122,16 +139,56 @@ The output of the above command is: Cuda compilation tools, release 12.0, V12.0.76 Build cuda_12.3.r12.0/compiler.31968024_0 +Using Host's GPU +================ + +The primary purpose of these Docker images is to build code, such as Python wheels, using the *manylinux* standard. While this process does not require access to the host's GPU, you might want to use them at runtime on the host's GPU, particularly for testing purposes. + +To access the host's GPU device from the container, install the *NVIDIA Container Toolkit* as follows. + +1. Add the NVIDIA package repository: + + :: + + distribution=$(. /etc/os-release;echo $ID$VERSION_ID) + curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - + curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + +2. Install ``nvidia-container-toolkit`` by: + + :: + + sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit + +3. Restart docker to be able to use it: + + :: + + sudo systemctl restart docker + +To use the host's GPU, add ``--gpus all`` to any of the docker commands given before, such as: + +:: + + docker run --gpus all -it sameli/manylinux_2_28_x86_64_cuda_12.3 + +To check the host's NVIDIA driver version, CUDA runtime library version, and list of available GPU devices, run the ``nvidia-smi`` command, such as by: + +:: + + docker run --gpus all sameli/manylinux_2_28_x86_64_cuda_12.3 nvidia-smi + + Troubleshooting =============== -When running the docker containers in Github action, you may encounter this error: +When running the docker containers in GitHub Actions, you may encounter this error: :: no space left on device. 
-To resolve this, try clearing the Github's runner cache before executing the docker container: +To resolve this, try clearing the GitHub's runner cache before executing the docker container: :: diff --git a/docker/manylinux2014_x86_64_cuda_10.2/Dockerfile b/docker/manylinux2014_x86_64_cuda_10.2/Dockerfile index 87a5125..ad2960a 100644 --- a/docker/manylinux2014_x86_64_cuda_10.2/Dockerfile +++ b/docker/manylinux2014_x86_64_cuda_10.2/Dockerfile @@ -31,8 +31,7 @@ RUN yum install -y yum-utils RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo RUN yum -y install cuda-compiler-${VER}.${ARCH} \ cuda-libraries-${VER}.${ARCH} \ - cuda-libraries-dev-${VER}.${ARCH} \ - nvidia-driver-latest-dkms + cuda-libraries-dev-${VER}.${ARCH} RUN yum clean all RUN rm -rf /var/cache/yum/* RUN echo "/usr/local/cuda-10.2/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf diff --git a/docker/manylinux2014_x86_64_cuda_11.8/Dockerfile b/docker/manylinux2014_x86_64_cuda_11.8/Dockerfile index 14ae664..7c9713a 100644 --- a/docker/manylinux2014_x86_64_cuda_11.8/Dockerfile +++ b/docker/manylinux2014_x86_64_cuda_11.8/Dockerfile @@ -31,8 +31,7 @@ RUN yum install -y yum-utils RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo RUN yum -y install cuda-compiler-${VER}.${ARCH} \ cuda-libraries-${VER}.${ARCH} \ - cuda-libraries-devel-${VER}.${ARCH} \ - nvidia-driver-latest-dkms + cuda-libraries-devel-${VER}.${ARCH} RUN yum clean all RUN rm -rf /var/cache/yum/* RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf diff --git a/docker/manylinux2014_x86_64_cuda_12.0/Dockerfile b/docker/manylinux2014_x86_64_cuda_12.0/Dockerfile index 028e366..7b4029d 100644 --- a/docker/manylinux2014_x86_64_cuda_12.0/Dockerfile +++ b/docker/manylinux2014_x86_64_cuda_12.0/Dockerfile @@ -31,8 +31,7 @@ RUN yum install -y yum-utils RUN yum-config-manager --add-repo 
https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo RUN yum -y install cuda-compiler-${VER}.${ARCH} \ cuda-libraries-${VER}.${ARCH} \ - cuda-libraries-devel-${VER}.${ARCH} \ - nvidia-driver-latest-dkms + cuda-libraries-devel-${VER}.${ARCH} RUN yum clean all RUN rm -rf /var/cache/yum/* RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf diff --git a/docker/manylinux2014_x86_64_cuda_12.3/Dockerfile b/docker/manylinux2014_x86_64_cuda_12.3/Dockerfile index 762bc16..d2141b9 100644 --- a/docker/manylinux2014_x86_64_cuda_12.3/Dockerfile +++ b/docker/manylinux2014_x86_64_cuda_12.3/Dockerfile @@ -31,8 +31,7 @@ RUN yum install -y yum-utils RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo RUN yum -y install cuda-compiler-${VER}.${ARCH} \ cuda-libraries-${VER}.${ARCH} \ - cuda-libraries-devel-${VER}.${ARCH} \ - nvidia-driver-latest-dkms + cuda-libraries-devel-${VER}.${ARCH} RUN yum clean all RUN rm -rf /var/cache/yum/* RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf diff --git a/docker/manylinux_2_28_aarch64_cuda_12.3/Dockerfile b/docker/manylinux_2_28_aarch64_cuda_12.3/Dockerfile index 54e2e24..a53d2ac 100644 --- a/docker/manylinux_2_28_aarch64_cuda_12.3/Dockerfile +++ b/docker/manylinux_2_28_aarch64_cuda_12.3/Dockerfile @@ -28,11 +28,9 @@ ARG VER="12-3" ARG ARCH="aarch64" RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo -RUN dnf -y install epel-release RUN dnf -y install cuda-compiler-${VER}.${ARCH} \ cuda-libraries-${VER}.${ARCH} \ cuda-libraries-devel-${VER}.${ARCH} -RUN dnf -y module install nvidia-driver:latest-dkms RUN dnf clean all RUN rm -rf /var/cache/dnf/* RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf diff --git a/docker/manylinux_2_28_x86_64_cuda_12.3/Dockerfile b/docker/manylinux_2_28_x86_64_cuda_12.3/Dockerfile 
index c47e947..bc1dfd9 100644 --- a/docker/manylinux_2_28_x86_64_cuda_12.3/Dockerfile +++ b/docker/manylinux_2_28_x86_64_cuda_12.3/Dockerfile @@ -28,11 +28,9 @@ ARG VER="12-3" ARG ARCH="x86_64" RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo -RUN dnf -y install epel-release RUN dnf -y install cuda-compiler-${VER}.${ARCH} \ cuda-libraries-${VER}.${ARCH} \ cuda-libraries-devel-${VER}.${ARCH} -RUN dnf -y module install nvidia-driver:latest-dkms RUN dnf clean all RUN rm -rf /var/cache/dnf/* RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf