From 8e2e3f3b40b50fcc2c804085d7fcd7ddb2ff4851 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 5 Mar 2024 14:14:21 -0500 Subject: [PATCH] feat: Update NCCL, CUDA, cuDNN, and HPC-X --- .github/workflows/ubuntu-20.yml | 32 ++++++++++++++++---------------- .github/workflows/ubuntu-22.yml | 14 +++++++------- Dockerfile.ubuntu20 | 2 +- Dockerfile.ubuntu22 | 4 ++-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ubuntu-20.yml b/.github/workflows/ubuntu-20.yml index 54f5ae0..306f72d 100644 --- a/.github/workflows/ubuntu-20.yml +++ b/.github/workflows/ubuntu-20.yml @@ -31,7 +31,7 @@ jobs: cuda-version-major: "12.0" nccl-version: 2.19.3-1 cuda-samples-version: "12.0" - hpcx-distribution: "hpcx-v2.16-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64" + hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" cu121: uses: ./.github/workflows/build.yml @@ -44,7 +44,7 @@ jobs: cuda-version-major: "12.1" nccl-version: 2.18.3-1 cuda-samples-version: "12.1" - hpcx-distribution: "hpcx-v2.16-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64" + hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" cu122: uses: ./.github/workflows/build.yml @@ -55,19 +55,19 @@ jobs: base-tag: 12.2.2-cudnn8-devel-ubuntu20.04 cuda-version-minor: "12.2.2" cuda-version-major: "12.2" - nccl-version: 2.19.3-1 + nccl-version: 2.20.3-1 cuda-samples-version: "12.2" - hpcx-distribution: "hpcx-v2.16-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64" + hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" -# cu123: -# uses: ./.github/workflows/build.yml -# with: -# folder: . -# dockerfile: Dockerfile.ubuntu20 -# base-image: nvidia/cuda -# base-tag: 12.3.0-cudnn8-devel-ubuntu20.04 -# cuda-version-minor: "12.3.0" -# cuda-version-major: "12.3" -# nccl-version: 2.19.3-1 -# cuda-samples-version: "12.3" -# hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" + cu123: + uses: ./.github/workflows/build.yml + with: + folder: . + dockerfile: Dockerfile.ubuntu20 + base-image: nvidia/cuda + base-tag: 12.3.2-cudnn9-devel-ubuntu20.04 + cuda-version-minor: "12.3.2" + cuda-version-major: "12.3" + nccl-version: 2.20.3-1 + cuda-samples-version: "12.3" + hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" diff --git a/.github/workflows/ubuntu-22.yml b/.github/workflows/ubuntu-22.yml index 62f688a..d2671c1 100644 --- a/.github/workflows/ubuntu-22.yml +++ b/.github/workflows/ubuntu-22.yml @@ -18,7 +18,7 @@ jobs: cuda-version-major: "12.0" nccl-version: 2.18.5-1 cuda-samples-version: "12.0" - hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" cu121: uses: ./.github/workflows/build.yml @@ -31,7 +31,7 @@ jobs: cuda-version-major: "12.1" nccl-version: 2.18.3-1 cuda-samples-version: "12.1" - hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" cu122: uses: ./.github/workflows/build.yml @@ -44,7 +44,7 @@ jobs: cuda-version-major: "12.2" nccl-version: 2.19.3-1 cuda-samples-version: "12.2" - hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" cu123: uses: ./.github/workflows/build.yml @@ -52,9 +52,9 @@ jobs: folder: . dockerfile: Dockerfile.ubuntu22 base-image: nvidia/cuda - base-tag: 12.3.1-devel-ubuntu22.04 - cuda-version-minor: "12.3.1" + base-tag: 12.3.2-cudnn9-devel-ubuntu22.04 + cuda-version-minor: "12.3.2" cuda-version-major: "12.3" - nccl-version: 2.19.3-1 + nccl-version: 2.20.3-1 cuda-samples-version: "12.3" - hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" diff --git a/Dockerfile.ubuntu20 b/Dockerfile.ubuntu20 index 6b0c478..51b326f 100644 --- a/Dockerfile.ubuntu20 +++ b/Dockerfile.ubuntu20 @@ -63,7 +63,7 @@ RUN mkdir /tmp/build && \ ARG HPCX_DISTRIBUTION="hpcx-v2.14-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.16-x86_64" RUN cd /tmp && \ export HPCX_DIR="/opt/hpcx" && \ - wget -q -O - http://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \ + wget -q -O - https://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \ grep -IrlF "/build-result/${HPCX_DISTRIBUTION}" ${HPCX_DISTRIBUTION} | xargs -rd'\n' sed -i -e "s:/build-result/${HPCX_DISTRIBUTION}:${HPCX_DIR}:g" && \ mv ${HPCX_DISTRIBUTION} ${HPCX_DIR} diff --git a/Dockerfile.ubuntu22 b/Dockerfile.ubuntu22 index 9f8f871..126188d 100644 --- a/Dockerfile.ubuntu22 +++ b/Dockerfile.ubuntu22 @@ -62,10 +62,10 @@ RUN mkdir /tmp/build && \ # HPC-X # grep + sed is used as a workaround to update hardcoded pkg-config / libtools archive / CMake prefixes -ARG HPCX_DISTRIBUTION="hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" +ARG HPCX_DISTRIBUTION="hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" RUN cd /tmp && \ export HPCX_DIR="/opt/hpcx" && \ - wget -q -O - http://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \ + wget -q -O - https://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \ grep -IrlF "/build-result/${HPCX_DISTRIBUTION}" ${HPCX_DISTRIBUTION} | xargs -rd'\n' sed -i -e "s:/build-result/${HPCX_DISTRIBUTION}:${HPCX_DIR}:g" && \ mv ${HPCX_DISTRIBUTION} ${HPCX_DIR}