From 8d39e97ea48b7a1e095015add58df11e7fe4078a Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 31 Oct 2024 11:54:56 -0500 Subject: [PATCH] build: Use the correct `HPCX_DISTRIBUTION` based on architecture --- .github/workflows/ubuntu-20.yml | 26 ++++---------------------- .github/workflows/ubuntu-22.yml | 8 ++++---- Dockerfile.ubuntu20 | 5 ++++- Dockerfile.ubuntu22 | 5 ++++- 4 files changed, 16 insertions(+), 28 deletions(-) diff --git a/.github/workflows/ubuntu-20.yml b/.github/workflows/ubuntu-20.yml index de6867a..a065fbf 100644 --- a/.github/workflows/ubuntu-20.yml +++ b/.github/workflows/ubuntu-20.yml @@ -7,24 +7,6 @@ on: - .github/workflows/build.yml jobs: - cu118: - uses: ./.github/workflows/build.yml - secrets: - ORG_BUILDKIT_CLIENT_TOKEN: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }} - BUILDKIT_CONSUMER_DOPPLER_PROJECT: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }} - BUILDKIT_CONSUMER_DOPPLER_CONFIG: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_CONFIG }} - BUILDKIT_CONSUMER_ENDPOINT: ${{ secrets.BUILDKIT_CONSUMER_ENDPOINT }} - with: - folder: . - dockerfile: Dockerfile.ubuntu20 - base-image: nvidia/cuda - base-tag: 11.8.0-cudnn8-devel-ubuntu20.04 - cuda-version-minor: "11.8.0" - cuda-version-major: "11.8" - nccl-version: 2.16.5-1 - cuda-samples-version: "11.6" - hpcx-distribution: "hpcx-v2.14-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.16-x86_64" - cu120: uses: ./.github/workflows/build.yml secrets: @@ -41,7 +23,7 @@ jobs: cuda-version-major: "12.0" nccl-version: 2.19.3-1 cuda-samples-version: "12.0" - hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12" cu122: uses: ./.github/workflows/build.yml @@ -59,7 +41,7 @@ jobs: cuda-version-major: "12.2" nccl-version: 2.21.5-1 cuda-samples-version: "12.2" - hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12" cu124: uses: ./.github/workflows/build.yml @@ -77,7 +59,7 @@ jobs: cuda-version-major: "12.4" nccl-version: 2.23.4-1 cuda-samples-version: "12.4" - hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12" cu126: uses: ./.github/workflows/build.yml @@ -95,4 +77,4 @@ jobs: cuda-version-major: "12.6" nccl-version: 2.23.4-1 cuda-samples-version: "12.5" - hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12" diff --git a/.github/workflows/ubuntu-22.yml b/.github/workflows/ubuntu-22.yml index bb837cc..fce596e 100644 --- a/.github/workflows/ubuntu-22.yml +++ b/.github/workflows/ubuntu-22.yml @@ -23,7 +23,7 @@ jobs: cuda-version-major: "12.0" nccl-version: 2.18.5-1 cuda-samples-version: "12.0" - hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12" cu122: uses: ./.github/workflows/build.yml @@ -41,7 +41,7 @@ jobs: cuda-version-major: "12.2" nccl-version: 2.23.4-1 cuda-samples-version: "12.2" - hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12" cu124: uses: ./.github/workflows/build.yml @@ -59,7 +59,7 @@ jobs: cuda-version-major: "12.4" nccl-version: 2.23.4-1 cuda-samples-version: "12.4" - hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12" cu126: uses: ./.github/workflows/build.yml @@ -77,4 +77,4 @@ jobs: cuda-version-major: "12.6" nccl-version: 2.23.4-1 cuda-samples-version: "12.5" - hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" + hpcx-distribution: "hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12" diff --git a/Dockerfile.ubuntu20 b/Dockerfile.ubuntu20 index 3871510..9786c38 100644 --- a/Dockerfile.ubuntu20 +++ b/Dockerfile.ubuntu20 @@ -63,7 +63,10 @@ RUN apt-get -qq update \ # HPC-X # grep + sed is used as a workaround to update hardcoded pkg-config / libtools archive / CMake prefixes -ARG HPCX_DISTRIBUTION="hpcx-v2.14-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.16-x86_64" +ARG HPCX_DISTRIBUTION="hpcx-v2.20-gcc-mlnx_ofed-ubuntu20.04-cuda12" +ARG _HPCX_ARCH="${TARGETARCH/amd64/x86_64}" +ARG _HPCX_ARCH="${_HPCX_ARCH/arm64/aarch64}" +ARG HPCX_DISTRIBUTION="${HPCX_DISTRIBUTION}-${_HPCX_ARCH}" RUN cd /tmp && \ export HPCX_DIR="/opt/hpcx" && \ wget -q -O - https://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \ diff --git a/Dockerfile.ubuntu22 b/Dockerfile.ubuntu22 index f2db082..7eadff2 100644 --- a/Dockerfile.ubuntu22 +++ b/Dockerfile.ubuntu22 @@ -65,7 +65,10 @@ RUN apt-get -qq update \ # HPC-X # grep + sed is used as a workaround to update hardcoded pkg-config / libtools archive / CMake prefixes -ARG HPCX_DISTRIBUTION="hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64" +ARG HPCX_DISTRIBUTION="hpcx-v2.20-gcc-mlnx_ofed-ubuntu22.04-cuda12" +ARG _HPCX_ARCH="${TARGETARCH/amd64/x86_64}" +ARG _HPCX_ARCH="${_HPCX_ARCH/arm64/aarch64}" +ARG HPCX_DISTRIBUTION="${HPCX_DISTRIBUTION}-${_HPCX_ARCH}" RUN cd /tmp && \ export HPCX_DIR="/opt/hpcx" && \ wget -q -O - https://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \