Skip to content

Commit

Permalink
Merge pull request #31 from coreweave/eta/nccl-cuda-cudnn-updates
Browse files (browse the repository at this point in the history)
feat: Update NCCL, CUDA, cuDNN, and HPC-X
  • Loading branch information
wbrown authored Mar 5, 2024
2 parents e676c84 + 8e2e3f3 commit 868dc3d
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 26 deletions.
32 changes: 16 additions & 16 deletions .github/workflows/ubuntu-20.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
cuda-version-major: "12.0"
nccl-version: 2.19.3-1
cuda-samples-version: "12.0"
hpcx-distribution: "hpcx-v2.16-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64"

cu121:
uses: ./.github/workflows/build.yml
Expand All @@ -44,7 +44,7 @@ jobs:
cuda-version-major: "12.1"
nccl-version: 2.18.3-1
cuda-samples-version: "12.1"
hpcx-distribution: "hpcx-v2.16-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64"

cu122:
uses: ./.github/workflows/build.yml
Expand All @@ -55,19 +55,19 @@ jobs:
base-tag: 12.2.2-cudnn8-devel-ubuntu20.04
cuda-version-minor: "12.2.2"
cuda-version-major: "12.2"
nccl-version: 2.19.3-1
nccl-version: 2.20.3-1
cuda-samples-version: "12.2"
hpcx-distribution: "hpcx-v2.16-gcc-mlnx_ofed-ubuntu20.04-cuda12-gdrcopy2-nccl2.18-x86_64"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64"

# cu123:
# uses: ./.github/workflows/build.yml
# with:
# folder: .
# dockerfile: Dockerfile.ubuntu20
# base-image: nvidia/cuda
# base-tag: 12.3.0-cudnn8-devel-ubuntu20.04
# cuda-version-minor: "12.3.0"
# cuda-version-major: "12.3"
# nccl-version: 2.19.3-1
# cuda-samples-version: "12.3"
# hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64"
cu123:
uses: ./.github/workflows/build.yml
with:
folder: .
dockerfile: Dockerfile.ubuntu20
base-image: nvidia/cuda
base-tag: 12.3.2-cudnn9-devel-ubuntu20.04
cuda-version-minor: "12.3.2"
cuda-version-major: "12.3"
nccl-version: 2.20.3-1
cuda-samples-version: "12.3"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64"
14 changes: 7 additions & 7 deletions .github/workflows/ubuntu-22.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
cuda-version-major: "12.0"
nccl-version: 2.18.5-1
cuda-samples-version: "12.0"
hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"

cu121:
uses: ./.github/workflows/build.yml
Expand All @@ -31,7 +31,7 @@ jobs:
cuda-version-major: "12.1"
nccl-version: 2.18.3-1
cuda-samples-version: "12.1"
hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"

cu122:
uses: ./.github/workflows/build.yml
Expand All @@ -44,17 +44,17 @@ jobs:
cuda-version-major: "12.2"
nccl-version: 2.19.3-1
cuda-samples-version: "12.2"
hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"

cu123:
uses: ./.github/workflows/build.yml
with:
folder: .
dockerfile: Dockerfile.ubuntu22
base-image: nvidia/cuda
base-tag: 12.3.1-devel-ubuntu22.04
cuda-version-minor: "12.3.1"
base-tag: 12.3.2-cudnn9-devel-ubuntu22.04
cuda-version-minor: "12.3.2"
cuda-version-major: "12.3"
nccl-version: 2.19.3-1
nccl-version: 2.20.3-1
cuda-samples-version: "12.3"
hpcx-distribution: "hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"
2 changes: 1 addition & 1 deletion Dockerfile.ubuntu20
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ RUN mkdir /tmp/build && \
ARG HPCX_DISTRIBUTION="hpcx-v2.14-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.16-x86_64"
RUN cd /tmp && \
export HPCX_DIR="/opt/hpcx" && \
wget -q -O - http://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \
wget -q -O - https://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \
grep -IrlF "/build-result/${HPCX_DISTRIBUTION}" ${HPCX_DISTRIBUTION} | xargs -rd'\n' sed -i -e "s:/build-result/${HPCX_DISTRIBUTION}:${HPCX_DIR}:g" && \
mv ${HPCX_DISTRIBUTION} ${HPCX_DIR}

Expand Down
4 changes: 2 additions & 2 deletions Dockerfile.ubuntu22
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ RUN mkdir /tmp/build && \

# HPC-X
# grep + sed is used as a workaround to update hardcoded pkg-config / libtools archive / CMake prefixes
ARG HPCX_DISTRIBUTION="hpcx-v2.17-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"
ARG HPCX_DISTRIBUTION="hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"
RUN cd /tmp && \
export HPCX_DIR="/opt/hpcx" && \
wget -q -O - http://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \
wget -q -O - https://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \
grep -IrlF "/build-result/${HPCX_DISTRIBUTION}" ${HPCX_DISTRIBUTION} | xargs -rd'\n' sed -i -e "s:/build-result/${HPCX_DISTRIBUTION}:${HPCX_DIR}:g" && \
mv ${HPCX_DISTRIBUTION} ${HPCX_DIR}

Expand Down

0 comments on commit 868dc3d

Please sign in to comment.