Skip to content

Commit

Permalink
Merge pull request #27 from coreweave/wbrown.cuda13
Browse files (browse the repository at this point in the history)
build: Add disabled CUDA 12.3, add `--allow-downgrades`.
  • Loading branch information
wbrown authored Nov 29, 2023
2 parents (47644c6 + 03f9015), commit 3ddac2c
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 22 deletions.
38 changes: 19 additions & 19 deletions .github/workflows/ubuntu-20.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,6 @@ on:
- .github/workflows/build.yml

jobs:
cu117:
uses: ./.github/workflows/build.yml
with:
folder: .
dockerfile: Dockerfile.ubuntu20
base-image: nvidia/cuda
base-tag: 11.7.1-cudnn8-devel-ubuntu20.04
cuda-version-minor: "11.7.1"
cuda-version-major: "11.7"
nccl-version: 2.14.3-1
cuda-samples-version: "11.6"
hpcx-version: "2.14"
hpcx-nccl-version: "2.16"
hpcx-cuda-version: "11"
hpcx-mlnx-ofed: "MLNX_OFED_LINUX-5"

cu118:
uses: ./.github/workflows/build.yml
with:
Expand All @@ -32,7 +16,7 @@ jobs:
base-tag: 11.8.0-cudnn8-devel-ubuntu20.04
cuda-version-minor: "11.8.0"
cuda-version-major: "11.8"
nccl-version: 2.16.2-1
nccl-version: 2.16.5-1
cuda-samples-version: "11.6"
hpcx-version: "2.14"
hpcx-nccl-version: "2.16"
Expand All @@ -48,7 +32,7 @@ jobs:
base-tag: 12.0.1-cudnn8-devel-ubuntu20.04
cuda-version-minor: "12.0.1"
cuda-version-major: "12.0"
nccl-version: 2.18.5-1
nccl-version: 2.19.3-1
cuda-samples-version: "12.0"
hpcx-version: "2.16"
hpcx-nccl-version: "2.18"
Expand Down Expand Up @@ -80,9 +64,25 @@ jobs:
base-tag: 12.2.2-cudnn8-devel-ubuntu20.04
cuda-version-minor: "12.2.2"
cuda-version-major: "12.2"
nccl-version: 2.18.5-1
nccl-version: 2.19.3-1
cuda-samples-version: "12.2"
hpcx-version: "2.16"
hpcx-nccl-version: "2.18"
hpcx-cuda-version: "12"
hpcx-mlnx-ofed: "mlnx_ofed"

# cu123:
# uses: ./.github/workflows/build.yml
# with:
# folder: .
# dockerfile: Dockerfile.ubuntu20
# base-image: nvidia/cuda
# base-tag: 12.3.0-cudnn8-devel-ubuntu20.04
# cuda-version-minor: "12.3.0"
# cuda-version-major: "12.3"
# nccl-version: 2.19.3-1
# cuda-samples-version: "12.3"
# hpcx-version: "2.17"
# hpcx-nccl-version: "2.19"
# hpcx-cuda-version: "12"
# hpcx-mlnx-ofed: "mlnx_ofed"
10 changes: 7 additions & 3 deletions Dockerfile.ubuntu20
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@ ARG TARGET_NCCL_VERSION=2.14.3-1

ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get -qq update && \
apt-get -qq install -y --allow-change-held-packages --no-install-recommends \
apt-get -qq install -y \
--allow-change-held-packages \
--no-install-recommends \
--allow-downgrades \
build-essential libtool autoconf automake autotools-dev unzip \
ca-certificates \
wget curl openssh-server vim environment-modules \
iputils-ping net-tools \
libnuma1 libsubunit0 libpci-dev \
libpmix-dev \
datacenter-gpu-manager \
libnccl2=$TARGET_NCCL_VERSION+cuda${CUDA_VERSION_MAJOR} libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
libnccl2=$TARGET_NCCL_VERSION+cuda${CUDA_VERSION_MAJOR} \
libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
git

# Mellanox OFED (latest)
Expand Down Expand Up @@ -63,7 +67,7 @@ ARG HPCX_MLNX_OFED="MLNX_OFED_LINUX-5"
RUN cd /tmp && \
export HPCX_DISTRIBUTION="hpcx-v${HPCX_VERSION}-gcc-${HPCX_MLNX_OFED}-ubuntu20.04-cuda${HPCX_CUDA_VERSION}-gdrcopy2-nccl${HPCX_NCCL_VERSION}-x86_64" \
HPCX_DIR="/opt/hpcx" && \
wget -q -O - http://blobstore.s3.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \
wget -q -O - http://blobstore.object.ord1.coreweave.com/drivers/${HPCX_DISTRIBUTION}.tbz | tar xjf - && \
grep -IrlF "/build-result/${HPCX_DISTRIBUTION}" ${HPCX_DISTRIBUTION} | xargs -rd'\n' sed -i -e "s:/build-result/${HPCX_DISTRIBUTION}:${HPCX_DIR}:g" && \
mv ${HPCX_DISTRIBUTION} ${HPCX_DIR}

Expand Down

0 comments on commit 3ddac2c

Please sign in to comment.