Skip to content

Commit

Permalink
Update the dockerfile with a few necessary changes from NeMo to achie…
Browse files Browse the repository at this point in the history
…ve good H100 numerical accuracy in flash attention with TE
  • Loading branch information
jstjohn committed Dec 19, 2024
1 parent 6f34fad commit 4c539f8
Showing 1 changed file with 12 additions and 12 deletions.
24 changes: 12 additions & 12 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Base image with apex and transformer engine, but without NeMo or Megatron-LM.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
# Note that the core NeMo docker container is defined here:
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/llm_train/Dockerfile.train
# with settings that get defined/injected from this config:
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/.gitlab-ci.yml
# We should keep versions in our container up to date to ensure that we get the latest tested perf improvements and
# training loss curves from NeMo.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.10-py3

FROM rust:1.82.0 as rust-env

Expand All @@ -25,7 +31,7 @@ RUN git clone https://github.com/NVIDIA/apex.git && \
--config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm"

# Transformer Engine pre-1.7.0. 1.7 standardizes the meaning of bits in the attention mask to match
ARG TE_COMMIT=c27ee60ec746210bcea4ec33958dbbff06706506
ARG TE_COMMIT=2215fa5c7557b66034068816020f9f611019e457
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin ${TE_COMMIT} && \
Expand All @@ -49,11 +55,11 @@ RUN apt-get install -y gnupg
# Check the nemo dependency for causal conv1d and make sure this checkout
# tag matches. If not, update the tag in the following line.
RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.0.post2
git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1

# Mamba dependency installation
RUN pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/state-spaces/mamba.git@v2.0.3
git+https://github.com/state-spaces/mamba.git@v2.2.2

RUN pip install hatchling # needed to install nemo-run
ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2
Expand All @@ -72,21 +78,15 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*
# Purge the libslurm37 and libpmi2-0 packages, then autoremove any packages
# that were only installed as their dependencies.
# NOTE(review): presumably these are removed to avoid clashing with
# cluster-provided Slurm/PMI libraries - confirm the reason and record it here.
RUN apt purge -y libslurm37 libpmi2-0 && \
apt autoremove -y
RUN source /usr/local/nvm/nvm.sh && \
NODE_VER=$(nvm current) && \
nvm deactivate && \
nvm uninstall $NODE_VER && \
sed -i "/NVM/d" /root/.bashrc && \
sed -i "/nvm.sh/d" /etc/bash.bashrc


# Use UV to install python packages from the workspace. This just installs packages into the system's python
# environment, and does not use the current uv.lock file.
COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
ENV UV_LINK_MODE=copy \
UV_COMPILE_BYTECODE=1 \
UV_PYTHON_DOWNLOADS=never \
UV_SYSTEM_PYTHON=true \
UV_NO_CACHE=1
UV_SYSTEM_PYTHON=true

# Install the bionemo-geometric requirements ahead of copying over the rest of the repo, so that we can cache their
# installation. These involve building some torch extensions, so they can take a while to install.
Expand Down

0 comments on commit 4c539f8

Please sign in to comment.