Skip to content

Commit

Permalink
Add torch 2.2.1 support (#3059)
Browse files Browse the repository at this point in the history
* 2.2 is patch free

* docker

* fix torchvision

* run generate

* remove apex
  • Loading branch information
mvpatel2000 authored Feb 27, 2024
1 parent 1c52d47 commit 0814e01
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 43 deletions.
4 changes: 4 additions & 0 deletions composer/trainer/mosaic_fsdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ def patch_pytorch():
from torch.distributed.fsdp import _runtime_utils
_runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None

elif version.parse(torch.__version__) < version.parse('2.2.2'):
# No monkey patch is needed for torch < 2.2.2, i.e. torch == 2.2.1
pass

elif version.parse(torch.__version__) < version.parse('2.3.1'):
# Monkey patch for torch < 2.3.1, i.e. torch == 2.3.0
# Note: this is the same patch as 2.2.0, we are just making a new if branch
Expand Down
20 changes: 0 additions & 20 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -270,26 +270,6 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \
rm -rf /tmp/mofed ; \
fi


#####################
# Install NVIDIA Apex
#####################
# Skip if torch nightly is installed, as Apex is incompatible with it
RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \
mkdir -p /tmp/apex && \
cd /tmp/apex && \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && \
pip${PYTHON_VERSION} install --no-cache-dir -r requirements.txt && \
pip${PYTHON_VERSION} install --no-cache-dir \
--global-option="--cpp_ext" \
--global-option="--cuda_ext" \
--target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \
./ && \
rm -rf /tmp/apex ; \
fi

##########################
# Install Flash Attention
##########################
Expand Down
6 changes: 3 additions & 3 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ To install composer, once inside the image, run `pip install mosaicml`.
|----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.2.0 | cpu | 3.11 | `mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.2.1 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.2.1 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.2.1 | cpu | 3.11 | `mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` |
Expand Down
24 changes: 12 additions & 12 deletions docker/build_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.0
IMAGE_NAME: torch-2-2-0-cu121
IMAGE_NAME: torch-2-2-1-cu121
MOFED_VERSION: 5.5-1.0.3.2
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
Expand All @@ -21,15 +21,15 @@
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.2.0
PYTORCH_VERSION: 2.2.1
TAGS:
- mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04
- mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.17.0
TORCHVISION_VERSION: 0.17.1
- AWS_OFI_NCCL_VERSION: v1.7.4-aws
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.0
IMAGE_NAME: torch-2-2-0-cu121-aws
IMAGE_NAME: torch-2-2-1-cu121-aws
MOFED_VERSION: ''
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
Expand All @@ -48,25 +48,25 @@
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.2.0
PYTORCH_VERSION: 2.2.1
TAGS:
- mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws
- mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04-aws
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.17.0
TORCHVISION_VERSION: 0.17.1
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: ubuntu:20.04
CUDA_VERSION: ''
IMAGE_NAME: torch-2-2-0-cpu
IMAGE_NAME: torch-2-2-1-cpu
MOFED_VERSION: ''
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.2.0
PYTORCH_VERSION: 2.2.1
TAGS:
- mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
- mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.17.0
TORCHVISION_VERSION: 0.17.1
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.0
Expand Down
8 changes: 4 additions & 4 deletions docker/generate_build_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@


def _get_torchvision_version(pytorch_version: str):
if pytorch_version == '2.2.0':
return '0.17.0'
if pytorch_version == '2.2.1':
return '0.17.1'
if pytorch_version == '2.1.2':
return '0.16.2'
if pytorch_version == '2.0.1':
Expand All @@ -42,7 +42,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
# From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/
if not use_cuda:
return ''
if pytorch_version == '2.2.0':
if pytorch_version == '2.2.1':
return '12.1.0'
if pytorch_version == '2.1.2':
return '12.1.0'
Expand Down Expand Up @@ -163,7 +163,7 @@ def _write_table(table_tag: str, table_contents: str):


def _main():
python_pytorch_versions = [('3.11', '2.2.0'), ('3.10', '2.1.2'), ('3.10', '2.0.1')]
python_pytorch_versions = [('3.11', '2.2.1'), ('3.10', '2.1.2'), ('3.10', '2.0.1')]
cuda_options = [True, False]
stages = ['pytorch_stage']
interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS
Expand Down
4 changes: 0 additions & 4 deletions tests/test_passes.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,6 @@ class TestAlgorithmOrderingPasses:
@pytest.mark.parametrize('algorithm_cls', [LowPrecisionLayerNorm])
def test_algorithm_last(self, algorithm_cls: Type[Algorithm], always_match_algorithms: List[Algorithm],
dummy_logger: Logger, dummy_state: State):

if algorithm_cls == LowPrecisionLayerNorm:
pytest.importorskip('apex')

algorithm = algorithm_cls()
algorithm.apply = Mock(return_value='algo')
algorithm.match = Mock(return_value=True)
Expand Down

0 comments on commit 0814e01

Please sign in to comment.