From 0814e01698ed9c244d6af30211bcfce3deeeed3d Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 27 Feb 2024 10:06:47 -0500 Subject: [PATCH] Add torch 2.2.1 support (#3059) * 2.2 is patch free * docker * fix torchvision * run generate * remove apex --- composer/trainer/mosaic_fsdp.py | 4 ++++ docker/Dockerfile | 20 -------------------- docker/README.md | 6 +++--- docker/build_matrix.yaml | 24 ++++++++++++------------ docker/generate_build_matrix.py | 8 ++++---- tests/test_passes.py | 4 ---- 6 files changed, 23 insertions(+), 43 deletions(-) diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py index 07a4f15fbf..51bf891491 100644 --- a/composer/trainer/mosaic_fsdp.py +++ b/composer/trainer/mosaic_fsdp.py @@ -61,6 +61,10 @@ def patch_pytorch(): from torch.distributed.fsdp import _runtime_utils _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None + elif version.parse(torch.__version__) < version.parse('2.2.2'): + # Monkey patch for torch < 2.2.2 ie torch == 2.2.1 + pass + elif version.parse(torch.__version__) < version.parse('2.3.1'): # Monkey patch for torch < 2.3.1 ie torch == 2.3.0 # Note: this is the same patch as 2.2.0, we are just making a new if branch diff --git a/docker/Dockerfile b/docker/Dockerfile index e5ae9b9468..7c9735e13f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -270,26 +270,6 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ rm -rf /tmp/mofed ; \ fi - -##################### -# Install NVIDIA Apex -##################### -# skip if torch nightly is installed as there is incompatability -RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ - mkdir -p /tmp/apex && \ - cd /tmp/apex && \ - git clone https://github.com/NVIDIA/apex && \ - cd apex && \ - git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && \ - pip${PYTHON_VERSION} install --no-cache-dir -r requirements.txt && \ - pip${PYTHON_VERSION} install --no-cache-dir \ - --global-option="--cpp_ext" \ - --global-option="--cuda_ext" \ - --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ - ./ && \ - rm -rf /tmp/apex ; \ - fi - ########################## # Install Flash Attention ########################## diff --git a/docker/README.md b/docker/README.md index d0624e2665..1491e162f2 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,9 +32,9 @@ To install composer, once inside the image, run `pip install mosaicml`. |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.2.0 | cpu | 3.11 | `mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.2.1 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.2.1 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.2.1 | cpu | 3.11 | `mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 21c36347e9..fb548db8bb 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -2,7 +2,7 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-2-0-cu121 + IMAGE_NAME: torch-2-2-1-cu121 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -21,15 +21,15 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.0 + PYTORCH_VERSION: 2.2.1 TAGS: - - mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.0 + TORCHVISION_VERSION: 0.17.1 - AWS_OFI_NCCL_VERSION: v1.7.4-aws BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-2-0-cu121-aws + IMAGE_NAME: torch-2-2-1-cu121-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -48,25 +48,25 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.0 + PYTORCH_VERSION: 2.2.1 TAGS: - - mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.0 + TORCHVISION_VERSION: 0.17.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-2-0-cpu + IMAGE_NAME: torch-2-2-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.0 + PYTORCH_VERSION: 2.2.1 TAGS: - - mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.0 + TORCHVISION_VERSION: 0.17.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index ca378388c6..215acef5b8 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -23,8 +23,8 @@ def _get_torchvision_version(pytorch_version: str): - if pytorch_version == '2.2.0': - return '0.17.0' + if pytorch_version == '2.2.1': + return '0.17.1' if pytorch_version == '2.1.2': return '0.16.2' if pytorch_version == '2.0.1': @@ -42,7 +42,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool): # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/ if not use_cuda: return '' - if pytorch_version == '2.2.0': + if pytorch_version == '2.2.1': return '12.1.0' if pytorch_version == '2.1.2': return '12.1.0' @@ -163,7 +163,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_pytorch_versions = [('3.11', '2.2.0'), ('3.10', '2.1.2'), ('3.10', '2.0.1')] + python_pytorch_versions = [('3.11', '2.2.1'), ('3.10', '2.1.2'), ('3.10', '2.0.1')] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS diff --git a/tests/test_passes.py b/tests/test_passes.py index 19e5dc0843..3f3a99dee6 100644 --- a/tests/test_passes.py +++ b/tests/test_passes.py @@ -84,10 +84,6 @@ class TestAlgorithmOrderingPasses: @pytest.mark.parametrize('algorithm_cls', [LowPrecisionLayerNorm]) def test_algorithm_last(self, algorithm_cls: Type[Algorithm], always_match_algorithms: List[Algorithm], dummy_logger: Logger, dummy_state: State): - - if algorithm_cls == LowPrecisionLayerNorm: - pytest.importorskip('apex') - algorithm = algorithm_cls() algorithm.apply = Mock(return_value='algo') algorithm.match = Mock(return_value=True)