Add the new FPDT feature (#6462)
[FPDT](https://arxiv.org/abs/2408.16978) (Fully Pipelined Distributed Transformer) can only be used with [this version](microsoft/Megatron-DeepSpeed#441) of Megatron-DeepSpeed.
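
At a high level, FPDT keeps accelerator memory roughly flat in sequence length by splitting the sequence into chunks and offloading earlier chunks' key/value blocks to host memory, fetching them back one at a time during attention. The sketch below is a minimal, self-contained illustration of that chunk-and-offload idea (single head, causal, q/k/v of shape [seq_len, dim]) using an online-softmax merge; the function name and structure are illustrative only, not the DeepSpeed API added by this commit.

import math
import torch

def chunked_causal_attention(q, k, v, chunk_size):
    """Causal attention computed one query chunk at a time.

    Past (k, v) chunks live in host memory and are copied back to the
    compute device one at a time, so device memory scales with
    chunk_size rather than seq_len.
    """
    seq_len, dim = q.shape
    scale = 1.0 / math.sqrt(dim)
    kv_host = []                        # offloaded (k_chunk, v_chunk) pairs
    out = torch.empty_like(q)

    for s in range(0, seq_len, chunk_size):
        e = min(s + chunk_size, seq_len)
        q_c = q[s:e]
        # running online-softmax state for this query chunk
        m = torch.full((e - s, 1), float('-inf'), device=q.device)
        l = torch.zeros(e - s, 1, device=q.device)
        acc = torch.zeros(e - s, dim, device=q.device)

        # attend to every earlier chunk (fetched from host) plus the
        # current chunk (causally masked), merging via log-sum-exp
        for j, (k_h, v_h) in enumerate(kv_host + [(k[s:e], v[s:e])]):
            k_c, v_c = k_h.to(q.device), v_h.to(q.device)
            scores = (q_c @ k_c.T) * scale
            if j == len(kv_host):       # current chunk: causal mask
                mask = torch.ones_like(scores, dtype=torch.bool).triu(1)
                scores = scores.masked_fill(mask, float('-inf'))
            m_new = torch.maximum(m, scores.max(dim=-1, keepdim=True).values)
            p = torch.exp(scores - m_new)
            corr = torch.exp(m - m_new)
            acc = acc * corr + p @ v_c
            l = l * corr + p.sum(dim=-1, keepdim=True)
            m = m_new

        out[s:e] = acc / l
        kv_host.append((k[s:e].cpu(), v[s:e].cpu()))  # offload this chunk's KV
    return out

For small sizes this matches full softmax attention; the real implementation additionally overlaps the host-device transfers with compute and composes with Ulysses sequence parallelism.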

---------

Co-authored-by: Jinghan Yao <yjhmitweb@ascend-rw02.ten.osc.edu>
Co-authored-by: Sam Ade Jacobs <samjacobs@microsoft.com>
Co-authored-by: Jinghan Yao <yjhmitweb@ascend-rw01.ten.osc.edu>
Co-authored-by: Logan Adams <loadams@microsoft.com>
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Jinghan Yao <yjhmitweb@cardinal-rw02.ten.osc.edu>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
Co-authored-by: Masahiro Tanaka <mtanaka@microsoft.com>
10 people authored Nov 26, 2024
1 parent f743fec commit 13f4943
Showing 6 changed files with 1,432 additions and 2 deletions.
64 changes: 64 additions & 0 deletions .github/workflows/nv-flash-attn.yml
@@ -0,0 +1,64 @@
name: nv-flash-attn

on:
  workflow_dispatch:
  pull_request:
    paths:
      - 'deepspeed/sequence/**'
      - 'tests/unit/sequence_parallelism/**'
      - '.github/workflows/nv-flash-attn.yml'
  schedule:
    - cron: "0 0 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, a6000]
    container:
      image: nvcr.io/nvidia/pytorch:24.03-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"

    steps:
      - uses: actions/checkout@v4

      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
      - name: Install transformers
        run: |
          git clone --depth=1 https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .
      - name: Install deepspeed
        run: |
          python -m pip install .[dev]
          ds_report
      - name: Install FlashAttention
        run: |
          python -m pip install flash-attn
      - name: Python environment
        run: |
          python -m pip list
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.3" --cuda_ver="12"
      - name: Open GitHub issue if nightly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
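
For reference, the "Unit tests" step above can be reproduced locally through pytest's Python entry point. This is a sketch under assumptions: a DeepSpeed checkout with `pip install .[dev]`, flash-attn installed, and a CUDA GPU; the `--torch_ver`/`--cuda_ver` options come from the conftest in the tests directory, which is why the script changes into it first, mirroring the CI step.

import os
import pytest

# Mirror the CI step: JIT-compile kernels only for the local GPU arch.
os.environ.pop("TORCH_CUDA_ARCH_LIST", None)
os.chdir("tests")  # the conftest defining --torch_ver/--cuda_ver lives here

raise SystemExit(pytest.main([
    "--color=yes", "--durations=0", "--verbose", "-rF",
    "unit/sequence_parallelism/test_ulysses.py",
    "--torch_ver=2.3", "--cuda_ver=12",
]))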
4 changes: 3 additions & 1 deletion deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -369,7 +369,9 @@ def is_activation_to_checkpoint(item):
     Is an activation to be checkpointed
     """
     global mp_size
-    return torch.is_tensor(item) and item.is_floating_point() and item.numel() >= mp_size
+    extra_flag = (not hasattr(item, 'no_checkpointing')) or (hasattr(item, 'no_checkpointing')
+                                                             and item.no_checkpointing == False)
+    return torch.is_tensor(item) and item.is_floating_point() and item.numel() >= mp_size and extra_flag


 def partition_activations(args, cpu_checkpoint, contiguous_checkpoint):
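
The new `extra_flag` adds an opt-out hook: any tensor carrying a `no_checkpointing = True` attribute is now skipped by `is_activation_to_checkpoint`. Below is a minimal standalone demonstration of the check; the `getattr` form is behaviorally equivalent to the diff's expression for boolean flags, and how FPDT itself tags tensors is not shown in this commit.

import torch

mp_size = 1  # stand-in for the module-level global used in checkpointing.py

def is_activation_to_checkpoint(item):
    # Equivalent to the hasattr / == False check in the diff above.
    extra_flag = not getattr(item, 'no_checkpointing', False)
    return torch.is_tensor(item) and item.is_floating_point() and item.numel() >= mp_size and extra_flag

x = torch.randn(8, 16)
y = torch.randn(8, 16)
y.no_checkpointing = True  # plain Python attribute; torch.Tensor instances allow this

print(is_activation_to_checkpoint(x))  # True  -> checkpointed as before
print(is_activation_to_checkpoint(y))  # False -> opted out of checkpointing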