vLLM 0.6.2 rebase (#340)
you know the drill
kzawora-intel authored Sep 27, 2024
2 parents c3577af + 5ffcfa3 commit f347a84
Showing 183 changed files with 8,393 additions and 3,103 deletions.
24 changes: 21 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -70,7 +70,7 @@ steps:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test # 10min
mirror_hardwares: [amd]
fast_check: true
@@ -90,8 +90,11 @@ steps:
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@@ -207,6 +210,21 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4

- label: "PyTorch Fullgraph Smoke Test"
fast_check: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph_smoke.py

- label: "PyTorch Fullgraph Test"
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py

- label: Kernels Test %N # 30min each
mirror_hardwares: [amd]
source_file_dependencies:
@@ -352,7 +370,7 @@ steps:
- tests/distributed/
- vllm/compilation
commands:
- pytest -v -s ./compile/test_full_graph.py
- pytest -v -s ./compile/test_full_graph_multi_gpu.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
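
Note: the sharded lora run above (--shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT, parallelism: 4) relies on the pytest-shard plugin, which this diff also adds to the ROCm image. A minimal sketch of what one of the four jobs executes once Buildkite substitutes the variables (shard index 0 is an illustrative value):

    # pytest-shard assigns each collected test to exactly one shard, so the
    # four parallel jobs together cover the lora suite exactly once
    pytest -v -s lora --shard-id=0 --num-shards=4 --ignore=lora/test_long_context.py
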
1 change: 1 addition & 0 deletions .github/workflows/cpu-test.yml
@@ -27,6 +27,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
pip install -r requirements-build.txt
pip install -r requirements-hpu.txt
VLLM_TARGET_DEVICE=hpu python setup.py develop
- name: cpu-test
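
Note: the added requirements-build.txt step matters because python setup.py develop runs in the ambient environment, outside pip's isolated build env, so build-time dependencies (setuptools-scm among them, given this diff's new version-file handling) must be installed up front. The resulting order, condensed as a sketch:

    # build deps first, then runtime deps, then the in-place HPU build
    pip install -r requirements-build.txt
    pip install -r requirements-hpu.txt
    VLLM_TARGET_DEVICE=hpu python setup.py develop
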
4 changes: 2 additions & 2 deletions .gitignore
@@ -1,5 +1,5 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py
# version file generated by setuptools-scm
/vllm/_version.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/
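
Note: the ignore rule changes because the hand-generated vllm/commit_id.py gives way to a setuptools-scm-generated /vllm/_version.py derived from git metadata at build time; this is also why several Dockerfiles below start bind-mounting .git. A quick sanity check, assuming setuptools-scm >= 8 is installed and the command runs from a git checkout:

    # prints the version setuptools-scm would write to vllm/_version.py
    # (a 0.6.2.devN+g<sha>-style string); fails if .git is not visible
    python3 -m setuptools_scm
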
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -223,6 +223,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
@@ -315,6 +316,11 @@ set(VLLM_MOE_EXT_SRC

if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_moe_ops.cu")
endif()

7 changes: 3 additions & 4 deletions Dockerfile
@@ -79,15 +79,13 @@ ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}

ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
@@ -107,6 +105,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
@@ -203,7 +202,7 @@ FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'
pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10

ENV VLLM_USAGE_SOURCE production-docker-image

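
Note: the --mount=type=cache and --mount=type=bind,source=.git,target=.git flags recurring in this and the following Dockerfiles are BuildKit features: cache mounts persist pip/ccache state across rebuilds, and the read-only .git bind mount exposes commit metadata to setuptools-scm without baking repository history into an image layer. BuildKit must be enabled for them to work; an illustrative invocation (the image tag is assumed):

    # RUN --mount=... fails under the legacy builder, so enable BuildKit
    DOCKER_BUILDKIT=1 docker build --target vllm-openai -t vllm-openai:local .
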
4 changes: 3 additions & 1 deletion Dockerfile.cpu
@@ -62,8 +62,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl
pip install dist/*.whl && \
rm -rf dist

WORKDIR /workspace/

23 changes: 13 additions & 10 deletions Dockerfile.neuron
@@ -6,9 +6,12 @@ FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update \
&& apt-get install python3 python3-pip -y \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
ffmpeg libsm6 libxext6 libgl1

### Mount Point ###
# When launching the container, mount the code directory to /app
@@ -22,17 +25,17 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
COPY . /app/vllm

RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
&& python3 -m pip install -U \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-neuron.txt

ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \
&& pip install -e . \
RUN --mount=type=bind,source=.git,target=.git \
cd /app/vllm \
&& pip install --no-build-isolation -v -e . \
&& cd ..

CMD ["/bin/bash"]
5 changes: 3 additions & 2 deletions Dockerfile.openvino
@@ -4,8 +4,9 @@
FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git && \
apt-get install -y ffmpeg libsm6 libxext6 libgl1
apt-get install -y \
git python3-pip \
ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

# copy requirements
12 changes: 9 additions & 3 deletions Dockerfile.ppc64le
@@ -16,9 +16,15 @@ COPY ./ /workspace/vllm
WORKDIR /workspace/vllm

# These packages will be in rocketce eventually
RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing

RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0

RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install

WORKDIR /workspace/

63 changes: 24 additions & 39 deletions Dockerfile.rocm
@@ -1,24 +1,18 @@
# Default ROCm 6.1 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
# Default ROCm 6.2 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"

# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"

# Whether to install CK-based flash-attention
# If 0, will not install flash-attention
ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="23a2b1c2"
ARG FA_BRANCH="3cea2fb"

# Whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e0fc12c"
ARG TRITON_BRANCH="e192dba"

### Base image build stage
FROM $BASE_IMAGE AS base
@@ -50,14 +44,17 @@ RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \

# Install torch == 2.6.0 on ROCm
RUN --mount=type=cache,target=/root/.cache/pip \
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.2"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --no-cache-dir --pre \
torch==2.5.0.dev20240726 \
torchvision==0.20.0.dev20240726 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
&& python3 -m pip install --pre \
torch==2.6.0.dev20240918 \
setuptools-scm>=8 \
torchvision==0.20.0.dev20240918 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
*) ;; esac

ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
@@ -79,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi
### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
# If a suitable wheel exists, we download it instead of building FA
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
else \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
fi; \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
@@ -112,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& python3 -m pip install ninja cmake wheel pybind11 \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout "${TRITON_BRANCH}" \
@@ -129,7 +120,7 @@ COPY . .

# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard


# Workaround for ray >= 2.10.0
@@ -138,15 +129,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false

RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=bind,source=.git,target=.git \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
# Prevent interference if torch bundles its own HIP runtime
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
*) ;; esac \
&& python3 setup.py clean --all \
&& python3 setup.py develop

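
Note: the repeated case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" guard dispatches on the ROCm version actually present in the image rather than on the base-image tag. The detection by itself, as a sketch (the exact /opt directory name on the new base image is an assumption):

    # /opt carries a versioned rocm-X.Y* directory next to the rocm symlink
    ls /opt | grep -Po 'rocm-[0-9]\.[0-9]'    # -> e.g. rocm-6.2
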
17 changes: 13 additions & 4 deletions Dockerfile.tpu
@@ -5,16 +5,25 @@ FROM $BASE_IMAGE
WORKDIR /workspace

# Install some basic utilities
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
RUN apt-get update && apt-get install -y \
git \
ffmpeg libsm6 libxext6 libgl1

# Install the TPU and Pallas dependencies.
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

# Build vLLM.
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
cd /workspace/vllm && \
python3 -m pip install \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-tpu.txt
RUN cd /workspace/vllm && python3 setup.py develop

CMD ["/bin/bash"]
13 changes: 9 additions & 4 deletions Dockerfile.xpu
@@ -7,15 +7,20 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
RUN apt-get update -y && \
apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-xpu.txt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-xpu.txt

RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=xpu python3 setup.py install

CMD ["/bin/bash"]
[diff truncated: remaining changed files not shown]