Merge branch 'master' into fix/rename_vllm_dockerfile
mreso authored Sep 24, 2024
2 parents fd71f07 + 6881ec5 commit 8394fd7
Showing 50 changed files with 776 additions and 258 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/benchmark_nightly.yml
@@ -55,7 +55,7 @@ jobs:
NEURON_RT_NUM_CORES: 1
run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuronx.yaml --skip false
- name: Save benchmark artifacts
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: nightly ${{ matrix.hardware }} artifact
path: /tmp/ts_benchmark
@@ -72,7 +72,7 @@ jobs:
- name: Update benchmark artifacts for auto validation
run: python benchmarks/utils/update_artifacts.py --output /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation
- name: Upload the updated benchmark artifacts for auto validation
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.hardware }}_benchmark_validation
path: /tmp/ts_artifacts
7 changes: 5 additions & 2 deletions .github/workflows/docker-nightly-build.yml
@@ -1,10 +1,11 @@
name: Push Docker Nightly

on:
# run every day at 1:15pm
# Run every day at 1:15pm
schedule:
- cron: "15 13 * * *"
workflow_dispatch:

jobs:
nightly:
runs-on: [self-hosted, ci-gpu]
@@ -32,12 +33,14 @@ jobs:
- name: Push Docker Nightly
run: |
cd docker
sudo apt-get update
docker buildx use multibuilder
python docker_nightly.py --cleanup
- name: Push KServe Docker Nightly
run: |
cd kubernetes/kserve
docker buildx use multibuilder
python docker_nightly.py --cleanup
- name: Open issue on failure
if: ${{ failure() && github.event_name == 'schedule' }}
uses: dacbd/create-issue-action@v1
2 changes: 2 additions & 0 deletions .github/workflows/official_release_docker.yml
@@ -42,9 +42,11 @@ jobs:
if: github.event.inputs.upload_docker == 'yes'
run: |
cd docker
docker buildx use multibuilder
python build_upload_release.py --cleanup
- name: Build & Upload pytorch/torchserve-kfs Docker images
if: github.event.inputs.upload_kfs == 'yes'
run: |
cd kubernetes/kserve
docker buildx use multibuilder
python build_upload_release.py --cleanup
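
Both release workflows now run `docker buildx use multibuilder` before invoking the build scripts, so they assume a buildx builder named `multibuilder` already exists on the runner. A minimal setup sketch (builder name taken from the workflows; the platform list is an assumption matching `ARCH` in `build_image.sh` below):

```bash
# One-time setup on the CI runner: create and select a multi-platform builder.
docker buildx create --name multibuilder --platform linux/amd64,linux/arm64 --use
```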
15 changes: 13 additions & 2 deletions README.md
@@ -62,12 +62,23 @@ Refer to [torchserve docker](docker/README.md) for details.

### 🤖 Quick Start LLM Deployment

#### VLLM Engine
```bash
# Make sure to install torchserve with pip or conda as described above and login with `huggingface-cli login`
python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --disable_token_auth

# Try it out
curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
curl -X POST -d '{"model":"meta-llama/Meta-Llama-3.1-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
```
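
The launcher registers the model under the name `model` (hence the URL above), and the `v1/completions` route follows the OpenAI-style completions schema. To inspect the reply, the same request can be piped through a pretty-printer (exact response fields depend on the vLLM version):

```bash
# Response is OpenAI-style JSON, roughly:
# {"id": ..., "object": "text_completion", "choices": [{"text": ...}], ...}
curl -s -X POST -d '{"model":"meta-llama/Meta-Llama-3.1-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' \
  --header "Content-Type: application/json" \
  "http://localhost:8080/predictions/model/1.0/v1/completions" | python -m json.tool
```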

#### TRT-LLM Engine
```bash
# Make sure to install torchserve with python venv as described above and login with `huggingface-cli login`
# pip install -U --use-deprecated=legacy-resolver -r requirements/trt_llm.txt
python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --engine trt_llm --disable_token_auth

# Try it out
curl -X POST -d '{"prompt":"count from 1 to 9 in french ", "max_tokens": 100}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model"
```

### 🚢 Quick Start LLM Deployment with Docker
26 changes: 26 additions & 0 deletions benchmarks/utils/system_under_test.py
@@ -113,6 +113,7 @@ def start(self):
execute("torchserve --stop", wait=True)
click.secho("*Setting up model store...", fg="green")
self._prepare_local_dependency()
self._clear_neuron_cache_if_exists()
click.secho("*Starting local Torchserve instance...", fg="green")

ts_cmd = (
@@ -141,6 +142,31 @@ def start(self):
if "Model server started" in str(line).strip():
break

def _clear_neuron_cache_if_exists(self):
cache_dir = "/var/tmp/neuron-compile-cache/"

# Check if the directory exists
if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
click.secho(
f"Directory {cache_dir} exists. Clearing contents...", fg="green"
)

# Remove the directory contents
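# (one entry at a time, rather than shutil.rmtree(cache_dir) wholesale, so
# the cache directory itself stays in place and a failure on any single
# entry is reported below without aborting the rest of the sweep)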
for filename in os.listdir(cache_dir):
file_path = os.path.join(cache_dir, filename)
try:
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
shutil.rmtree(file_path)
except Exception as e:
click.secho(f"Failed to delete {file_path}. Reason: {e}", fg="red")
click.secho(f"Cache cleared: {cache_dir}", fg="green")
else:
click.secho(
f"Directory {cache_dir} does not exist. No action taken.", fg="green"
)

def stop(self):
click.secho("*Terminating Torchserve instance...", fg="green")
execute("torchserve --stop", wait=True)
4 changes: 2 additions & 2 deletions docker/Dockerfile
@@ -73,7 +73,7 @@ COPY ./ serve
RUN \
if echo "$LOCAL_CHANGES" | grep -q "false"; then \
rm -rf serve;\
git clone --recursive $REPO_URL -b $BRANCH_NAME; \
git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
fi


@@ -238,7 +238,7 @@ COPY ./ serve
RUN \
if echo "$LOCAL_CHANGES" | grep -q "false"; then \
rm -rf serve;\
git clone --recursive $REPO_URL -b $BRANCH_NAME; \
git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
fi

COPY --from=compile-image /home/venv /home/venv
23 changes: 17 additions & 6 deletions docker/build_image.sh
@@ -18,6 +18,8 @@ BUILD_NIGHTLY=false
BUILD_FROM_SRC=false
LOCAL_CHANGES=true
PYTHON_VERSION=3.9
ARCH="linux/arm64,linux/amd64"
MULTI=false

for arg in "$@"
do
@@ -101,6 +103,10 @@
BUILD_CPP=true
shift
;;
-m|--multi)
MULTI=true
shift
;;
-n|--nightly)
BUILD_NIGHTLY=true
shift
@@ -214,12 +220,17 @@
fi
fi

if [ "${BUILD_TYPE}" == "production" ]
then
DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
--build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\
--build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --target production-image ../
elif [ "${BUILD_TYPE}" == "ci" ]
if [ "${BUILD_TYPE}" == "production" ]; then
if [ "${MULTI}" == "true" ]; then
DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
--build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\
--build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --platform "${ARCH}" --target production-image ../ --push
else
DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
--build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\
--build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --target production-image ../ --load
fi
elif [ "${BUILD_TYPE}" == "ci" ];
then
DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\
--build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\
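
The new `-m/--multi` flag switches the production build to a multi-platform `buildx` build over `ARCH` (`linux/arm64,linux/amd64`). Because a multi-arch result cannot be loaded into the single-platform local daemon, that path ends in `--push` (straight to the registry), while the default single-arch path keeps `--load`. A usage sketch (tag hypothetical; assumes the `multibuilder` builder from the workflows is active):

```bash
# Single-arch image, built and loaded into the local Docker daemon:
./build_image.sh -t myorg/torchserve:latest

# Multi-arch image (linux/arm64 + linux/amd64), pushed directly to the registry:
./build_image.sh -m -t myorg/torchserve:latest
```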
12 changes: 6 additions & 6 deletions docker/build_upload_release.py
@@ -31,7 +31,7 @@
organization = args.organization

# Upload pytorch/torchserve docker binaries
try_and_handle(f"./build_image.sh -t {organization}/torchserve:latest", dry_run)
try_and_handle(f"./build_image.sh -m -t {organization}/torchserve:latest", dry_run)
try_and_handle(
f"./build_image.sh -g -cv cu121 -t {organization}/torchserve:latest-gpu",
dry_run,
@@ -44,14 +44,17 @@
f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/torchserve:latest-cpp-dev-gpu",
dry_run,
)

try_and_handle(
f"docker tag {organization}/torchserve:latest {organization}/torchserve:latest-cpu",
f"docker buildx imagetools create --tag {organization}/torchserve:latest-cpu {organization}/torchserve:latest",
dry_run,
)

try_and_handle(
f"docker tag {organization}/torchserve:latest {organization}/torchserve:{check_ts_version()}-cpu",
f"docker buildx imagetools create --tag {organization}/torchserve:{check_ts_version()}-cpu {organization}/torchserve:latest",
dry_run,
)

try_and_handle(
f"docker tag {organization}/torchserve:latest-gpu {organization}/torchserve:{check_ts_version()}-gpu",
dry_run,
@@ -66,12 +69,9 @@
)

for image in [
f"{organization}/torchserve:latest",
f"{organization}/torchserve:latest-cpu",
f"{organization}/torchserve:latest-gpu",
f"{organization}/torchserve:latest-cpp-dev-cpu",
f"{organization}/torchserve:latest-cpp-dev-gpu",
f"{organization}/torchserve:{check_ts_version()}-cpu",
f"{organization}/torchserve:{check_ts_version()}-gpu",
f"{organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
f"{organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
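
The move from `docker tag` + `docker push` to `docker buildx imagetools create` follows from the multi-arch build above: a local `docker tag` can only retag whatever single-platform image the daemon holds, whereas `imagetools create` assembles a new manifest list in the registry that references every architecture already pushed under the source tag. A sketch (organization hypothetical):

```bash
# Registry-side retag: latest-cpu becomes a manifest list covering all
# architectures already pushed under latest.
docker buildx imagetools create \
  --tag myorg/torchserve:latest-cpu myorg/torchserve:latest
```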
9 changes: 3 additions & 6 deletions docker/docker_nightly.py
@@ -39,7 +39,7 @@
cpp_dev_gpu_version = f"{project}:cpp-dev-gpu-{get_nightly_version()}"

# Build Nightly images and append the date in the name
try_and_handle(f"./build_image.sh -n -t {organization}/{cpu_version}", dry_run)
try_and_handle(f"./build_image.sh -m -n -t {organization}/{cpu_version}", dry_run)
try_and_handle(
f"./build_image.sh -g -cv cu121 -n -t {organization}/{gpu_version}",
dry_run,
@@ -54,18 +54,17 @@
)

# Push Nightly images to official PyTorch Dockerhub account
try_and_handle(f"docker push {organization}/{cpu_version}", dry_run)
try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)
try_and_handle(f"docker push {organization}/{cpp_dev_cpu_version}", dry_run)
try_and_handle(f"docker push {organization}/{cpp_dev_gpu_version}", dry_run)

# Tag nightly images with latest
try_and_handle(
f"docker tag {organization}/{cpu_version} {organization}/{project}:latest-cpu",
f"docker buildx imagetools create --tag {organization}/{project}:latest-cpu {organization}/{cpu_version}",
dry_run,
)
try_and_handle(
f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
f"docker buildx imagetools create --tag {organization}/{project}:latest-gpu {organization}/{gpu_version}",
dry_run,
)
try_and_handle(
@@ -78,8 +77,6 @@
)

# Push images with latest tag
try_and_handle(f"docker push {organization}/{project}:latest-cpu", dry_run)
try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)
try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-cpu", dry_run)
try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-gpu", dry_run)

4 changes: 2 additions & 2 deletions docs/Troubleshooting.md
@@ -76,8 +76,8 @@ Relevant issues: [[#566](https://github.com/pytorch/serve/issues/566)]
#### How can I resolve model specific python dependency?
You can provide a requirements.txt while creating a mar file using "--requirements-file/ -r" flag. You can refer to the [waveglow text-to-speech-synthesizer](https://github.com/pytorch/serve/tree/master/examples/text_to_speech_synthesizer) example

- [waveglow mar creation script](https://github.com/pytorch/serve/blob/master/examples/text_to_speech_synthesizer/create_mar.sh)
- [waveglow handler](https://github.com/pytorch/serve/blob/2d9c7ccc316f592374943a1963c1057bbe232c9e/examples/text_to_speech_synthesizer/waveglow_handler.py#L49)
- [waveglow mar creation script](https://github.com/pytorch/serve/blob/master/examples/text_to_speech_synthesizer/WaveGlow/create_mar.sh)
- [waveglow handler](https://github.com/pytorch/serve/blob/2d9c7ccc316f592374943a1963c1057bbe232c9e/examples/text_to_speech_synthesizer/WaveGlow/waveglow_handler.py#L49)

Relevant issues: [[#566](https://github.com/pytorch/serve/issues/566)]
Refer [Torch model archiver cli](https://github.com/pytorch/serve/blob/master/model-archiver/README.md#torch-model-archiver-command-line-interface) for more details.
4 changes: 2 additions & 2 deletions docs/custom_service.md
@@ -250,7 +250,7 @@ class ModelHandler(BaseHandler):

```

Refer [waveglow_handler](https://github.com/pytorch/serve/blob/master/examples/text_to_speech_synthesizer/waveglow_handler.py) for more details.
Refer [waveglow_handler](https://github.com/pytorch/serve/blob/master/examples/text_to_speech_synthesizer/WaveGlow/waveglow_handler.py) for more details.

#### Captum explanations for custom handler

@@ -362,7 +362,7 @@ class CustomImageClassifier(ImageClassifier):
For more details refer following examples :
- [mnist digit classifier handler](https://github.com/pytorch/serve/tree/master/examples/image_classifier)
- [Huggingface transformer generalized handler](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/Transformer_handler_generalized.py)
- [Waveglow text to speech synthesizer](https://github.com/pytorch/serve/blob/master/examples/text_to_speech_synthesizer/waveglow_handler.py)
- [Waveglow text to speech synthesizer](https://github.com/pytorch/serve/blob/master/examples/text_to_speech_synthesizer/WaveGlow/waveglow_handler.py)

## Creating a model archive with an entry point

58 changes: 58 additions & 0 deletions examples/custom_endpoint_plugin/ModelReady.java
@@ -0,0 +1,58 @@
package org.pytorch.serve.plugins.endpoint;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import org.pytorch.serve.servingsdk.Context;
import org.pytorch.serve.servingsdk.Model;
import org.pytorch.serve.servingsdk.ModelServerEndpoint;
import org.pytorch.serve.servingsdk.Worker;
import org.pytorch.serve.servingsdk.annotations.Endpoint;
import org.pytorch.serve.servingsdk.annotations.helpers.EndpointTypes;
import org.pytorch.serve.servingsdk.http.Request;
import org.pytorch.serve.servingsdk.http.Response;

@Endpoint(
urlPattern = "model-ready",
endpointType = EndpointTypes.INFERENCE,
description = "Endpoint indicating registered model/s ready to serve inference requests")
public class ModelReady extends ModelServerEndpoint {
private boolean modelsLoaded(Context ctx) {
Map<String, Model> modelMap = ctx.getModels();

if (modelMap.isEmpty()) {
return false;
}

for (Map.Entry<String, Model> entry : modelMap.entrySet()) {
boolean workerReady = false;
for (Worker w : entry.getValue().getModelWorkers()) {
if (w.isRunning()) {
workerReady = true;
break;
}
}
if (!workerReady) {
return false;
}
}
return true;
}

@Override
public void doGet(Request req, Response rsp, Context ctx) throws IOException {
if (modelsLoaded(ctx)) {
rsp.setStatus(200, "Model/s ready");
rsp.getOutputStream()
.write(
"{\n\t\"Status\": \"Model/s ready\"\n}\n"
.getBytes(StandardCharsets.UTF_8));
} else {
rsp.setStatus(503, "Model/s not ready");
rsp.getOutputStream()
.write(
"{\n\t\"Status\": \"Model/s not ready\"\n}\n"
.getBytes(StandardCharsets.UTF_8));
}
}
}
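
Assuming the compiled plugin jar is on TorchServe's plugins path, the new endpoint is served on the inference port (8080 by default) and can be checked with a plain GET; the status and body match the `doGet` logic above:

```bash
# 200 "Model/s ready" once every registered model has at least one running
# worker; 503 "Model/s not ready" otherwise.
curl -i http://localhost:8080/model-ready
```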