diff --git a/benchmarks/utils/system_under_test.py b/benchmarks/utils/system_under_test.py
index 3f20f8b0d2..85820203b8 100644
--- a/benchmarks/utils/system_under_test.py
+++ b/benchmarks/utils/system_under_test.py
@@ -113,6 +113,7 @@ def start(self):
         execute("torchserve --stop", wait=True)
         click.secho("*Setting up model store...", fg="green")
         self._prepare_local_dependency()
+        self._clear_neuron_cache_if_exists()
         click.secho("*Starting local Torchserve instance...", fg="green")
 
         ts_cmd = (
@@ -141,6 +142,31 @@ def start(self):
             if "Model server started" in str(line).strip():
                 break
 
+    def _clear_neuron_cache_if_exists(self):
+        cache_dir = "/var/tmp/neuron-compile-cache/"
+
+        # Check if the directory exists
+        if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
+            click.secho(
+                f"Directory {cache_dir} exists. Clearing contents...", fg="green"
+            )
+
+            # Remove the directory contents
+            for filename in os.listdir(cache_dir):
+                file_path = os.path.join(cache_dir, filename)
+                try:
+                    if os.path.isfile(file_path) or os.path.islink(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception as e:
+                    click.secho(f"Failed to delete {file_path}. Reason: {e}", fg="red")
+            click.secho(f"Cache cleared: {cache_dir}", fg="green")
+        else:
+            click.secho(
+                f"Directory {cache_dir} does not exist. No action taken.", fg="green"
+            )
+
     def stop(self):
         click.secho("*Terminating Torchserve instance...", fg="green")
         execute("torchserve --stop", wait=True)
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 71d48f58fe..94f4a1ba99 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -73,7 +73,7 @@ COPY ./ serve
 RUN \
     if echo "$LOCAL_CHANGES" | grep -q "false"; then \
         rm -rf serve;\
-        git clone --recursive $REPO_URL -b $BRANCH_NAME; \
+        git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
     fi
 
 
@@ -238,7 +238,7 @@ COPY ./ serve
 RUN \
     if echo "$LOCAL_CHANGES" | grep -q "false"; then \
         rm -rf serve;\
-        git clone --recursive $REPO_URL -b $BRANCH_NAME; \
+        git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
     fi
 
 COPY --from=compile-image /home/venv /home/venv
diff --git a/examples/large_models/utils/test_llm_streaming_response.py b/examples/large_models/utils/test_llm_streaming_response.py
index 55f9129bc3..94e6b373e5 100644
--- a/examples/large_models/utils/test_llm_streaming_response.py
+++ b/examples/large_models/utils/test_llm_streaming_response.py
@@ -196,7 +196,7 @@ def parse_args():
         "--model-version",
         type=str,
         default="1.0",
-        help="Model vesion. Default: 1.0",
+        help="Model version. Default: 1.0",
     )
 
     return parser.parse_args()
diff --git a/examples/large_models/vllm/llama3/model-config.yaml b/examples/large_models/vllm/llama3/model-config.yaml
index 1fea5ff658..a699d25496 100644
--- a/examples/large_models/vllm/llama3/model-config.yaml
+++ b/examples/large_models/vllm/llama3/model-config.yaml
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true
 
diff --git a/examples/large_models/vllm/lora/Readme.md b/examples/large_models/vllm/lora/Readme.md
index c592f23a73..0469d53567 100644
--- a/examples/large_models/vllm/lora/Readme.md
+++ b/examples/large_models/vllm/lora/Readme.md
@@ -55,7 +55,7 @@ The vllm integration uses an OpenAI compatible interface which lets you perform
 
 Curl:
 ```bash
-curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1
+curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions
 ```
 
 Python + Request:
diff --git a/examples/large_models/vllm/lora/model-config.yaml b/examples/large_models/vllm/lora/model-config.yaml
index 85db70338e..8fda7ef6fa 100644
--- a/examples/large_models/vllm/lora/model-config.yaml
+++ b/examples/large_models/vllm/lora/model-config.yaml
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true
 
diff --git a/examples/large_models/vllm/mistral/model-config.yaml b/examples/large_models/vllm/mistral/model-config.yaml
index 7aac9b01b1..4f4ef5dc25 100644
--- a/examples/large_models/vllm/mistral/model-config.yaml
+++ b/examples/large_models/vllm/mistral/model-config.yaml
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true
 
diff --git a/examples/large_models/vllm/requirements.txt b/examples/large_models/vllm/requirements.txt
index a603b3c4ec..6d0209c820 100644
--- a/examples/large_models/vllm/requirements.txt
+++ b/examples/large_models/vllm/requirements.txt
@@ -1 +1 @@
-vllm==0.5.0
+vllm==0.6.1.post2
diff --git a/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java b/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java
index 5d216f0465..f307188647 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java
@@ -203,7 +203,7 @@ private static Operation getSetDefaultOperation() {
         MediaType error = getErrorResponse();
 
         operation.addResponse(
-                new Response("200", "Default vesion succsesfully updated for model", status));
+                new Response("200", "Default version successfully updated for model", status));
         operation.addResponse(
                 new Response("404", "Model not found or Model version not found", error));
         operation.addResponse(new Response("500", "Internal Server Error", error));
diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java b/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java
index 46c476affd..70f5a1c644 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java
@@ -98,7 +98,7 @@ public static String setDefault(String modelName, String newModelVersion)
         ModelManager modelManager = ModelManager.getInstance();
         modelManager.setDefaultVersion(modelName, newModelVersion);
         String msg =
-                "Default vesion succsesfully updated for model \""
+                "Default version successfully updated for model \""
                         + modelName
                         + "\" to \""
                         + newModelVersion
diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java
index 2dd1272b01..526f3b9232 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java
@@ -33,7 +33,7 @@ public class AsyncWorkerThread extends WorkerThread {
     // protected ConcurrentHashMap requestsInBackend;
     protected static final Logger logger = LoggerFactory.getLogger(AsyncWorkerThread.class);
 
-    protected static final long MODEL_LOAD_TIMEOUT = 10L;
+    protected static final long WORKER_TIMEOUT = 2L;
     protected boolean loadingFinished;
     protected CountDownLatch latch;
 
@@ -53,6 +53,7 @@ public AsyncWorkerThread(
     @Override
     public void run() {
         responseTimeout = model.getResponseTimeout();
+        startupTimeout = model.getStartupTimeout();
         Thread thread = Thread.currentThread();
         thread.setName(getWorkerName());
         currentThread.set(thread);
@@ -80,11 +81,11 @@ public void run() {
 
             if (loadingFinished == false) {
                 latch = new CountDownLatch(1);
-                if (!latch.await(MODEL_LOAD_TIMEOUT, TimeUnit.MINUTES)) {
+                if (!latch.await(startupTimeout, TimeUnit.SECONDS)) {
                     throw new WorkerInitializationException(
-                            "Worker did not load the model within"
-                                    + MODEL_LOAD_TIMEOUT
-                                    + " mins");
+                            "Worker did not load the model within "
+                                    + startupTimeout
+                                    + " seconds");
                 }
             }
 
@@ -99,7 +100,7 @@ public void run() {
                 logger.debug("Shutting down the thread .. Scaling down.");
             } else {
                 logger.debug(
-                        "Backend worker monitoring thread interrupted or backend worker process died., responseTimeout:"
+                        "Backend worker monitoring thread interrupted or backend worker process died. responseTimeout:"
                                 + responseTimeout
                                 + "sec",
                         e);
diff --git a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
index 518143ec04..f419a26657 100644
--- a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
+++ b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
@@ -429,7 +429,7 @@ public void testSetDefaultVersionNoop() throws InterruptedException {
         StatusResponse resp = JsonUtils.GSON.fromJson(TestUtils.getResult(), StatusResponse.class);
         Assert.assertEquals(
                 resp.getStatus(),
-                "Default vesion succsesfully updated for model \"noopversioned\" to \"1.2.1\"");
+                "Default version successfully updated for model \"noopversioned\" to \"1.2.1\"");
     }
 
     @Test(
diff --git a/frontend/server/src/test/resources/management_open_api.json b/frontend/server/src/test/resources/management_open_api.json
index 352cc1bd83..4e926297a1 100644
--- a/frontend/server/src/test/resources/management_open_api.json
+++ b/frontend/server/src/test/resources/management_open_api.json
@@ -1671,7 +1671,7 @@
         ],
         "responses": {
           "200": {
-            "description": "Default vesion succsesfully updated for model",
+            "description": "Default version successfully updated for model",
             "content": {
               "application/json": {
                 "schema": {
diff --git a/frontend/server/src/test/resources/model_management_api.json b/frontend/server/src/test/resources/model_management_api.json
index 9ca6badc69..93cb7bae6b 100644
--- a/frontend/server/src/test/resources/model_management_api.json
+++ b/frontend/server/src/test/resources/model_management_api.json
@@ -1216,7 +1216,7 @@
         ],
         "responses": {
           "200": {
-            "description": "Default vesion succsesfully updated for model",
+            "description": "Default version successfully updated for model",
             "content": {
               "application/json": {
                 "schema": {
diff --git a/kubernetes/kserve/build_image.sh b/kubernetes/kserve/build_image.sh
index 670915f1fc..39a005b8f0 100755
--- a/kubernetes/kserve/build_image.sh
+++ b/kubernetes/kserve/build_image.sh
@@ -66,5 +66,5 @@ cp -r ../../third_party .
 if [ "${MULTI}" == "true" ]; then
     DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE --platform "${ARCH}" -t "$DOCKER_TAG" --push .
 else
-    DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" --load .
+    DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" --push .
 fi
diff --git a/kubernetes/kserve/build_upload_release.py b/kubernetes/kserve/build_upload_release.py
index 0c7c481671..e9dd311610 100644
--- a/kubernetes/kserve/build_upload_release.py
+++ b/kubernetes/kserve/build_upload_release.py
@@ -39,11 +39,6 @@
         dry_run,
     )
 
-    for image in [
-        f"{organization}/torchserve-kfs:{check_ts_version()}-gpu",
-    ]:
-        try_and_handle(f"docker push {image}", dry_run)
-
     # Cleanup built images
     if args.cleanup:
         try_and_handle(f"docker system prune --all --volumes -f", dry_run)
diff --git a/kubernetes/kserve/docker_nightly.py b/kubernetes/kserve/docker_nightly.py
index d5d3f13b76..38c3bdea19 100644
--- a/kubernetes/kserve/docker_nightly.py
+++ b/kubernetes/kserve/docker_nightly.py
@@ -43,22 +43,17 @@
         dry_run,
     )
 
-    # Push Nightly images to official PyTorch Dockerhub account
-    try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)
-
     # Tag nightly images with latest
     try_and_handle(
         f"docker buildx imagetools create --tag {organization}/{project}:latest-cpu {organization}/{cpu_version}",
         dry_run,
     )
+
     try_and_handle(
-        f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
+        f"docker buildx imagetools create --tag {organization}/{project}:latest-gpu {organization}/{gpu_version}",
         dry_run,
     )
 
-    # Push images with latest tag
-    try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)
-
     # Cleanup built images
     if args.cleanup:
         try_and_handle(f"docker system prune --all --volumes -f", dry_run)
diff --git a/ts/llm_launcher.py b/ts/llm_launcher.py
index 1d95731e2f..591eedd474 100644
--- a/ts/llm_launcher.py
+++ b/ts/llm_launcher.py
@@ -67,6 +67,7 @@ def get_model_config(args, model_snapshot_path=None):
         "batchSize": 1,
         "maxBatchDelay": 100,
         "responseTimeout": 1200,
+        "startupTimeout": args.startup_timeout,
         "deviceType": "gpu",
         "asyncCommunication": True,
     }
@@ -227,7 +228,7 @@ def main(args):
     parser.add_argument(
         "--vllm_engine.max_num_seqs",
         type=int,
-        default=16,
+        default=256,
        help="Max sequences in vllm engine",
    )
 
@@ -245,6 +246,13 @@ def main(args):
         help="Cache dir",
     )
 
+    parser.add_argument(
+        "--startup_timeout",
+        type=int,
+        default=1200,
+        help="Model startup timeout in seconds",
+    )
+
     parser.add_argument(
         "--engine",
         type=str,
@@ -272,6 +280,7 @@ def main(args):
         default=0.1,
         help="KV Cache free gpu memory fraction",
     )
+
     args = parser.parse_args()
 
     main(args)
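
For reference, a minimal sketch of how the new startup budget is exercised end to end, assuming the `--startup_timeout` flag added to `ts/llm_launcher.py` above (it feeds the `startupTimeout` key in `get_model_config()`) and the launcher's existing `--model_id` option; model name and values are illustrative only:

```bash
# Launch a vLLM-backed model via the TorchServe LLM launcher, giving the async
# worker up to 1200 seconds (the new default) to finish loading the model.
# --startup_timeout maps to the "startupTimeout" model-config key; --model_id is
# assumed to be the launcher's pre-existing flag for selecting the HF model.
python -m ts.llm_launcher \
    --model_id meta-llama/Meta-Llama-3-8B-Instruct \
    --startup_timeout 1200
```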