Commit
Merge branch 'master' into snap_fix
agunapal authored Sep 23, 2024
2 parents 9a00295 + d993070 commit 46d1a57
Showing 18 changed files with 59 additions and 33 deletions.
26 changes: 26 additions & 0 deletions benchmarks/utils/system_under_test.py
@@ -113,6 +113,7 @@ def start(self):
execute("torchserve --stop", wait=True)
click.secho("*Setting up model store...", fg="green")
self._prepare_local_dependency()
self._clear_neuron_cache_if_exists()
click.secho("*Starting local Torchserve instance...", fg="green")

ts_cmd = (
@@ -141,6 +142,31 @@ def start(self):
if "Model server started" in str(line).strip():
break

def _clear_neuron_cache_if_exists(self):
cache_dir = "/var/tmp/neuron-compile-cache/"

# Check if the directory exists
if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
click.secho(
f"Directory {cache_dir} exists. Clearing contents...", fg="green"
)

# Remove the directory contents
for filename in os.listdir(cache_dir):
file_path = os.path.join(cache_dir, filename)
try:
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
shutil.rmtree(file_path)
except Exception as e:
click.secho(f"Failed to delete {file_path}. Reason: {e}", fg="red")
click.secho(f"Cache cleared: {cache_dir}", fg="green")
else:
click.secho(
f"Directory {cache_dir} does not exist. No action taken.", fg="green"
)

def stop(self):
click.secho("*Terminating Torchserve instance...", fg="green")
execute("torchserve --stop", wait=True)
4 changes: 2 additions & 2 deletions docker/Dockerfile
@@ -73,7 +73,7 @@ COPY ./ serve
RUN \
if echo "$LOCAL_CHANGES" | grep -q "false"; then \
rm -rf serve;\
git clone --recursive $REPO_URL -b $BRANCH_NAME; \
git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
fi


@@ -238,7 +238,7 @@ COPY ./ serve
RUN \
if echo "$LOCAL_CHANGES" | grep -q "false"; then \
rm -rf serve;\
git clone --recursive $REPO_URL -b $BRANCH_NAME; \
git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
fi

COPY --from=compile-image /home/venv /home/venv
2 changes: 1 addition & 1 deletion examples/large_models/utils/test_llm_streaming_response.py
@@ -196,7 +196,7 @@ def parse_args():
"--model-version",
type=str,
default="1.0",
help="Model vesion. Default: 1.0",
help="Model version. Default: 1.0",
)

return parser.parse_args()
2 changes: 1 addition & 1 deletion examples/large_models/vllm/llama3/model-config.yaml
@@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 1200
startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

2 changes: 1 addition & 1 deletion examples/large_models/vllm/lora/Readme.md
@@ -55,7 +55,7 @@ The vllm integration uses an OpenAI compatible interface which lets you perform

Curl:
```bash
curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1
curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions
```

Python + Request:
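The repository's own Python example is collapsed in this view; a minimal sketch of an equivalent request against the updated `/v1/completions` path is shown below. The payload fields here are illustrative — the real request body for this example lives in `prompt.json`:

```python
import requests

# Illustrative payload; the actual fields are kept in prompt.json in this example's directory.
payload = {
    "model": "llama-8b-lora",
    "prompt": "Hello, my name is",
    "max_tokens": 50,
}

response = requests.post(
    "http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions",
    headers={"Content-Type": "application/json"},
    json=payload,
)
print(response.json())
```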
2 changes: 1 addition & 1 deletion examples/large_models/vllm/lora/model-config.yaml
@@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 1200
startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

2 changes: 1 addition & 1 deletion examples/large_models/vllm/mistral/model-config.yaml
@@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 1200
startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

2 changes: 1 addition & 1 deletion examples/large_models/vllm/requirements.txt
@@ -1 +1 @@
vllm==0.5.0
vllm==0.6.1.post2
Original file line number Diff line number Diff line change
@@ -203,7 +203,7 @@ private static Operation getSetDefaultOperation() {
MediaType error = getErrorResponse();

operation.addResponse(
new Response("200", "Default vesion succsesfully updated for model", status));
new Response("200", "Default version successfully updated for model", status));
operation.addResponse(
new Response("404", "Model not found or Model version not found", error));
operation.addResponse(new Response("500", "Internal Server Error", error));
Original file line number Diff line number Diff line change
@@ -98,7 +98,7 @@ public static String setDefault(String modelName, String newModelVersion)
ModelManager modelManager = ModelManager.getInstance();
modelManager.setDefaultVersion(modelName, newModelVersion);
String msg =
"Default vesion succsesfully updated for model \""
"Default version successfully updated for model \""
+ modelName
+ "\" to \""
+ newModelVersion
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@
public class AsyncWorkerThread extends WorkerThread {
// protected ConcurrentHashMap requestsInBackend;
protected static final Logger logger = LoggerFactory.getLogger(AsyncWorkerThread.class);
protected static final long MODEL_LOAD_TIMEOUT = 10L;
protected static final long WORKER_TIMEOUT = 2L;

protected boolean loadingFinished;
protected CountDownLatch latch;
@@ -53,6 +53,7 @@ public AsyncWorkerThread(
@Override
public void run() {
responseTimeout = model.getResponseTimeout();
startupTimeout = model.getStartupTimeout();
Thread thread = Thread.currentThread();
thread.setName(getWorkerName());
currentThread.set(thread);
@@ -80,11 +81,11 @@ public void run() {

if (loadingFinished == false) {
latch = new CountDownLatch(1);
if (!latch.await(MODEL_LOAD_TIMEOUT, TimeUnit.MINUTES)) {
if (!latch.await(startupTimeout, TimeUnit.SECONDS)) {
throw new WorkerInitializationException(
"Worker did not load the model within"
+ MODEL_LOAD_TIMEOUT
+ " mins");
"Worker did not load the model within "
+ startupTimeout
+ " seconds");
}
}

@@ -99,7 +100,7 @@ public void run() {
logger.debug("Shutting down the thread .. Scaling down.");
} else {
logger.debug(
"Backend worker monitoring thread interrupted or backend worker process died., responseTimeout:"
"Backend worker monitoring thread interrupted or backend worker process died. responseTimeout:"
+ responseTimeout
+ "sec",
e);
Original file line number Diff line number Diff line change
@@ -429,7 +429,7 @@ public void testSetDefaultVersionNoop() throws InterruptedException {
StatusResponse resp = JsonUtils.GSON.fromJson(TestUtils.getResult(), StatusResponse.class);
Assert.assertEquals(
resp.getStatus(),
"Default vesion succsesfully updated for model \"noopversioned\" to \"1.2.1\"");
"Default version successfully updated for model \"noopversioned\" to \"1.2.1\"");
}

@Test(
Original file line number Diff line number Diff line change
@@ -1671,7 +1671,7 @@
],
"responses": {
"200": {
"description": "Default vesion succsesfully updated for model",
"description": "Default version successfully updated for model",
"content": {
"application/json": {
"schema": {
Original file line number Diff line number Diff line change
@@ -1216,7 +1216,7 @@
],
"responses": {
"200": {
"description": "Default vesion succsesfully updated for model",
"description": "Default version successfully updated for model",
"content": {
"application/json": {
"schema": {
2 changes: 1 addition & 1 deletion kubernetes/kserve/build_image.sh
@@ -66,5 +66,5 @@ cp -r ../../third_party .
if [ "${MULTI}" == "true" ]; then
DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE --platform "${ARCH}" -t "$DOCKER_TAG" --push .
else
DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" --load .
DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" --push .
fi
5 changes: 0 additions & 5 deletions kubernetes/kserve/build_upload_release.py
@@ -39,11 +39,6 @@
dry_run,
)

for image in [
f"{organization}/torchserve-kfs:{check_ts_version()}-gpu",
]:
try_and_handle(f"docker push {image}", dry_run)

# Cleanup built images
if args.cleanup:
try_and_handle(f"docker system prune --all --volumes -f", dry_run)
9 changes: 2 additions & 7 deletions kubernetes/kserve/docker_nightly.py
@@ -43,22 +43,17 @@
dry_run,
)

# Push Nightly images to official PyTorch Dockerhub account
try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)

# Tag nightly images with latest
try_and_handle(
f"docker buildx imagetools create --tag {organization}/{project}:latest-cpu {organization}/{cpu_version}",
dry_run,
)

try_and_handle(
f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
f"docker buildx imagetools create --tag {organization}/{project}:latest-gpu {organization}/{gpu_version}",
dry_run,
)

# Push images with latest tag
try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)

# Cleanup built images
if args.cleanup:
try_and_handle(f"docker system prune --all --volumes -f", dry_run)
11 changes: 10 additions & 1 deletion ts/llm_launcher.py
@@ -67,6 +67,7 @@ def get_model_config(args, model_snapshot_path=None):
"batchSize": 1,
"maxBatchDelay": 100,
"responseTimeout": 1200,
"startupTimeout": args.startup_timeout,
"deviceType": "gpu",
"asyncCommunication": True,
}
@@ -227,7 +228,7 @@ def main(args):
parser.add_argument(
"--vllm_engine.max_num_seqs",
type=int,
default=16,
default=256,
help="Max sequences in vllm engine",
)

@@ -245,6 +246,13 @@
help="Cache dir",
)

parser.add_argument(
"--startup_timeout",
type=int,
default=1200,
help="Model startup timeout in seconds",
)

parser.add_argument(
"--engine",
type=str,
@@ -272,6 +280,7 @@ def main(args):
default=0.1,
help="KV Cache free gpu memory fraction",
)

args = parser.parse_args()

main(args)
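The new `--startup_timeout` flag (in seconds) flows into the generated model config as `startupTimeout`, which the frontend's `AsyncWorkerThread` now uses in place of the old fixed 10-minute `MODEL_LOAD_TIMEOUT`. A hedged sketch of passing a longer startup window when launching from a TorchServe source checkout — only flags visible in this diff are used, and a real invocation typically also selects a model:

```python
import subprocess

# Illustrative invocation; assumes the launcher is runnable as a module from a
# source checkout and that the model to serve is defaulted or supplied separately.
subprocess.run(
    [
        "python",
        "-m",
        "ts.llm_launcher",
        "--engine",
        "vllm",
        "--startup_timeout",
        "1800",  # allow up to 30 minutes for model download/compile/load
    ],
    check=True,
)
```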
