
Commit 3d8ef3d: Update
lshmouse committed Oct 9, 2024
1 parent ea0bdb7 commit 3d8ef3d
Showing 5 changed files with 83 additions and 5 deletions.
24 changes: 21 additions & 3 deletions experimental/vllm_example/BUILD
@@ -2,10 +2,28 @@ load("@pip//:requirements.bzl", "requirement")
 load("@rules_python//python:defs.bzl", "py_binary", "py_library")
 
 py_binary(
-    name = "simple_inference",
-    srcs = ["simple_inference.py"],
-    main = "simple_inference.py",
+    name = "vllm_batch",
+    srcs = ["vllm_batch.py"],
+    main = "vllm_batch.py",
     deps = [
         requirement("vllm"),
     ],
 )
+
+py_binary(
+    name = "vllm_local",
+    srcs = ["vllm_local.py"],
+    main = "vllm_local.py",
+    deps = [
+        requirement("vllm"),
+    ],
+)
+
+py_binary(
+    name = "vllm_on_ray",
+    srcs = ["vllm_on_ray.py"],
+    main = "vllm_on_ray.py",
+    deps = [
+        requirement("vllm"),
+    ],
+)
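
With the targets above in place, each example can also be launched through Bazel rather than bare `python`; a minimal sketch, assuming the workspace's pip integration resolves the `vllm` requirement:

```
bazel run //experimental/vllm_example:vllm_local
bazel run //experimental/vllm_example:vllm_on_ray
```
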
34 changes: 32 additions & 2 deletions experimental/vllm_example/README.md
@@ -1,4 +1,34 @@
-## vllm
+## vLLM
 
+### Setup
+```
+pip install vllm
+```
+
+### Example
+Local:
+```
+python vllm_local.py
+```
+Ray cluster:
+```
+ray start --head
+python vllm_on_ray.py
+```
+Test:
+```
+curl "http://localhost:8000/inference?query=Who%20are%20you"
+```
+
+### TODO
+- Deploy vLLM on Kubernetes with a Helm chart
+- Run vLLM on a remote Ray cluster
+- Ray cluster internals
+
+### References
+- [vLLM](https://github.com/vllm-project/vllm)
+- https://zhuanlan.zhihu.com/p/710614883
+- https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html
+- https://docs.vllm.ai/en/latest/
+- https://github.com/vllm-project/vllm/issues/1363
+- http://kubeagi.k8s.com.cn/docs/Concepts/models
+- https://github.com/skypilot-org/skypilot
File renamed without changes: experimental/vllm_example/simple_inference.py → experimental/vllm_example/vllm_batch.py
5 changes: 5 additions & 0 deletions experimental/vllm_example/vllm_local.py
@@ -0,0 +1,5 @@
+from vllm import LLM
+
+llm = LLM("Qwen/Qwen2-0.5B")
+output = llm.generate("Who are you?")
+print(output)
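
`LLM.generate` returns a list of `RequestOutput` objects rather than plain strings, which is why the script above prints object reprs. A slightly fuller sketch of the same offline flow that extracts the generated text (the sampling values are illustrative, not from this commit):

```python
from vllm import LLM, SamplingParams

llm = LLM("Qwen/Qwen2-0.5B")
# Illustrative sampling settings, not part of the committed script.
params = SamplingParams(temperature=0.8, max_tokens=64)

# generate() accepts a batch of prompts and returns one RequestOutput per prompt.
outputs = llm.generate(["Who are you?", "What is vLLM?"], params)
for out in outputs:
    print(out.outputs[0].text)
```
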
25 changes: 25 additions & 0 deletions experimental/vllm_example/vllm_on_ray.py
@@ -0,0 +1,25 @@
+from fastapi import FastAPI
+from ray import serve
+from vllm import LLM
+
+app = FastAPI()
+
+
+@serve.deployment(num_replicas=1, ray_actor_options={"num_gpus": 1})
+@serve.ingress(app)
+class FastAPIDeployment:
+    def __init__(self):
+        self.llm = LLM("Qwen/Qwen2-0.5B")
+
+    @app.get("/inference")
+    def model_inference(self, query: str) -> str:
+        print(f"query: {query}")
+        output = self.llm.generate(query)
+        return str(output)
+
+    @app.get("/hello")
+    def hello(self) -> str:
+        return "hello"
+
+
+serve.run(FastAPIDeployment.bind(), route_prefix="/", name="qwen2_model_service")
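
Besides the curl test in the README, the `/inference` route can be exercised from Python; a minimal client sketch, assuming the Serve app is reachable on localhost:8000:

```python
import requests

# Query the /inference endpoint exposed by FastAPIDeployment.
resp = requests.get(
    "http://localhost:8000/inference",
    params={"query": "Who are you"},
)
print(resp.status_code, resp.text)
```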
