
Commit 3d8ef3d: Update
lshmouse committed Oct 9, 2024
1 parent ea0bdb7 commit 3d8ef3d
Showing 5 changed files with 83 additions and 5 deletions.
24 changes: 21 additions & 3 deletions experimental/vllm_example/BUILD
@@ -2,10 +2,28 @@ load("@pip//:requirements.bzl", "requirement")
 load("@rules_python//python:defs.bzl", "py_binary", "py_library")
 
 py_binary(
-    name = "simple_inference",
-    srcs = ["simple_inference.py"],
-    main = "simple_inference.py",
+    name = "vllm_batch",
+    srcs = ["vllm_batch.py"],
+    main = "vllm_batch.py",
     deps = [
         requirement("vllm"),
     ],
 )
+
+py_binary(
+    name = "vllm_local",
+    srcs = ["vllm_local.py"],
+    main = "vllm_local.py",
+    deps = [
+        requirement("vllm"),
+    ],
+)
+
+py_binary(
+    name = "vllm_on_ray",
+    srcs = ["vllm_on_ray.py"],
+    main = "vllm_on_ray.py",
+    deps = [
+        requirement("vllm"),
+    ],
+)
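
With the targets above in place, each example can also be launched through Bazel rather than bare `python`; a minimal sketch, assuming the workspace's pip integration resolves the `vllm` requirement:

```
bazel run //experimental/vllm_example:vllm_local
bazel run //experimental/vllm_example:vllm_on_ray
```
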
34 changes: 32 additions & 2 deletions experimental/vllm_example/README.md
@@ -1,4 +1,34 @@
-## vllm
+## vLLM
 
+### Setup
+```
+pip install vllm
+```
+
+### Example
+Local:
+```
+python vllm_local.py
+```
+Ray cluster:
+```
+ray start --head
+python vllm_on_ray.py
+```
+Test:
+```
+curl "http://localhost:8000/inference?query=Who%20are%20you"
+```
+
+### TODO
+- Deploy vLLM on Kubernetes with a Helm chart
+- Run vLLM on a remote Ray cluster
+- Ray cluster internals
+
+### References
+- [vLLM](https://github.com/vllm-project/vllm)
+- https://zhuanlan.zhihu.com/p/710614883
+- https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html
+- https://docs.vllm.ai/en/latest/
+- https://github.com/vllm-project/vllm/issues/1363
+- http://kubeagi.k8s.com.cn/docs/Concepts/models
+- https://github.com/skypilot-org/skypilot
File renamed without changes: experimental/vllm_example/simple_inference.py → experimental/vllm_example/vllm_batch.py
5 changes: 5 additions & 0 deletions experimental/vllm_example/vllm_local.py
@@ -0,0 +1,5 @@
+from vllm import LLM
+
+llm = LLM("Qwen/Qwen2-0.5B")
+output = llm.generate("Who are you?")
+print(output)
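
`LLM.generate` returns a list of `RequestOutput` objects rather than plain strings, which is why the script above prints object reprs. A slightly fuller sketch of the same offline flow that extracts the generated text (the sampling values are illustrative, not from this commit):

```python
from vllm import LLM, SamplingParams

llm = LLM("Qwen/Qwen2-0.5B")
# Illustrative sampling settings, not part of the committed script.
params = SamplingParams(temperature=0.8, max_tokens=64)

# generate() accepts a batch of prompts and returns one RequestOutput per prompt.
outputs = llm.generate(["Who are you?", "What is vLLM?"], params)
for out in outputs:
    print(out.outputs[0].text)
```
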
25 changes: 25 additions & 0 deletions experimental/vllm_example/vllm_on_ray.py
@@ -0,0 +1,25 @@
+from fastapi import FastAPI
+from ray import serve
+from vllm import LLM
+
+app = FastAPI()
+
+
+@serve.deployment(num_replicas=1, ray_actor_options={"num_gpus": 1})
+@serve.ingress(app)
+class FastAPIDeployment:
+    def __init__(self):
+        self.llm = LLM("Qwen/Qwen2-0.5B")
+
+    @app.get("/inference")
+    def model_inference(self, query: str) -> str:
+        print(f"query: {query}")
+        output = self.llm.generate(query)
+        return str(output)
+
+    @app.get("/hello")
+    def hello(self) -> str:
+        return "hello"
+
+
+serve.run(FastAPIDeployment.bind(), route_prefix="/", name="qwen2_model_service")
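
Besides the curl test in the README, the `/inference` route can be exercised from Python; a minimal client sketch, assuming the Serve app is reachable on localhost:8000:

```python
import requests

# Query the /inference endpoint exposed by FastAPIDeployment.
resp = requests.get(
    "http://localhost:8000/inference",
    params={"query": "Who are you"},
)
print(resp.status_code, resp.text)
```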
