From e328128b7209d132c12cbd8275efa911b5031b5e Mon Sep 17 00:00:00 2001
From: lwaekfjlk <1125027232@qq.com>
Date: Wed, 11 Oct 2023 18:42:30 +0000
Subject: [PATCH] update README and fix some bugs

---
 llm_deploy/README.md    | 4 +++-
 llm_deploy/vllm_test.py | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/llm_deploy/README.md b/llm_deploy/README.md
index 2ad30532..a6cff81f 100644
--- a/llm_deploy/README.md
+++ b/llm_deploy/README.md
@@ -1,3 +1,5 @@
 We need to use an unmerged branch to support deploying lora-finetuned model.
 (the forked repo is https://github.com/troph-team/vllm.git)
-Go to the vllm dir and pip install -e .
\ No newline at end of file
+Go to the vllm dir and pip install -e .
+
+Note https://github.com/vllm-project/vllm/issues/1283: if you hit a CUDA version error, pin the PyTorch version to == 2.0.1 in the config file.
\ No newline at end of file
diff --git a/llm_deploy/vllm_test.py b/llm_deploy/vllm_test.py
index 3421dc1d..0ebb79ac 100644
--- a/llm_deploy/vllm_test.py
+++ b/llm_deploy/vllm_test.py
@@ -1,11 +1,11 @@
 from vllm import LLM, SamplingParams
 from vllm.model_executor.adapters import lora
 
-# Create an LLM.
-llm = LLM(model="../llm_ft/vicuna-7b-1.5", gpu_memory_utilization=0.05)
+# Create an LLM; adjust gpu_memory_utilization to fit the available GPU memory.
+llm = LLM(model="../llm_ft/vicuna-7b-1.5", gpu_memory_utilization=0.5)
 
 # Add LoRA adapter
-lora.LoRAModel.from_pretrained(llm.llm_engine.workers[0].model, "../llm_ft/checkpoints/checkpoint-1200")
+lora.LoRAModel.from_pretrained(llm.llm_engine.workers[0].model, "../llm_ft/vicuna_checkpoints/checkpoint-1200")
 
 prompts = [
     "Hello, my name is",
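
For context on the vllm_test.py change above: the patched script creates the LLM, attaches the LoRA adapter from the fork, and then builds a prompts list. Below is a minimal sketch (not part of the patch) of how such a script typically finishes generation, assuming the standard vLLM SamplingParams/generate API; the sampling values are illustrative, not taken from the commit.

# Sketch, assuming the objects (llm, prompts) and the SamplingParams import
# already defined at the top of vllm_test.py.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

# Run batched generation over the prompts.
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    # Each RequestOutput holds the original prompt and its generated completions.
    print(output.prompt, output.outputs[0].text)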