From e328128b7209d132c12cbd8275efa911b5031b5e Mon Sep 17 00:00:00 2001
From: lwaekfjlk <1125027232@qq.com>
Date: Wed, 11 Oct 2023 18:42:30 +0000
Subject: [PATCH] update README and fix some bugs

---
 llm_deploy/README.md    | 4 +++-
 llm_deploy/vllm_test.py | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/llm_deploy/README.md b/llm_deploy/README.md
index 2ad30532..a6cff81f 100644
--- a/llm_deploy/README.md
+++ b/llm_deploy/README.md
@@ -1,3 +1,5 @@
 We need to use an unmerged branch to support deploying lora-finetuned model.
 (the forked repo is https://github.com/troph-team/vllm.git)
-Go to the vllm dir and pip install -e .
\ No newline at end of file
+Go to the vllm dir and pip install -e .
+
+Note https://github.com/vllm-project/vllm/issues/1283: if you hit a CUDA version error, pin the PyTorch version to == 2.0.1 in the config file.
\ No newline at end of file
diff --git a/llm_deploy/vllm_test.py b/llm_deploy/vllm_test.py
index 3421dc1d..0ebb79ac 100644
--- a/llm_deploy/vllm_test.py
+++ b/llm_deploy/vllm_test.py
@@ -1,11 +1,11 @@
 from vllm import LLM, SamplingParams
 from vllm.model_executor.adapters import lora
 
-# Create an LLM.
-llm = LLM(model="../llm_ft/vicuna-7b-1.5", gpu_memory_utilization=0.05)
+# Create an LLM; adjust gpu_memory_utilization to fit the available GPU memory.
+llm = LLM(model="../llm_ft/vicuna-7b-1.5", gpu_memory_utilization=0.5)
 
 # Add LoRA adapter
-lora.LoRAModel.from_pretrained(llm.llm_engine.workers[0].model, "../llm_ft/checkpoints/checkpoint-1200")
+lora.LoRAModel.from_pretrained(llm.llm_engine.workers[0].model, "../llm_ft/vicuna_checkpoints/checkpoint-1200")
 
 prompts = [
     "Hello, my name is",
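
For context on the vllm_test.py change above: the patched script creates the LLM, attaches the LoRA adapter from the fork, and then builds a prompts list. Below is a minimal sketch (not part of the patch) of how such a script typically finishes generation, assuming the standard vLLM SamplingParams/generate API; the sampling values are illustrative, not taken from the commit.

# Sketch, assuming the objects (llm, prompts) and the SamplingParams import
# already defined at the top of vllm_test.py.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

# Run batched generation over the prompts.
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    # Each RequestOutput holds the original prompt and its generated completions.
    print(output.prompt, output.outputs[0].text)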