From 1af090b57d0e23d268e79941f8084bf0a8ad8621 Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Wed, 31 Jan 2024 00:07:07 -0800
Subject: [PATCH] Bump up version to v0.3.0 (#2656)

---
 README.md             | 4 +++-
 docs/source/index.rst | 4 +++-
 vllm/__init__.py      | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c0d267b2cbbf3..3853760613833 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with **PagedAttention**
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
-- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629)
+- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
 - Optimized CUDA kernels
 
 vLLM is flexible and easy to use with:
@@ -57,6 +57,8 @@ vLLM is flexible and easy to use with:
 - Streaming outputs
 - OpenAI-compatible API server
 - Support NVIDIA GPUs and AMD GPUs
+- (Experimental) Prefix caching support
+- (Experimental) Multi-lora support
 
 vLLM seamlessly supports many Hugging Face models, including the following architectures:
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 321f855645bb8..3e2331907f0f2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -31,7 +31,7 @@ vLLM is fast with:
 * Efficient management of attention key and value memory with **PagedAttention**
 * Continuous batching of incoming requests
 * Fast model execution with CUDA/HIP graph
-* Quantization: `GPTQ <https://arxiv.org/abs/2210.17323>`_, `AWQ <https://arxiv.org/abs/2306.00978>`_, `SqueezeLLM <https://arxiv.org/abs/2306.07629>`_
+* Quantization: `GPTQ <https://arxiv.org/abs/2210.17323>`_, `AWQ <https://arxiv.org/abs/2306.00978>`_, `SqueezeLLM <https://arxiv.org/abs/2306.07629>`_, FP8 KV Cache
 * Optimized CUDA kernels
 
 vLLM is flexible and easy to use with:
@@ -42,6 +42,8 @@ vLLM is flexible and easy to use with:
 * Streaming outputs
 * OpenAI-compatible API server
 * Support NVIDIA GPUs and AMD GPUs
+* (Experimental) Prefix caching support
+* (Experimental) Multi-lora support
 
 For more information, check out the following:
 
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 327dfad06352c..36d177f5942e7 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -8,7 +8,7 @@
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import SamplingParams
 
-__version__ = "0.2.7"
+__version__ = "0.3.0"
 
 __all__ = [
     "LLM",
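
Below is a minimal usage sketch, not part of the patch, showing how the public names re-exported in vllm/__init__.py (LLM, SamplingParams, RequestOutput, CompletionOutput) are typically exercised after upgrading to v0.3.0. The model name "facebook/opt-125m", the prompts, and the sampling settings are illustrative assumptions, not values taken from this diff.

import vllm
from vllm import LLM, SamplingParams

# The version string bumped by this patch.
assert vllm.__version__ == "0.3.0"

# Illustrative prompts and sampling settings (assumed values).
prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# "facebook/opt-125m" is an assumed placeholder model, not prescribed by the patch.
llm = LLM(model="facebook/opt-125m")

# generate() returns a list of RequestOutput objects; each one carries the
# original prompt plus one or more CompletionOutput entries with generated text.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)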