diff --git a/backend/backend.proto b/backend/backend.proto
index 48b0101b4b29..0a341ca2a9ed 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -242,6 +242,9 @@ message ModelOptions {
   repeated float LoraScales = 61;
 
   repeated string Options = 62;
+
+  string CacheTypeKey = 63;
+  string CacheTypeValue = 64;
 }
 
 message Result {
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 0fde74cbd3a6..ea5c4e34a465 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2241,6 +2241,12 @@ static void params_parse(const backend::ModelOptions* request,
     }
     // params.model_alias ??
     params.model_alias =  request->modelfile();
+    if (!request->cachetypekey().empty()) {
+        params.cache_type_k = request->cachetypekey();
+    }
+    if (!request->cachetypevalue().empty()) {
+        params.cache_type_v = request->cachetypevalue();
+    }
     params.n_ctx = request->contextsize();
     //params.memory_f16 = request->f16memory();
     params.cpuparams.n_threads = request->threads();
diff --git a/core/backend/options.go b/core/backend/options.go
index 1f88122fc2b2..f6247c605668 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -151,6 +151,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		TensorParallelSize: int32(c.TensorParallelSize),
 		MMProj:             c.MMProj,
 		FlashAttention:     c.FlashAttention,
+		CacheTypeKey:       c.CacheTypeK,
+		CacheTypeValue:     c.CacheTypeV,
 		NoKVOffload:        c.NoKVOffloading,
 		YarnExtFactor:      c.YarnExtFactor,
 		YarnAttnFactor:     c.YarnAttnFactor,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index 1de540f94382..0ff347699932 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -155,8 +155,10 @@ type LLMConfig struct {
 	TensorParallelSize int    `yaml:"tensor_parallel_size"` // vLLM
 	MMProj             string `yaml:"mmproj"`
 
-	FlashAttention bool `yaml:"flash_attention"`
-	NoKVOffloading bool `yaml:"no_kv_offloading"`
+	FlashAttention bool   `yaml:"flash_attention"`
+	NoKVOffloading bool   `yaml:"no_kv_offloading"`
+	CacheTypeK     string `yaml:"cache_type_k"`
+	CacheTypeV     string `yaml:"cache_type_v"`
 
 	RopeScaling string `yaml:"rope_scaling"`
 	ModelType   string `yaml:"type"`