diff --git a/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml b/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml
new file mode 100644
index 00000000..1b85a7e0
--- /dev/null
+++ b/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml
@@ -0,0 +1,41 @@
+deployment_config:
+ autoscaling_config:
+ min_replicas: 1
+ initial_replicas: 1
+ max_replicas: 8
+ target_num_ongoing_requests_per_replica: 24
+ metrics_interval_s: 10.0
+ look_back_period_s: 30.0
+ smoothing_factor: 0.6
+ downscale_delay_s: 300.0
+ upscale_delay_s: 15.0
+ max_concurrent_queries: 64
+ ray_actor_options: {}
+engine_config:
+ model_id: meta-llama/Llama-2-7b-chat-hf
+ hf_model_id: meta-llama/Llama-2-7b-chat-hf
+ type: VLLMEngine
+ engine_kwargs:
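+ # Passed through to vLLM's AsyncEngineArgs; device "cpu" selects vLLM's CPU backend (requires the vllm>=0.4.1 bump in requirements-backend.txt).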
+ device: "cpu"
+ dtype: "float32"
+ trust_remote_code: true
+ max_num_batched_tokens: 4096
+ max_num_seqs: 64
+ gpu_memory_utilization: 0.95
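+ # Note: expected to be ignored when device is "cpu"; the CPU backend sizes its KV cache independently of GPU memory.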
+ max_total_tokens: 4096
+ generation:
+ prompt_format:
+ system: "<>\n{instruction}\n<>\n\n"
+ assistant: " {instruction} "
+ trailing_assistant: ""
+ user: "[INST] {system}{instruction} [/INST]"
+ system_in_user: true
+ default_system_message: ""
+ stopping_sequences: []
+scaling_config:
+ num_workers: 1
+ num_gpus_per_worker: 0
+ num_cpus_per_worker: 32
+ placement_strategy: "STRICT_PACK"
+ resources_per_worker: {}
+
diff --git a/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml b/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml
index 9bfdf5a1..8d0949ce 100644
--- a/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml
+++ b/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml
@@ -10,9 +10,7 @@ deployment_config:
downscale_delay_s: 300.0
upscale_delay_s: 15.0
max_concurrent_queries: 64
- ray_actor_options:
- resources:
- accelerator_type_a10: 0.01
+ ray_actor_options: {}
engine_config:
model_id: meta-llama/Llama-2-7b-chat-hf
hf_model_id: meta-llama/Llama-2-7b-chat-hf
@@ -34,8 +32,8 @@ engine_config:
stopping_sequences: []
scaling_config:
num_workers: 1
- num_gpus_per_worker: 1
+ num_gpus_per_worker: 0
num_cpus_per_worker: 8
placement_strategy: "STRICT_PACK"
- resources_per_worker:
- accelerator_type_a10: 0.01
+ resources_per_worker: {}
+
diff --git a/rayllm/backend/llm/embedding/embedding_models.py b/rayllm/backend/llm/embedding/embedding_models.py
index ce8c1b02..5816b962 100644
--- a/rayllm/backend/llm/embedding/embedding_models.py
+++ b/rayllm/backend/llm/embedding/embedding_models.py
@@ -11,6 +11,7 @@
ModelType,
S3MirrorConfig,
)
+from pydantic import ConfigDict
logger = logging.getLogger(__name__)
@@ -20,8 +21,7 @@ class EmbeddingOptimize(str, Enum):
class EmbeddingEngineConfig(EngineConfig):
- class Config:
- use_enum_values = True
+ model_config = ConfigDict(use_enum_values=True)
type: EngineType = EngineType.EmbeddingEngine
model_type: ModelType = ModelType.embedding
diff --git a/rayllm/backend/llm/trtllm/trtllm_models.py b/rayllm/backend/llm/trtllm/trtllm_models.py
index 6243aed3..79da1d61 100644
--- a/rayllm/backend/llm/trtllm/trtllm_models.py
+++ b/rayllm/backend/llm/trtllm/trtllm_models.py
@@ -10,6 +10,7 @@
S3MirrorConfig,
SamplingParams,
)
+from pydantic import ConfigDict
try:
from tensorrt_llm.libs import trt_llm_engine_py as trt_py
@@ -23,9 +24,7 @@ class TRTLLMGPTServeConfig(BaseModelExtended):
max_tokens_in_paged_kv_cache: int = None
kv_cache_free_gpu_mem_fraction: float = None
enable_trt_overlap: bool = None
-
- class Config:
- arbitrary_types_allowed = True
+ model_config = ConfigDict(arbitrary_types_allowed=True)
@classmethod
def from_engine_config(
diff --git a/rayllm/backend/llm/vllm/vllm_compatibility.py b/rayllm/backend/llm/vllm/vllm_compatibility.py
index b69b1a64..f4dd8fa0 100644
--- a/rayllm/backend/llm/vllm/vllm_compatibility.py
+++ b/rayllm/backend/llm/vllm/vllm_compatibility.py
@@ -1,14 +1,10 @@
import logging
import time
-from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Type, Union
import ray
from ray.util.placement_group import PlacementGroup
from transformers.dynamic_module_utils import init_hf_modules
-from vllm.config import CacheConfig as VllmCacheConfig
-from vllm.config import ModelConfig as VllmModelConfig
-from vllm.config import ParallelConfig as VllmParallelConfig
-from vllm.config import SchedulerConfig as VllmSchedulerConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine, AsyncStream, _AsyncLLMEngine
@@ -32,13 +28,8 @@
logger = logging.getLogger(__name__)
-VllmConfigs = Tuple[
- VllmCacheConfig, VllmModelConfig, VllmParallelConfig, VllmSchedulerConfig
-]
-
-
class AviaryLLMEngine(_AsyncLLMEngine):
- def __init__(self, *args, runtime_env: dict, **kwargs):
+ def __init__(self, *args, runtime_env: dict = {}, **kwargs):
self.runtime_env = runtime_env
super().__init__(*args, **kwargs)
@@ -106,15 +97,17 @@ def _record_system_stats(self, *args, **kwargs):
return last_stats
-def _get_vllm_engine_config(vllm_app) -> Tuple[AsyncEngineArgs, VllmConfigs]:
- # Generate engine arguements and engine configs
+def _get_vllm_engine_config(vllm_app) -> AsyncEngineArgs:
+ device = vllm_app.engine_config.engine_kwargs.get("device", "gpu")
+ # Generate engine arguments
async_engine_args = AsyncEngineArgs(
# This is the local path on disk, or the hf model id
# If it is the hf_model_id, vllm automatically downloads the correct model.
**dict(
model=vllm_app.engine_config.actual_hf_model_id,
- worker_use_ray=True,
+ # vLLM's CPU backend doesn't support Ray workers for model parallelism yet
+ worker_use_ray=(device == "gpu"),
engine_use_ray=False,
tensor_parallel_size=vllm_app.placement_config.world_size,
max_model_len=vllm_app.engine_config.max_total_tokens,
@@ -123,8 +116,8 @@ def _get_vllm_engine_config(vllm_app) -> Tuple[AsyncEngineArgs, VllmConfigs]:
**vllm_app.engine_config.get_initialization_kwargs(),
)
)
- configs = async_engine_args.create_engine_configs()
- return async_engine_args, configs
+
+ return async_engine_args
class AviaryAsyncLLMEngine(AsyncLLMEngine):
@@ -167,23 +160,12 @@ def from_llm_app(
# torch to have access to CUDA devices. We use a remote task
# with `num_gpus` set here, so the type check happens in an environment
# with `CUDA_VISIBLE_DEVICES` set.
- engine_args, engine_configs = ray.get(
+ engine_args = ray.get(
ray.remote(_get_vllm_engine_config)
.options(**scaling_config)
.remote(vllm_app)
)
- # Create the async LLM engine.
- engine = cls(
- engine_args.worker_use_ray,
- engine_args.engine_use_ray,
- *engine_configs,
- None,
- placement_group,
- runtime_env=runtime_env,
- log_requests=not engine_args.disable_log_requests,
- log_stats=not engine_args.disable_log_stats,
- max_log_len=engine_args.max_log_len,
- start_engine_loop=True,
- )
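+ # Let vLLM construct its engine configs from the args instead of building them manually here.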
+ engine = cls.from_engine_args(engine_args, start_engine_loop=True)
+
return engine
diff --git a/rayllm/backend/llm/vllm/vllm_engine.py b/rayllm/backend/llm/vllm/vllm_engine.py
index 79131e6d..617784b6 100644
--- a/rayllm/backend/llm/vllm/vllm_engine.py
+++ b/rayllm/backend/llm/vllm/vllm_engine.py
@@ -59,7 +59,9 @@ def __init__(
self.llm_app = llm_app.copy(deep=True)
self.engine_config = llm_app.engine_config
self.placement_config = llm_app.placement_config
- if not (self.placement_config.scaling_config.num_gpus_per_worker > 0):
+
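+ # Only enforce the GPU requirement when the engine is configured for GPUs (engine_kwargs may select "cpu").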
+ device = self.engine_config.engine_kwargs.get("device", "gpu")
+ if device == "gpu" and self.placement_config.scaling_config.num_gpus_per_worker <= 0:
raise ValueError("The VLLM Engine Requires > 0 GPUs to run.")
self.node_initializer = node_initializer or LLMNodeInitializer(
diff --git a/rayllm/backend/server/config_models.py b/rayllm/backend/server/config_models.py
new file mode 100644
index 00000000..645b1d90
--- /dev/null
+++ b/rayllm/backend/server/config_models.py
@@ -0,0 +1,86 @@
+from typing import Optional
+from pydantic import (
+ BaseModel,
+ Field,
+ NonNegativeFloat,
+ NonNegativeInt,
+ PositiveFloat,
+ PositiveInt,
+ field_validator,
+)
+
+# Adapted from ray.serve.config.AutoscalingConfig
+# Ported here because the original AutoscalingConfig model is pydantic v1.
+class AutoscalingConfig(BaseModel):
+ """Config for the Serve Autoscaler."""
+
+ # Please keep these options in sync with those in
+ # `src/ray/protobuf/serve.proto`.
+
+ # Publicly exposed options
+ min_replicas: NonNegativeInt = 1
+ initial_replicas: Optional[NonNegativeInt] = None
+ max_replicas: PositiveInt = 1
+
+ # DEPRECATED: replaced by target_ongoing_requests
+ target_num_ongoing_requests_per_replica: PositiveFloat = Field(
+ default=1.0,
+ description="[DEPRECATED] Please use `target_ongoing_requests` instead.",
+ )
+ # Will default to 1.0 in the future.
+ target_ongoing_requests: Optional[PositiveFloat] = None
+
+ # How often to scrape for metrics
+ metrics_interval_s: PositiveFloat = 10.0
+ # Time window to average over for metrics.
+ look_back_period_s: PositiveFloat = 30.0
+
+ # DEPRECATED
+ smoothing_factor: PositiveFloat = 1.0
+ # DEPRECATED: replaced by `upscaling_factor`
+ upscale_smoothing_factor: Optional[PositiveFloat] = Field(
+ default=None, description="[DEPRECATED] Please use `upscaling_factor` instead."
+ )
+ # DEPRECATED: replaced by `downscaling_factor`
+ downscale_smoothing_factor: Optional[PositiveFloat] = Field(
+ default=None,
+ description="[DEPRECATED] Please use `downscaling_factor` instead.",
+ )
+
+ # Multiplicative "gain" factor to limit scaling decisions
+ upscaling_factor: Optional[PositiveFloat] = None
+ downscaling_factor: Optional[PositiveFloat] = None
+
+ # How frequently to make autoscaling decisions
+ # loop_period_s: float = CONTROL_LOOP_PERIOD_S
+ # How long to wait before scaling down replicas
+ downscale_delay_s: NonNegativeFloat = 600.0
+ # How long to wait before scaling up replicas
+ upscale_delay_s: NonNegativeFloat = 30.0
+
+ @field_validator("max_replicas")
+ def replicas_settings_valid(cls, max_replicas, values):
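+ # In pydantic v2, "values" is a ValidationInfo object; previously validated fields are available via values.data.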
+ min_replicas = values.data.get("min_replicas")
+ initial_replicas = values.data.get("initial_replicas")
+ if min_replicas is not None and max_replicas < min_replicas:
+ raise ValueError(
+ f"max_replicas ({max_replicas}) must be greater than "
+ f"or equal to min_replicas ({min_replicas})!"
+ )
+
+ if initial_replicas is not None:
+ if initial_replicas < min_replicas:
+ raise ValueError(
+ f"min_replicas ({min_replicas}) must be less than "
+ f"or equal to initial_replicas ({initial_replicas})!"
+ )
+ elif initial_replicas > max_replicas:
+ raise ValueError(
+ f"max_replicas ({max_replicas}) must be greater than "
+ f"or equal to initial_replicas ({initial_replicas})!"
+ )
+
+ return max_replicas
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
diff --git a/rayllm/backend/server/models.py b/rayllm/backend/server/models.py
index dcdf10ff..a56e8086 100644
--- a/rayllm/backend/server/models.py
+++ b/rayllm/backend/server/models.py
@@ -21,16 +21,14 @@
import yaml
from markdown_it import MarkdownIt
from pydantic import (
- BaseModel,
- Extra,
+ ConfigDict, BaseModel,
Field,
PrivateAttr,
- conlist,
- root_validator,
- validator,
+ field_validator,
+ model_validator,
)
from ray.air import ScalingConfig as AIRScalingConfig
-from ray.serve.config import AutoscalingConfig
+from rayllm.backend.server.config_models import AutoscalingConfig
from ray.util.placement_group import (
PlacementGroup,
get_current_placement_group,
@@ -58,6 +56,7 @@
ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT,
MAX_NUM_STOPPING_SEQUENCES,
)
+from typing_extensions import Annotated
T = TypeVar("T")
ModelT = TypeVar("ModelT", bound=BaseModel)
@@ -103,7 +102,7 @@ class BaseModelExtended(BaseModel):
def parse_yaml(cls: Type[ModelT], file, **kwargs) -> ModelT:
kwargs.setdefault("Loader", yaml.SafeLoader)
dict_args = yaml.load(file, **kwargs)
- return cls.parse_obj(dict_args)
+ return cls.model_validate(dict_args)
def yaml(
self,
@@ -123,7 +122,7 @@ def yaml(
"""
return yaml.dump(
json.loads(
- self.json(
+ self.model_dump_json(
include=include,
exclude=exclude,
by_alias=by_alias,
@@ -200,18 +199,18 @@ class AviaryModelResponse(ComputedPropertyMixin, BaseModelExtended):
finish_reason: Optional[str] = None
error: Optional[ErrorResponse] = None
- @root_validator
- def text_or_error_or_finish_reason(cls, values):
+ @model_validator(mode='after')
+ def text_or_error_or_finish_reason(self):
if (
- values.get("generated_text") is None
- and values.get("embedding_outputs") is None
- and values.get("error") is None
- and values.get("finish_reason") is None
+ self.generated_text is None
+ and self.embedding_outputs is None
+ and self.error is None
+ and self.finish_reason is None
):
raise ValueError(
"Either 'generated_text' or 'embedding_outputs' or 'error' or 'finish_reason' must be set"
)
- return values
+ return self
@classmethod
def merge_stream(cls, *responses: "AviaryModelResponse") -> "AviaryModelResponse":
@@ -351,7 +350,7 @@ def unpack(self) -> Tuple["AviaryModelResponse", ...]:
class S3AWSCredentials(BaseModelExtended):
create_aws_credentials_url: str
- auth_token_env_variable: Optional[str]
+ auth_token_env_variable: Optional[str] = None
class ExtraFiles(BaseModelExtended):
@@ -370,7 +369,7 @@ class GCSMirrorConfig(BaseModelExtended):
bucket_uri: str
extra_files: List[ExtraFiles] = Field(default_factory=list)
- @validator("bucket_uri")
+ @field_validator("bucket_uri")
def check_uri_format(cls, value: str):
if not value.startswith("gs://"):
raise ValueError(
@@ -385,11 +384,11 @@ class GenerationConfig(BaseModelExtended):
generate_kwargs: Dict[str, Any] = {}
stopping_sequences: Optional[List[Union[str, int, List[Union[str, int]]]]] = None
- @validator("prompt_format")
+ @field_validator("prompt_format")
def default_prompt_format(cls, prompt_format):
return prompt_format if prompt_format is not None else DisabledPromptFormat()
- @validator("stopping_sequences")
+ @field_validator("stopping_sequences")
def check_stopping_sequences(cls, value):
def try_int(x):
if isinstance(x, list):
@@ -423,15 +422,13 @@ class ModelType(str, Enum):
class EngineConfig(BaseModelExtended):
- class Config:
- use_enum_values = True
- extra = Extra.forbid
+ model_config = ConfigDict(use_enum_values=True, extra='forbid')
model_id: str
hf_model_id: Optional[str] = None
type: EngineType
model_type: ModelType
- tokenizer_id: Optional[str]
+ tokenizer_id: Optional[str] = None
s3_mirror_config: Optional[S3MirrorConfig] = None
gcs_mirror_config: Optional[GCSMirrorConfig] = None
@@ -445,7 +442,7 @@ class Config:
# These will be copied to the runtime env
env_vars_to_propogate: List[str] = list(ENV_VARS_TO_PROPAGATE)
- @validator("gcs_mirror_config")
+ @field_validator("gcs_mirror_config")
def check_only_one_mirror_config_specified(cls, value, values):
gcs_config = value
s3_config = values["s3_mirror_config"]
@@ -489,9 +486,7 @@ def get_vllm_load_s3_path(self) -> Optional[str]:
class SchedulingMetadata(BaseModelExtended):
request_id: Union[str, List[str]]
priority: Union[QueuePriority, List[QueuePriority]]
-
- class Config:
- arbitrary_types_allowed = True
+ model_config = ConfigDict(arbitrary_types_allowed=True)
class SamplingParams(BaseModelExtended):
@@ -540,9 +535,9 @@ class SamplingParams(BaseModelExtended):
def dict(self, **kwargs):
if kwargs.get("exclude", None) is None:
kwargs["exclude"] = self._ignored_fields
- return super().dict(**kwargs)
+ return super().model_dump(**kwargs)
- @validator("stop", always=True)
+ @field_validator("stop")
def validate_stopping_sequences(cls, values):
if not values:
return values
@@ -577,13 +572,13 @@ def merge_generation_params(
generation.stopping_sequences or []
)
- return cls.parse_obj(generate_kwargs)
+ return cls.model_validate(generate_kwargs)
class GenerationRequest(BaseModelExtended):
prompt: Union[str, List[int], List[str]]
request_id: Union[str, List[str]]
- sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]]
+ sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None
scheduling_metadata: Union[SchedulingMetadata, List[SchedulingMetadata]]
@@ -603,7 +598,7 @@ class ChatCompletions(BaseModelExtended):
"""
model: str
- messages: conlist(Message, min_items=1)
+ messages: Annotated[List[Message], Field(min_length=1)]
stream: bool = False
echo: Optional[bool] = False
user: Optional[str] = None
@@ -640,7 +635,7 @@ class Embeddings(BaseModelExtended):
"""
model: str
- input: Union[str, conlist(str, min_items=1)]
+ input: Union[str, Annotated[List[str], Field(min_length=1)]]
user: Optional[str] = None
@@ -652,7 +647,7 @@ class ScalingConfig(BaseModelExtended):
resources_per_worker: Optional[Dict[str, float]] = None
pg_timeout_s: float = 600
- @validator("num_gpus_per_worker")
+ @field_validator("num_gpus_per_worker")
def validate_num_gpus_per_worker(cls, value):
if value > 1:
raise ValueError(
@@ -689,7 +684,7 @@ class ServeMultiplexConfig(BaseModelExtended):
class DeploymentConfig(BaseModelExtended):
- autoscaling_config: Optional[AutoscalingConfig]
+ autoscaling_config: Optional[AutoscalingConfig] = None
max_concurrent_queries: Optional[int] = None
ray_actor_options: Optional[Dict[str, Any]] = None
@@ -706,7 +701,7 @@ def model_id(self):
return self.engine_config.model_id
def short_metadata(self):
- return self.dict(
+ return self.model_dump(
include={
"model_id": True,
"engine_config": {
@@ -787,9 +782,9 @@ class HasModelId(Protocol):
model_id: str
-class ChatCompletionsParams(ChatCompletions, SamplingParams, extra=Extra.allow):
+class ChatCompletionsParams(ChatCompletions, SamplingParams, extra='allow'):
pass
-class CompletionsParams(Completions, SamplingParams, extra=Extra.allow):
+class CompletionsParams(Completions, SamplingParams, extra='allow'):
max_tokens: Optional[int] = 16
diff --git a/rayllm/backend/server/routers/router_app.py b/rayllm/backend/server/routers/router_app.py
index bdcd9fda..6ef6ae30 100644
--- a/rayllm/backend/server/routers/router_app.py
+++ b/rayllm/backend/server/routers/router_app.py
@@ -136,7 +136,7 @@ async def _completions_wrapper(
)
all_results.pop()
had_error = True
- yield AviaryModelResponse.parse_obj(subresult_dict)
+ yield AviaryModelResponse.model_validate(subresult_dict)
# Return early in case of an error
break
choices = [
@@ -213,7 +213,7 @@ async def _chat_completions_wrapper(
subresult_dict["finish_reason"] = None
all_results.pop()
had_error = True
- yield AviaryModelResponse.parse_obj(subresult_dict)
+ yield AviaryModelResponse.model_validate(subresult_dict)
# Return early in case of an error
break
else:
@@ -240,7 +240,7 @@ async def _chat_completions_wrapper(
if subresult_dict["logprobs"]:
logprobs = ChoiceLogProbs(
content=[
- LogProbs.parse_obj(logprob)
+ LogProbs.model_validate(logprob)
for logprob in subresult_dict["logprobs"]
]
)
@@ -469,7 +469,7 @@ async def chat(
logprobs = results.logprobs
if logprobs:
logprobs = ChoiceLogProbs(
- content=[LogProbs.parse_obj(logprob) for logprob in logprobs]
+ content=[LogProbs.model_validate(logprob) for logprob in logprobs]
)
else:
logprobs = None
diff --git a/rayllm/backend/server/run.py b/rayllm/backend/server/run.py
index a80a09aa..072bd119 100644
--- a/rayllm/backend/server/run.py
+++ b/rayllm/backend/server/run.py
@@ -177,7 +177,7 @@ def router_deployment(
def router_application(args):
ray._private.usage.usage_lib.record_library_usage("ray-llm")
- router_args = RouterArgs.parse_obj(args)
+ router_args = RouterArgs.model_validate(args)
vllm_apps = []
embedding_apps = []
trtllm_apps = []
diff --git a/rayllm/backend/server/trtllm/trtllm_deployment.py b/rayllm/backend/server/trtllm/trtllm_deployment.py
index 8b816012..e31af5d9 100644
--- a/rayllm/backend/server/trtllm/trtllm_deployment.py
+++ b/rayllm/backend/server/trtllm/trtllm_deployment.py
@@ -30,7 +30,7 @@ async def stream(
prompt: Prompt,
priority=None,
):
- sample_params = TRTLLMSamplingParams.parse_obj(prompt.parameters)
+ sample_params = TRTLLMSamplingParams.model_validate(prompt.parameters)
logger.info(f"Received request {request_id}")
prompt_text = (
diff --git a/rayllm/backend/server/utils.py b/rayllm/backend/server/utils.py
index 1601c994..5840397a 100644
--- a/rayllm/backend/server/utils.py
+++ b/rayllm/backend/server/utils.py
@@ -71,7 +71,7 @@ def parse_args(
else:
raise
else:
- parsed_models = [llm_app_cls.parse_obj(raw_model)]
+ parsed_models = [llm_app_cls.model_validate(raw_model)]
models += parsed_models
return [model for model in models if model.enabled]
diff --git a/rayllm/common/llm_event.py b/rayllm/common/llm_event.py
index b7fff4f7..4e42d50f 100644
--- a/rayllm/common/llm_event.py
+++ b/rayllm/common/llm_event.py
@@ -20,8 +20,8 @@ class Vote(BaseModel):
class LlmResponse(BaseModel):
model_id: str
text: str
- engine_config: Optional[Dict]
- gen_stats: Optional[Dict]
+ engine_config: Optional[Dict] = None
+ gen_stats: Optional[Dict] = None
class LlmEvent(BaseModel):
@@ -30,7 +30,7 @@ class LlmEvent(BaseModel):
project_name: str
# Identifier for a session
- session_id: Optional[str]
+ session_id: Optional[str] = None
# unique string representing this event
instance_id: str
@@ -41,13 +41,13 @@ class LlmEvent(BaseModel):
# Vote is a dictionary by llm and the votes
# that model got. Typically, this is 1.
- votes: Optional[List[Vote]]
- vote_comments: Optional[Dict[str, str]]
+ votes: Optional[List[Vote]] = None
+ vote_comments: Optional[Dict[str, str]] = None
# Key: llm
# Value: list of flags
- flag: Optional[Dict[str, List[Flag]]]
+ flag: Optional[Dict[str, List[Flag]]] = None
# Key: llm
# Value: Comment for each llm
- flag_comments: Optional[Dict[str, str]]
+ flag_comments: Optional[Dict[str, str]] = None
diff --git a/rayllm/common/models.py b/rayllm/common/models.py
index 939d3990..775eaf14 100644
--- a/rayllm/common/models.py
+++ b/rayllm/common/models.py
@@ -1,7 +1,7 @@
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypeVar, Union
from fastapi import HTTPException, status
-from pydantic import BaseModel, root_validator, validator
+from pydantic import ConfigDict, BaseModel, field_validator, model_validator
if TYPE_CHECKING:
from rayllm.backend.server.models import AviaryModelResponse
@@ -44,7 +44,7 @@ class TextChoice(BaseModel):
text: str
index: int
logprobs: dict
- finish_reason: Optional[str]
+ finish_reason: Optional[str] = None
class Usage(BaseModel):
@@ -79,7 +79,7 @@ class Completion(BaseModel):
created: int
model: str
choices: List[TextChoice]
- usage: Optional[Usage]
+ usage: Optional[Usage] = None
@classmethod
def create(
@@ -113,7 +113,7 @@ class EmbeddingsOutput(BaseModel):
object: str
created: int
model: str
- usage: Optional[EmbeddingsUsage]
+ usage: Optional[EmbeddingsUsage] = None
class FunctionCall(BaseModel):
@@ -127,7 +127,7 @@ class ToolCall(BaseModel):
id: str
def __str__(self):
- return str(self.dict())
+ return str(self.model_dump())
class Function(BaseModel):
@@ -181,37 +181,37 @@ def __str__(self):
# the object
if getattr(self, "tool_calls", None):
return str(self.content)
- return str(self.dict())
+ return str(self.model_dump())
- @root_validator
- def check_fields(cls, values):
- if values["role"] in ["system", "user"]:
- if not isinstance(values.get("content"), str):
+ @model_validator(mode='after')
+ def check_fields(self):
+ if self.role in ["system", "user"]:
+ if not isinstance(self.content, str):
raise ValueError("content must be a string")
- if values["role"] == "tool":
- if not isinstance(values.get("tool_call_id"), str):
+ if self.role == "tool":
+ if not isinstance(self.tool_call_id, str):
raise ValueError("tool_call_id must be a str")
# content should either be a dict with errors or with results
- if not isinstance(values.get("content"), str):
+ if not isinstance(self.content, str):
raise ValueError(
"content must be a str with results or errors for " "the tool call"
)
- if values["role"] == "assistant":
- if values.get("tool_calls"):
+ if self.role == "assistant":
+ if self.tool_calls:
# passing a regular assistant message
- if not isinstance(values.get("tool_calls"), list):
+ if not isinstance(self.tool_calls, list):
raise ValueError("tool_calls must be a list")
- for tool_call in values["tool_calls"]:
+ for tool_call in self.tool_calls:
if not isinstance(tool_call, ToolCall):
raise TypeError("Tool calls must be of type ToolCall")
else:
# passing a regular assistant message
if (
- not isinstance(values.get("content"), str)
- or values.get("content") == ""
+ not isinstance(self.content, str)
+ or self.content == ""
):
raise ValueError("content must be a string or None")
- return values
+ return self
class DeltaRole(BaseModel):
@@ -229,12 +229,11 @@ def __str__(self):
if self.tool_calls:
return str(self.tool_calls)
else:
- return str(self.dict())
+ return str(self.model_dump())
class DeltaEOS(BaseModel):
- class Config:
- extra = "forbid"
+ model_config = ConfigDict(extra="forbid")
class ChoiceLogProbs(BaseModel):
@@ -251,7 +250,7 @@ class MessageChoices(BaseModel):
class DeltaChoices(BaseModel):
delta: Union[DeltaRole, DeltaContent, DeltaEOS]
index: int
- finish_reason: Optional[str]
+ finish_reason: Optional[str] = None
logprobs: Optional[ChoiceLogProbs] = None
@@ -261,7 +260,7 @@ class ChatCompletion(BaseModel):
created: int
model: str
choices: List[Union[MessageChoices, DeltaChoices]]
- usage: Optional[Usage]
+ usage: Optional[Usage] = None
@classmethod
def create(
@@ -289,7 +288,7 @@ class Prompt(BaseModel):
tools: Optional[List[Tool]] = None
tool_choice: Union[Literal["auto", "none"], ToolChoice] = "auto"
- @validator("prompt")
+ @field_validator("prompt")
def check_prompt(cls, value):
if isinstance(value, list) and not value:
raise ValueError("Messages cannot be an empty list.")
@@ -320,8 +319,7 @@ class ErrorResponse(BaseModel):
class AbstractPromptFormat(BaseModel):
- class Config:
- extra = "forbid"
+ model_config = ConfigDict(extra="forbid")
def generate_prompt(self, messages: Union[Prompt, List[Message]]) -> str:
raise NotImplementedError()
@@ -338,34 +336,34 @@ class PromptFormat(AbstractPromptFormat):
add_system_tags_even_if_message_is_empty: bool = False
strip_whitespace: bool = True
- @validator("system")
+ @field_validator("system")
def check_system(cls, value):
assert value and (
"{instruction}" in value
), "system must be a string containing '{instruction}'"
return value
- @validator("assistant")
+ @field_validator("assistant")
def check_assistant(cls, value):
assert (
value and "{instruction}" in value
), "assistant must be a string containing '{instruction}'"
return value
- @validator("user")
+ @field_validator("user")
def check_user(cls, value):
assert value and (
"{instruction}" in value
), "user must be a string containing '{instruction}'"
return value
- @root_validator
- def check_user_system_in_user(cls, values):
- if values["system_in_user"]:
+ @model_validator(mode='after')
+ def check_user_system_in_user(self):
+ if self.system_in_user:
assert (
- "{system}" in values["user"]
+ "{system}" in self.user
), "If system_in_user=True, user must contain '{system}'"
- return values
+ return self
def generate_prompt(self, messages: Union[Prompt, List[Message]]) -> str:
if isinstance(messages, Prompt):
diff --git a/requirements-backend.txt b/requirements-backend.txt
index c2e792c4..890e74f1 100644
--- a/requirements-backend.txt
+++ b/requirements-backend.txt
@@ -11,16 +11,17 @@ Jinja2
numexpr>=2.7.3
hf_transfer
evaluate
-vllm>=0.2.0,<0.2.6
+vllm>=0.4.1
numpy<1.24
ninja
protobuf<3.21.0
safetensors
-pydantic~=1.10.0
+pydantic>=2.0
einops
markdown-it-py[plugins]
fastapi-versioning
scipy
+redis
opentelemetry-api>=1.15.0
opentelemetry-sdk
opentelemetry-exporter-otlp
diff --git a/requirements.txt b/requirements.txt
index 7a08198f..72519891 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ boto3
pydantic
typer>=0.9
rich
-typing_extensions~=4.5.0
+# typing_extensions~=4.5.0
+typing_extensions
requests
openai
diff --git a/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml b/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml
new file mode 100644
index 00000000..5a597802
--- /dev/null
+++ b/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml
@@ -0,0 +1,7 @@
+applications:
+- name: ray-llm
+ route_prefix: /
+ import_path: rayllm.backend:router_application
+ args:
+ models:
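+ # Deploys the CPU Llama-2 engine config added under models/continuous_batching/cpu/.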
+ - "./models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml"