diff --git a/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml b/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml
new file mode 100644
index 00000000..1b85a7e0
--- /dev/null
+++ b/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml
@@ -0,0 +1,41 @@
+deployment_config:
+  autoscaling_config:
+    min_replicas: 1
+    initial_replicas: 1
+    max_replicas: 8
+    target_num_ongoing_requests_per_replica: 24
+    metrics_interval_s: 10.0
+    look_back_period_s: 30.0
+    smoothing_factor: 0.6
+    downscale_delay_s: 300.0
+    upscale_delay_s: 15.0
+  max_concurrent_queries: 64
+  ray_actor_options: {}
+engine_config:
+  model_id: meta-llama/Llama-2-7b-chat-hf
+  hf_model_id: meta-llama/Llama-2-7b-chat-hf
+  type: VLLMEngine
+  engine_kwargs:
+    device: "cpu"
+    dtype: "float32"
+    trust_remote_code: true
+    max_num_batched_tokens: 4096
+    max_num_seqs: 64
+    gpu_memory_utilization: 0.95
+  max_total_tokens: 4096
+  generation:
+    prompt_format:
+      system: "<<SYS>>\n{instruction}\n<</SYS>>\n\n"
+      assistant: " {instruction} </s><s>"
+      trailing_assistant: ""
+      user: "[INST] {system}{instruction} [/INST]"
+      system_in_user: true
+      default_system_message: ""
+    stopping_sequences: []
+scaling_config:
+  num_workers: 1
+  num_gpus_per_worker: 0
+  num_cpus_per_worker: 32
+  placement_strategy: "STRICT_PACK"
+  resources_per_worker: {}
+
diff --git a/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml b/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml
index 9bfdf5a1..8d0949ce 100644
--- a/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml
+++ b/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml
@@ -10,9 +10,7 @@ deployment_config:
     downscale_delay_s: 300.0
     upscale_delay_s: 15.0
   max_concurrent_queries: 64
-  ray_actor_options:
-    resources:
-      accelerator_type_a10: 0.01
+  ray_actor_options: {}
 engine_config:
   model_id: meta-llama/Llama-2-7b-chat-hf
   hf_model_id: meta-llama/Llama-2-7b-chat-hf
@@ -34,8 +32,8 @@ engine_config:
   stopping_sequences: []
 scaling_config:
   num_workers: 1
-  num_gpus_per_worker: 1
+  num_gpus_per_worker: 0
   num_cpus_per_worker: 8
   placement_strategy: "STRICT_PACK"
-  resources_per_worker:
-    accelerator_type_a10: 0.01
+  resources_per_worker: {}
+
diff --git a/rayllm/backend/llm/embedding/embedding_models.py b/rayllm/backend/llm/embedding/embedding_models.py
index ce8c1b02..5816b962 100644
--- a/rayllm/backend/llm/embedding/embedding_models.py
+++ b/rayllm/backend/llm/embedding/embedding_models.py
@@ -11,6 +11,7 @@
     ModelType,
     S3MirrorConfig,
 )
+from pydantic import ConfigDict
 
 logger = logging.getLogger(__name__)
 
@@ -20,8 +21,7 @@ class EmbeddingOptimize(str, Enum):
 
 
 class EmbeddingEngineConfig(EngineConfig):
-    class Config:
-        use_enum_values = True
+    model_config = ConfigDict(use_enum_values=True)
 
     type: EngineType = EngineType.EmbeddingEngine
     model_type: ModelType = ModelType.embedding
diff --git a/rayllm/backend/llm/trtllm/trtllm_models.py b/rayllm/backend/llm/trtllm/trtllm_models.py
index 6243aed3..79da1d61 100644
--- a/rayllm/backend/llm/trtllm/trtllm_models.py
+++ b/rayllm/backend/llm/trtllm/trtllm_models.py
@@ -10,6 +10,7 @@
     S3MirrorConfig,
     SamplingParams,
 )
+from pydantic import ConfigDict
 
 try:
     from tensorrt_llm.libs import trt_llm_engine_py as trt_py
@@ -23,9 +24,7 @@ class TRTLLMGPTServeConfig(BaseModelExtended):
     max_tokens_in_paged_kv_cache: int = None
     kv_cache_free_gpu_mem_fraction: float = None
     enable_trt_overlap: bool = None
-
-    class Config:
-        arbitrary_types_allowed = True
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
     @classmethod
     def from_engine_config(
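The two config hunks above, and several below, apply the same pydantic v1 to v2 mechanical translation. A minimal standalone sketch of the pattern (illustrative only, not part of the patch; ExampleConfigV1/V2 are made-up models):

    from pydantic import BaseModel, ConfigDict


    class ExampleConfigV1(BaseModel):
        # pydantic v1 style: options live on a nested Config class
        class Config:
            use_enum_values = True
            arbitrary_types_allowed = True


    class ExampleConfigV2(BaseModel):
        # pydantic v2 style: the same options move into a ConfigDict bound to
        # the reserved `model_config` attribute
        model_config = ConfigDict(use_enum_values=True, arbitrary_types_allowed=True)
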
diff --git a/rayllm/backend/llm/vllm/vllm_compatibility.py b/rayllm/backend/llm/vllm/vllm_compatibility.py
index b69b1a64..f4dd8fa0 100644
--- a/rayllm/backend/llm/vllm/vllm_compatibility.py
+++ b/rayllm/backend/llm/vllm/vllm_compatibility.py
@@ -1,14 +1,10 @@
 import logging
 import time
-from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Type, Union
 
 import ray
 from ray.util.placement_group import PlacementGroup
 from transformers.dynamic_module_utils import init_hf_modules
-from vllm.config import CacheConfig as VllmCacheConfig
-from vllm.config import ModelConfig as VllmModelConfig
-from vllm.config import ParallelConfig as VllmParallelConfig
-from vllm.config import SchedulerConfig as VllmSchedulerConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine, AsyncStream, _AsyncLLMEngine
 
@@ -32,13 +28,8 @@
 
 logger = logging.getLogger(__name__)
 
-VllmConfigs = Tuple[
-    VllmCacheConfig, VllmModelConfig, VllmParallelConfig, VllmSchedulerConfig
-]
-
-
 class AviaryLLMEngine(_AsyncLLMEngine):
-    def __init__(self, *args, runtime_env: dict, **kwargs):
+    def __init__(self, *args, runtime_env: dict = {}, **kwargs):
         self.runtime_env = runtime_env
         super().__init__(*args, **kwargs)
 
@@ -106,15 +97,17 @@ def _record_system_stats(self, *args, **kwargs):
         return last_stats
 
 
-def _get_vllm_engine_config(vllm_app) -> Tuple[AsyncEngineArgs, VllmConfigs]:
-    # Generate engine arguements and engine configs
+def _get_vllm_engine_config(vllm_app) -> AsyncEngineArgs:
+    device = vllm_app.engine_config.engine_kwargs.get("device", "gpu")
+    # Generate engine arguments and engine configs
     async_engine_args = AsyncEngineArgs(
         # This is the local path on disk, or the hf model id
         # If it is the hf_model_id, vllm automatically downloads the correct model.
         **dict(
             model=vllm_app.engine_config.actual_hf_model_id,
-            worker_use_ray=True,
+            # vLLM for CPU doesn't support Ray workers for model parallelism yet
+            worker_use_ray=True if device == "gpu" else False,
             engine_use_ray=False,
             tensor_parallel_size=vllm_app.placement_config.world_size,
             max_model_len=vllm_app.engine_config.max_total_tokens,
@@ -123,8 +116,8 @@ def _get_vllm_engine_config(vllm_app) -> Tuple[AsyncEngineArgs, VllmConfigs]:
             **vllm_app.engine_config.get_initialization_kwargs(),
         )
     )
-    configs = async_engine_args.create_engine_configs()
-    return async_engine_args, configs
+
+    return async_engine_args
 
 
 class AviaryAsyncLLMEngine(AsyncLLMEngine):
@@ -167,23 +160,12 @@ def from_llm_app(
         # torch to have access to CUDA devices. We use a remote task
         # with `num_gpus` set here, so the type check happens in an environment
         # with `CUDA_VISIBLE_DEVICES` set.
-        engine_args, engine_configs = ray.get(
+        engine_args = ray.get(
            ray.remote(_get_vllm_engine_config)
            .options(**scaling_config)
            .remote(vllm_app)
        )
 
-        # Create the async LLM engine.
-        engine = cls(
-            engine_args.worker_use_ray,
-            engine_args.engine_use_ray,
-            *engine_configs,
-            None,
-            placement_group,
-            runtime_env=runtime_env,
-            log_requests=not engine_args.disable_log_requests,
-            log_stats=not engine_args.disable_log_stats,
-            max_log_len=engine_args.max_log_len,
-            start_engine_loop=True,
-        )
+        engine = cls.from_engine_args(engine_args, start_engine_loop=True)
+
         return engine
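Roughly, the construction path above now defers to vLLM's own factory. A sketch of the equivalent standalone calls for the CPU model config added earlier (assumes the installed vLLM build accepts the `device` and `dtype` engine arguments that the YAML passes through `engine_kwargs`):

    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine

    # Mirrors what _get_vllm_engine_config() assembles for the CPU YAML:
    # no Ray workers for model parallelism, a single tensor-parallel rank.
    engine_args = AsyncEngineArgs(
        model="meta-llama/Llama-2-7b-chat-hf",
        worker_use_ray=False,  # device == "cpu"
        engine_use_ray=False,
        tensor_parallel_size=1,
        max_model_len=4096,
        device="cpu",
        dtype="float32",
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args, start_engine_loop=True)
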
diff --git a/rayllm/backend/llm/vllm/vllm_engine.py b/rayllm/backend/llm/vllm/vllm_engine.py
index 79131e6d..617784b6 100644
--- a/rayllm/backend/llm/vllm/vllm_engine.py
+++ b/rayllm/backend/llm/vllm/vllm_engine.py
@@ -59,7 +59,9 @@ def __init__(
         self.llm_app = llm_app.copy(deep=True)
         self.engine_config = llm_app.engine_config
         self.placement_config = llm_app.placement_config
-        if not (self.placement_config.scaling_config.num_gpus_per_worker > 0):
+
+        device = self.engine_config.engine_kwargs.get("device", "gpu")
+        if device == "gpu" and not (self.placement_config.scaling_config.num_gpus_per_worker > 0):
             raise ValueError("The VLLM Engine Requires > 0 GPUs to run.")
 
         self.node_initializer = node_initializer or LLMNodeInitializer(
diff --git a/rayllm/backend/server/config_models.py b/rayllm/backend/server/config_models.py
new file mode 100644
index 00000000..645b1d90
--- /dev/null
+++ b/rayllm/backend/server/config_models.py
@@ -0,0 +1,86 @@
+from typing import Optional
+from pydantic import (
+    BaseModel,
+    Field,
+    NonNegativeFloat,
+    NonNegativeInt,
+    PositiveFloat,
+    PositiveInt,
+    field_validator,
+)
+
+# Adapted from ray.serve.config.AutoscalingConfig
+# Ported here because the original AutoscalingConfig model is pydantic v1
+class AutoscalingConfig(BaseModel):
+    """Config for the Serve Autoscaler."""
+
+    # Please keep these options in sync with those in
+    # `src/ray/protobuf/serve.proto`.
+
+    # Publicly exposed options
+    min_replicas: NonNegativeInt = 1
+    initial_replicas: Optional[NonNegativeInt] = None
+    max_replicas: PositiveInt = 1
+
+    # DEPRECATED: replaced by target_ongoing_requests
+    target_num_ongoing_requests_per_replica: PositiveFloat = Field(
+        default=1.0,
+        description="[DEPRECATED] Please use `target_ongoing_requests` instead.",
+    )
+    # Will default to 1.0 in the future.
+    target_ongoing_requests: Optional[PositiveFloat] = None
+
+    # How often to scrape for metrics
+    metrics_interval_s: PositiveFloat = 10.0
+    # Time window to average over for metrics.
+    look_back_period_s: PositiveFloat = 30.0
+
+    # DEPRECATED
+    smoothing_factor: PositiveFloat = 1.0
+    # DEPRECATED: replaced by `upscaling_factor`
+    upscale_smoothing_factor: Optional[PositiveFloat] = Field(
+        default=None, description="[DEPRECATED] Please use `upscaling_factor` instead."
+    )
+    # DEPRECATED: replaced by `downscaling_factor`
+    downscale_smoothing_factor: Optional[PositiveFloat] = Field(
+        default=None,
+        description="[DEPRECATED] Please use `downscaling_factor` instead.",
+    )
+
+    # Multiplicative "gain" factor to limit scaling decisions
+    upscaling_factor: Optional[PositiveFloat] = None
+    downscaling_factor: Optional[PositiveFloat] = None
+
+    # How frequently to make autoscaling decisions
+    # loop_period_s: float = CONTROL_LOOP_PERIOD_S
+    # How long to wait before scaling down replicas
+    downscale_delay_s: NonNegativeFloat = 600.0
+    # How long to wait before scaling up replicas
+    upscale_delay_s: NonNegativeFloat = 30.0
+
+    @field_validator("max_replicas")
+    def replicas_settings_valid(cls, max_replicas, values):
+        min_replicas = values.data.get("min_replicas")
+        initial_replicas = values.data.get("initial_replicas")
+        if min_replicas is not None and max_replicas < min_replicas:
+            raise ValueError(
+                f"max_replicas ({max_replicas}) must be greater than "
+                f"or equal to min_replicas ({min_replicas})!"
+            )
+
+        if initial_replicas is not None:
+            if initial_replicas < min_replicas:
+                raise ValueError(
+                    f"min_replicas ({min_replicas}) must be less than "
+                    f"or equal to initial_replicas ({initial_replicas})!"
+                )
+            elif initial_replicas > max_replicas:
+                raise ValueError(
+                    f"max_replicas ({max_replicas}) must be greater than "
+                    f"or equal to initial_replicas ({initial_replicas})!"
+                )
+
+        return max_replicas
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
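A quick illustration of how the ported model behaves (a sketch assuming pydantic v2 is installed; the values mirror the autoscaling block of the CPU model YAML above):

    from pydantic import ValidationError

    from rayllm.backend.server.config_models import AutoscalingConfig

    config = AutoscalingConfig(
        min_replicas=1,
        initial_replicas=1,
        max_replicas=8,
        target_num_ongoing_requests_per_replica=24,
    )
    print(config.max_replicas)  # 8

    # replicas_settings_valid() rejects inconsistent replica bounds.
    try:
        AutoscalingConfig(min_replicas=4, max_replicas=2)
    except ValidationError as exc:
        print(exc)  # max_replicas (2) must be greater than or equal to min_replicas (4)!
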
diff --git a/rayllm/backend/server/models.py b/rayllm/backend/server/models.py
index dcdf10ff..a56e8086 100644
--- a/rayllm/backend/server/models.py
+++ b/rayllm/backend/server/models.py
@@ -21,16 +21,14 @@
 import yaml
 from markdown_it import MarkdownIt
 from pydantic import (
-    BaseModel,
-    Extra,
+    ConfigDict, BaseModel,
     Field,
     PrivateAttr,
-    conlist,
-    root_validator,
-    validator,
+    field_validator,
+    model_validator,
 )
 from ray.air import ScalingConfig as AIRScalingConfig
-from ray.serve.config import AutoscalingConfig
+from rayllm.backend.server.config_models import AutoscalingConfig
 from ray.util.placement_group import (
     PlacementGroup,
     get_current_placement_group,
@@ -58,6 +56,7 @@
     ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT,
     MAX_NUM_STOPPING_SEQUENCES,
 )
+from typing_extensions import Annotated
 
 T = TypeVar("T")
 ModelT = TypeVar("ModelT", bound=BaseModel)
@@ -103,7 +102,7 @@ class BaseModelExtended(BaseModel):
     def parse_yaml(cls: Type[ModelT], file, **kwargs) -> ModelT:
         kwargs.setdefault("Loader", yaml.SafeLoader)
         dict_args = yaml.load(file, **kwargs)
-        return cls.parse_obj(dict_args)
+        return cls.model_validate(dict_args)
 
     def yaml(
         self,
@@ -123,7 +122,7 @@ def yaml(
         """
         return yaml.dump(
             json.loads(
-                self.json(
+                self.model_dump_json(
                     include=include,
                     exclude=exclude,
                     by_alias=by_alias,
@@ -200,18 +199,18 @@ class AviaryModelResponse(ComputedPropertyMixin, BaseModelExtended):
     finish_reason: Optional[str] = None
     error: Optional[ErrorResponse] = None
 
-    @root_validator
-    def text_or_error_or_finish_reason(cls, values):
+    @model_validator(mode='after')
+    def text_or_error_or_finish_reason(self):
         if (
-            values.get("generated_text") is None
-            and values.get("embedding_outputs") is None
-            and values.get("error") is None
-            and values.get("finish_reason") is None
+            self.generated_text is None
+            and self.embedding_outputs is None
+            and self.error is None
+            and self.finish_reason is None
         ):
             raise ValueError(
                 "Either 'generated_text' or 'embedding_outputs' or 'error' or 'finish_reason' must be set"
             )
-        return values
+        return self
 
     @classmethod
     def merge_stream(cls, *responses: "AviaryModelResponse") -> "AviaryModelResponse":
@@ -351,7 +350,7 @@ def unpack(self) -> Tuple["AviaryModelResponse", ...]:
 
 class S3AWSCredentials(BaseModelExtended):
     create_aws_credentials_url: str
-    auth_token_env_variable: Optional[str]
+    auth_token_env_variable: Optional[str] = None
 
 
 class ExtraFiles(BaseModelExtended):
@@ -370,7 +369,7 @@ class GCSMirrorConfig(BaseModelExtended):
     bucket_uri: str
     extra_files: List[ExtraFiles] = Field(default_factory=list)
 
-    @validator("bucket_uri")
+    @field_validator("bucket_uri")
     def check_uri_format(cls, value: str):
         if not value.startswith("gs://"):
             raise ValueError(
@@ -385,11 +384,11 @@ class GenerationConfig(BaseModelExtended):
     generate_kwargs: Dict[str, Any] = {}
     stopping_sequences: Optional[List[Union[str, int, List[Union[str, int]]]]] = None
 
-    @validator("prompt_format")
+    @field_validator("prompt_format")
     def default_prompt_format(cls, prompt_format):
         return prompt_format if prompt_format is not None else DisabledPromptFormat()
 
-    @validator("stopping_sequences")
+    @field_validator("stopping_sequences")
     def check_stopping_sequences(cls, value):
         def try_int(x):
             if isinstance(x, list):
@@ -423,15 +422,13 @@ class ModelType(str, Enum):
 
 
 class EngineConfig(BaseModelExtended):
-    class Config:
-        use_enum_values = True
-        extra = Extra.forbid
+    model_config = ConfigDict(use_enum_values=True, extra='forbid')
 
     model_id: str
     hf_model_id: Optional[str] = None
     type: EngineType
     model_type: ModelType
-    tokenizer_id: Optional[str]
+    tokenizer_id: Optional[str] = None
 
     s3_mirror_config: Optional[S3MirrorConfig] = None
     gcs_mirror_config: Optional[GCSMirrorConfig] = None
@@ -445,7 +442,7 @@ class Config:
     # These will be copied to the runtime env
     env_vars_to_propogate: List[str] = list(ENV_VARS_TO_PROPAGATE)
 
-    @validator("gcs_mirror_config")
+    @field_validator("gcs_mirror_config")
     def check_only_one_mirror_config_specified(cls, value, values):
         gcs_config = value
         s3_config = values["s3_mirror_config"]
@@ -489,9 +486,7 @@ def get_vllm_load_s3_path(self) -> Optional[str]:
 class SchedulingMetadata(BaseModelExtended):
     request_id: Union[str, List[str]]
     priority: Union[QueuePriority, List[QueuePriority]]
-
-    class Config:
-        arbitrary_types_allowed = True
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
 
 class SamplingParams(BaseModelExtended):
@@ -540,9 +535,9 @@ class SamplingParams(BaseModelExtended):
     def dict(self, **kwargs):
         if kwargs.get("exclude", None) is None:
             kwargs["exclude"] = self._ignored_fields
-        return super().dict(**kwargs)
+        return super().model_dump(**kwargs)
 
-    @validator("stop", always=True)
+    @field_validator("stop")
     def validate_stopping_sequences(cls, values):
         if not values:
             return values
@@ -577,13 +572,13 @@ def merge_generation_params(
             generation.stopping_sequences or []
         )
 
-        return cls.parse_obj(generate_kwargs)
+        return cls.model_validate(generate_kwargs)
 
 
 class GenerationRequest(BaseModelExtended):
     prompt: Union[str, List[int], List[str]]
     request_id: Union[str, List[str]]
-    sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]]
+    sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None
     scheduling_metadata: Union[SchedulingMetadata, List[SchedulingMetadata]]
 
 
@@ -603,7 +598,7 @@ class ChatCompletions(BaseModelExtended):
     """
 
     model: str
-    messages: conlist(Message, min_items=1)
+    messages: Annotated[List[Message], Field(min_length=1)]
     stream: bool = False
     echo: Optional[bool] = False
     user: Optional[str] = None
@@ -640,7 +635,7 @@ class Embeddings(BaseModelExtended):
     """
 
     model: str
-    input: Union[str, conlist(str, min_items=1)]
+    input: Union[str, Annotated[List[str], Field(min_length=1)]]
     user: Optional[str] = None
 
 
@@ -652,7 +647,7 @@ class ScalingConfig(BaseModelExtended):
     resources_per_worker: Optional[Dict[str, float]] = None
     pg_timeout_s: float = 600
 
-    @validator("num_gpus_per_worker")
+    @field_validator("num_gpus_per_worker")
     def validate_num_gpus_per_worker(cls, value):
         if value > 1:
             raise ValueError(
@@ -689,7 +684,7 @@ class ServeMultiplexConfig(BaseModelExtended):
 
 
 class DeploymentConfig(BaseModelExtended):
-    autoscaling_config: Optional[AutoscalingConfig]
+    autoscaling_config: Optional[AutoscalingConfig] = None
     max_concurrent_queries: Optional[int] = None
     ray_actor_options: Optional[Dict[str, Any]] = None
 
@@ -706,7 +701,7 @@ def model_id(self):
         return self.engine_config.model_id
 
     def short_metadata(self):
-        return self.dict(
+        return self.model_dump(
             include={
                 "model_id": True,
                 "engine_config": {
@@ -787,9 +782,9 @@ class HasModelId(Protocol):
     model_id: str
 
 
-class ChatCompletionsParams(ChatCompletions, SamplingParams, extra=Extra.allow):
+class ChatCompletionsParams(ChatCompletions, SamplingParams, extra='allow'):
     pass
 
 
-class CompletionsParams(Completions, SamplingParams, extra=Extra.allow):
+class CompletionsParams(Completions, SamplingParams, extra='allow'):
     max_tokens: Optional[int] = 16
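The recurring v1 to v2 substitutions in the file above, condensed onto one toy model (illustrative only; ToyRequest is not part of the codebase):

    from typing import List, Optional

    from pydantic import BaseModel, Field, field_validator, model_validator
    from typing_extensions import Annotated


    class ToyRequest(BaseModel):
        # conlist(str, min_items=1) -> Annotated[List[str], Field(min_length=1)]
        messages: Annotated[List[str], Field(min_length=1)]
        # pydantic v2 no longer treats bare Optional fields as defaulting to None
        user: Optional[str] = None

        # @validator(...) -> @field_validator(...)
        @field_validator("messages")
        def strip_messages(cls, value):
            return [m.strip() for m in value]

        # @root_validator -> @model_validator(mode="after"), which receives self
        @model_validator(mode="after")
        def check_not_all_blank(self):
            if not any(self.messages):
                raise ValueError("at least one non-empty message is required")
            return self


    print(ToyRequest(messages=[" hi "]).messages)  # ['hi']
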
diff --git a/rayllm/backend/server/routers/router_app.py b/rayllm/backend/server/routers/router_app.py
index bdcd9fda..6ef6ae30 100644
--- a/rayllm/backend/server/routers/router_app.py
+++ b/rayllm/backend/server/routers/router_app.py
@@ -136,7 +136,7 @@ async def _completions_wrapper(
                     )
                     all_results.pop()
                     had_error = True
-                    yield AviaryModelResponse.parse_obj(subresult_dict)
+                    yield AviaryModelResponse.model_validate(subresult_dict)
                     # Return early in case of an error
                     break
             choices = [
@@ -213,7 +213,7 @@ async def _chat_completions_wrapper(
                 subresult_dict["finish_reason"] = None
                 all_results.pop()
                 had_error = True
-                yield AviaryModelResponse.parse_obj(subresult_dict)
+                yield AviaryModelResponse.model_validate(subresult_dict)
                 # Return early in case of an error
                 break
             else:
@@ -240,7 +240,7 @@ async def _chat_completions_wrapper(
             if subresult_dict["logprobs"]:
                 logprobs = ChoiceLogProbs(
                     content=[
-                        LogProbs.parse_obj(logprob)
+                        LogProbs.model_validate(logprob)
                         for logprob in subresult_dict["logprobs"]
                     ]
                 )
@@ -469,7 +469,7 @@ async def chat(
             logprobs = results.logprobs
             if logprobs:
                 logprobs = ChoiceLogProbs(
-                    content=[LogProbs.parse_obj(logprob) for logprob in logprobs]
+                    content=[LogProbs.model_validate(logprob) for logprob in logprobs]
                 )
             else:
                 logprobs = None
diff --git a/rayllm/backend/server/run.py b/rayllm/backend/server/run.py
index a80a09aa..072bd119 100644
--- a/rayllm/backend/server/run.py
+++ b/rayllm/backend/server/run.py
@@ -177,7 +177,7 @@ def router_deployment(
 
 def router_application(args):
     ray._private.usage.usage_lib.record_library_usage("ray-llm")
-    router_args = RouterArgs.parse_obj(args)
+    router_args = RouterArgs.model_validate(args)
     vllm_apps = []
     embedding_apps = []
     trtllm_apps = []
diff --git a/rayllm/backend/server/trtllm/trtllm_deployment.py b/rayllm/backend/server/trtllm/trtllm_deployment.py
index 8b816012..e31af5d9 100644
--- a/rayllm/backend/server/trtllm/trtllm_deployment.py
+++ b/rayllm/backend/server/trtllm/trtllm_deployment.py
@@ -30,7 +30,7 @@ async def stream(
         prompt: Prompt,
         priority=None,
     ):
-        sample_params = TRTLLMSamplingParams.parse_obj(prompt.parameters)
+        sample_params = TRTLLMSamplingParams.model_validate(prompt.parameters)
 
         logger.info(f"Received request {request_id}")
         prompt_text = (
diff --git a/rayllm/backend/server/utils.py b/rayllm/backend/server/utils.py
index 1601c994..5840397a 100644
--- a/rayllm/backend/server/utils.py
+++ b/rayllm/backend/server/utils.py
@@ -71,7 +71,7 @@ def parse_args(
             else:
                 raise
         else:
-            parsed_models = [llm_app_cls.parse_obj(raw_model)]
+            parsed_models = [llm_app_cls.model_validate(raw_model)]
         models += parsed_models
 
     return [model for model in models if model.enabled]
diff --git a/rayllm/common/llm_event.py b/rayllm/common/llm_event.py
index b7fff4f7..4e42d50f 100644
--- a/rayllm/common/llm_event.py
+++ b/rayllm/common/llm_event.py
@@ -20,8 +20,8 @@ class Vote(BaseModel):
 class LlmResponse(BaseModel):
     model_id: str
     text: str
-    engine_config: Optional[Dict]
-    gen_stats: Optional[Dict]
+    engine_config: Optional[Dict] = None
+    gen_stats: Optional[Dict] = None
 
 
 class LlmEvent(BaseModel):
@@ -30,7 +30,7 @@ class LlmEvent(BaseModel):
     project_name: str
 
     # Identifier for a session
-    session_id: Optional[str]
+    session_id: Optional[str] = None
 
     # unique string representing this event
     instance_id: str
@@ -41,13 +41,13 @@
 
     # Vote is a dictionary by llm and the votes
     # that model got. Typically, this is 1.
-    votes: Optional[List[Vote]]
-    vote_comments: Optional[Dict[str, str]]
+    votes: Optional[List[Vote]] = None
+    vote_comments: Optional[Dict[str, str]] = None
 
     # Key: llm
     # Value: list of flags
-    flag: Optional[Dict[str, List[Flag]]]
+    flag: Optional[Dict[str, List[Flag]]] = None
 
     # Key: llm
     # Value: Comment for each llm
-    flag_comments: Optional[Dict[str, str]]
+    flag_comments: Optional[Dict[str, str]] = None
diff --git a/rayllm/common/models.py b/rayllm/common/models.py
index 939d3990..775eaf14 100644
--- a/rayllm/common/models.py
+++ b/rayllm/common/models.py
@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypeVar, Union
 
 from fastapi import HTTPException, status
-from pydantic import BaseModel, root_validator, validator
+from pydantic import ConfigDict, BaseModel, field_validator, model_validator
 
 if TYPE_CHECKING:
     from rayllm.backend.server.models import AviaryModelResponse
@@ -44,7 +44,7 @@ class TextChoice(BaseModel):
     text: str
     index: int
     logprobs: dict
-    finish_reason: Optional[str]
+    finish_reason: Optional[str] = None
 
 
 class Usage(BaseModel):
@@ -79,7 +79,7 @@ class Completion(BaseModel):
     created: int
     model: str
     choices: List[TextChoice]
-    usage: Optional[Usage]
+    usage: Optional[Usage] = None
 
     @classmethod
     def create(
@@ -113,7 +113,7 @@ class EmbeddingsOutput(BaseModel):
     object: str
     created: int
     model: str
-    usage: Optional[EmbeddingsUsage]
+    usage: Optional[EmbeddingsUsage] = None
 
 
 class FunctionCall(BaseModel):
@@ -127,7 +127,7 @@ class ToolCall(BaseModel):
     id: str
 
     def __str__(self):
-        return str(self.dict())
+        return str(self.model_dump())
 
 
 class Function(BaseModel):
@@ -181,37 +181,37 @@ def __str__(self):
         # the object
         if getattr(self, "tool_calls", None):
             return str(self.content)
-        return str(self.dict())
+        return str(self.model_dump())
 
-    @root_validator
-    def check_fields(cls, values):
-        if values["role"] in ["system", "user"]:
-            if not isinstance(values.get("content"), str):
+    @model_validator(mode='after')
+    def check_fields(self):
+        if self.role in ["system", "user"]:
+            if not isinstance(self.content, str):
                 raise ValueError("content must be a string")
-        if values["role"] == "tool":
-            if not isinstance(values.get("tool_call_id"), str):
+        if self.role == "tool":
+            if not isinstance(self.tool_call_id, str):
                 raise ValueError("tool_call_id must be a str")
             # content should either be a dict with errors or with results
-            if not isinstance(values.get("content"), str):
+            if not isinstance(self.content, str):
                 raise ValueError(
                     "content must be a str with results or errors for "
                     "the tool call"
                 )
-        if values["role"] == "assistant":
-            if values.get("tool_calls"):
+        if self.role == "assistant":
+            if self.tool_calls:
                 # passing a regular assistant message
-                if not isinstance(values.get("tool_calls"), list):
+                if not isinstance(self.tool_calls, list):
                     raise ValueError("tool_calls must be a list")
-                for tool_call in values["tool_calls"]:
+                for tool_call in self.tool_calls:
                     if not isinstance(tool_call, ToolCall):
                         raise TypeError("Tool calls must be of type ToolCall")
             else:
                 # passing a regular assistant message
                 if (
-                    not isinstance(values.get("content"), str)
-                    or values.get("content") == ""
+                    not isinstance(self.content, str)
+                    or self.content == ""
                ):
                     raise ValueError("content must be a string or None")
-        return values
+        return self
 
 
 class DeltaRole(BaseModel):
@@ -229,12 +229,11 @@ def __str__(self):
         if self.tool_calls:
             return str(self.tool_calls)
         else:
-            return str(self.dict())
+            return str(self.model_dump())
 
 
 class DeltaEOS(BaseModel):
-    class Config:
-        extra = "forbid"
+    model_config = ConfigDict(extra="forbid")
 
 
 class ChoiceLogProbs(BaseModel):
@@ -251,7 +250,7 @@ class MessageChoices(BaseModel):
 class DeltaChoices(BaseModel):
     delta: Union[DeltaRole, DeltaContent, DeltaEOS]
     index: int
-    finish_reason: Optional[str]
+    finish_reason: Optional[str] = None
     logprobs: Optional[ChoiceLogProbs] = None
 
 
@@ -261,7 +260,7 @@ class ChatCompletion(BaseModel):
     created: int
     model: str
     choices: List[Union[MessageChoices, DeltaChoices]]
-    usage: Optional[Usage]
+    usage: Optional[Usage] = None
 
     @classmethod
     def create(
@@ -289,7 +288,7 @@ class Prompt(BaseModel):
     tools: Optional[List[Tool]] = None
     tool_choice: Union[Literal["auto", "none"], ToolChoice] = "auto"
 
-    @validator("prompt")
+    @field_validator("prompt")
     def check_prompt(cls, value):
         if isinstance(value, list) and not value:
             raise ValueError("Messages cannot be an empty list.")
@@ -320,8 +319,7 @@ class ErrorResponse(BaseModel):
 
 
 class AbstractPromptFormat(BaseModel):
-    class Config:
-        extra = "forbid"
+    model_config = ConfigDict(extra="forbid")
 
     def generate_prompt(self, messages: Union[Prompt, List[Message]]) -> str:
         raise NotImplementedError()
@@ -338,34 +336,34 @@ class PromptFormat(AbstractPromptFormat):
     add_system_tags_even_if_message_is_empty: bool = False
     strip_whitespace: bool = True
 
-    @validator("system")
+    @field_validator("system")
     def check_system(cls, value):
         assert value and (
             "{instruction}" in value
         ), "system must be a string containing '{instruction}'"
         return value
 
-    @validator("assistant")
+    @field_validator("assistant")
     def check_assistant(cls, value):
         assert (
             value and "{instruction}" in value
         ), "assistant must be a string containing '{instruction}'"
         return value
 
-    @validator("user")
+    @field_validator("user")
     def check_user(cls, value):
         assert value and (
             "{instruction}" in value
         ), "user must be a string containing '{instruction}'"
         return value
 
-    @root_validator
-    def check_user_system_in_user(cls, values):
-        if values["system_in_user"]:
+    @model_validator(mode='after')
+    def check_user_system_in_user(self):
+        if self.system_in_user:
             assert (
-                "{system}" in values["user"]
+                "{system}" in self.user
             ), "If system_in_user=True, user must contain '{system}'"
-        return values
+        return self
 
     def generate_prompt(self, messages: Union[Prompt, List[Message]]) -> str:
         if isinstance(messages, Prompt):
diff --git a/requirements-backend.txt b/requirements-backend.txt
index c2e792c4..890e74f1 100644
--- a/requirements-backend.txt
+++ b/requirements-backend.txt
@@ -11,16 +11,17 @@ Jinja2
 numexpr>=2.7.3
 hf_transfer
 evaluate
-vllm>=0.2.0,<0.2.6
+vllm>=0.4.1
 numpy<1.24
 ninja
 protobuf<3.21.0
 safetensors
-pydantic~=1.10.0
+pydantic>=2.0
 einops
 markdown-it-py[plugins]
 fastapi-versioning
 scipy
+redis
 opentelemetry-api>=1.15.0
 opentelemetry-sdk
 opentelemetry-exporter-otlp
diff --git a/requirements.txt b/requirements.txt
index 7a08198f..72519891 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ boto3
 pydantic
 typer>=0.9
 rich
-typing_extensions~=4.5.0
+# typing_extensions~=4.5.0
+typing_extensions
 requests
 openai
diff --git a/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml b/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml
new file mode 100644
index 00000000..5a597802
--- /dev/null
+++ b/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml
@@ -0,0 +1,7 @@
+applications:
+- name: ray-llm
+  route_prefix: /
+  import_path: rayllm.backend:router_application
+  args:
+    models:
+      - "./models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml"
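Once the Serve config above is deployed against a running Ray cluster (e.g. with `serve run serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml`), the OpenAI-compatible endpoint can be smoke-tested roughly as follows (a sketch; the local base URL and placeholder API key are assumptions about a default local deployment):

    from openai import OpenAI

    # ray-llm exposes an OpenAI-compatible API through the Serve router.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

    response = client.chat.completions.create(
        model="meta-llama/Llama-2-7b-chat-hf",
        messages=[{"role": "user", "content": "Say hello from a CPU-only deployment."}],
        temperature=0.0,
    )
    print(response.choices[0].message.content)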