From 4daba71f8fe918b61b1e14be6bcc5f46825cf261 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 15 Apr 2024 14:58:08 +0000 Subject: [PATCH 01/11] initial update --- .../meta-llama--Llama-2-7b-chat-hf.yaml | 10 ++++------ rayllm/backend/llm/vllm/vllm_engine.py | 4 ++-- rayllm/backend/server/models.py | 2 +- rayllm/common/models.py | 6 +++--- requirements-backend.txt | 3 ++- requirements.txt | 3 ++- setup.py | 3 ++- 7 files changed, 16 insertions(+), 15 deletions(-) diff --git a/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml b/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml index 9bfdf5a1..8d0949ce 100644 --- a/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml +++ b/models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml @@ -10,9 +10,7 @@ deployment_config: downscale_delay_s: 300.0 upscale_delay_s: 15.0 max_concurrent_queries: 64 - ray_actor_options: - resources: - accelerator_type_a10: 0.01 + ray_actor_options: {} engine_config: model_id: meta-llama/Llama-2-7b-chat-hf hf_model_id: meta-llama/Llama-2-7b-chat-hf @@ -34,8 +32,8 @@ engine_config: stopping_sequences: [] scaling_config: num_workers: 1 - num_gpus_per_worker: 1 + num_gpus_per_worker: 0 num_cpus_per_worker: 8 placement_strategy: "STRICT_PACK" - resources_per_worker: - accelerator_type_a10: 0.01 + resources_per_worker: {} + diff --git a/rayllm/backend/llm/vllm/vllm_engine.py b/rayllm/backend/llm/vllm/vllm_engine.py index 79131e6d..04ef8734 100644 --- a/rayllm/backend/llm/vllm/vllm_engine.py +++ b/rayllm/backend/llm/vllm/vllm_engine.py @@ -59,8 +59,8 @@ def __init__( self.llm_app = llm_app.copy(deep=True) self.engine_config = llm_app.engine_config self.placement_config = llm_app.placement_config - if not (self.placement_config.scaling_config.num_gpus_per_worker > 0): - raise ValueError("The VLLM Engine Requires > 0 GPUs to run.") + # if not (self.placement_config.scaling_config.num_gpus_per_worker > 0): + # raise ValueError("The VLLM Engine Requires > 0 GPUs to run.") self.node_initializer = node_initializer or LLMNodeInitializer( local_node_tokenizer_only=True diff --git a/rayllm/backend/server/models.py b/rayllm/backend/server/models.py index dcdf10ff..9cd696f0 100644 --- a/rayllm/backend/server/models.py +++ b/rayllm/backend/server/models.py @@ -20,7 +20,7 @@ import yaml from markdown_it import MarkdownIt -from pydantic import ( +from pydantic.v1 import ( BaseModel, Extra, Field, diff --git a/rayllm/common/models.py b/rayllm/common/models.py index 939d3990..ad637359 100644 --- a/rayllm/common/models.py +++ b/rayllm/common/models.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypeVar, Union from fastapi import HTTPException, status -from pydantic import BaseModel, root_validator, validator +from pydantic.v1 import BaseModel, root_validator, validator if TYPE_CHECKING: from rayllm.backend.server.models import AviaryModelResponse @@ -183,7 +183,7 @@ def __str__(self): return str(self.content) return str(self.dict()) - @root_validator + @root_validator(skip_on_failure=True) def check_fields(cls, values): if values["role"] in ["system", "user"]: if not isinstance(values.get("content"), str): @@ -359,7 +359,7 @@ def check_user(cls, value): ), "user must be a string containing '{instruction}'" return value - @root_validator + @root_validator(skip_on_failure=True) def check_user_system_in_user(cls, values): if values["system_in_user"]: assert ( diff --git a/requirements-backend.txt b/requirements-backend.txt index c2e792c4..fbd459f5 100644 --- a/requirements-backend.txt +++ b/requirements-backend.txt @@ -11,7 +11,8 @@ Jinja2 numexpr>=2.7.3 hf_transfer evaluate -vllm>=0.2.0,<0.2.6 +# vllm>=0.2.0,<0.2.6 +vllm==0.4.0.post1+cpu numpy<1.24 ninja protobuf<3.21.0 diff --git a/requirements.txt b/requirements.txt index 7a08198f..72519891 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ boto3 pydantic typer>=0.9 rich -typing_extensions~=4.5.0 +# typing_extensions~=4.5.0 +typing_extensions requests openai diff --git a/setup.py b/setup.py index 93e0200d..f4d55bdb 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ "test": required_dev, "docs": required_docs, }, - dependency_links=["https://download.pytorch.org/whl/cu118"], + # dependency_links=["https://download.pytorch.org/whl/cu118"], + dependency_links=["https://download.pytorch.org/whl/cpu"], python_requires=">=3.8", ) From 0092d2203ec7c329849e638dc478fa7619ac9b1f Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 16 Apr 2024 02:04:39 +0000 Subject: [PATCH 02/11] fix runtime_env arg --- rayllm/backend/llm/vllm/vllm_compatibility.py | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/rayllm/backend/llm/vllm/vllm_compatibility.py b/rayllm/backend/llm/vllm/vllm_compatibility.py index b69b1a64..76a6b69f 100644 --- a/rayllm/backend/llm/vllm/vllm_compatibility.py +++ b/rayllm/backend/llm/vllm/vllm_compatibility.py @@ -9,6 +9,9 @@ from vllm.config import ModelConfig as VllmModelConfig from vllm.config import ParallelConfig as VllmParallelConfig from vllm.config import SchedulerConfig as VllmSchedulerConfig +from vllm.config import EngineConfig as VllmEngineConfig +from vllm.config import VisionLanguageConfig as VllmVisionLanguageConfig +from vllm.config import SpeculativeConfig as VllmSpeculativeConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine, AsyncStream, _AsyncLLMEngine @@ -33,12 +36,11 @@ logger = logging.getLogger(__name__) VllmConfigs = Tuple[ - VllmCacheConfig, VllmModelConfig, VllmParallelConfig, VllmSchedulerConfig + VllmCacheConfig, VllmModelConfig, VllmParallelConfig, VllmSchedulerConfig, VllmVisionLanguageConfig, VllmSpeculativeConfig, ] - class AviaryLLMEngine(_AsyncLLMEngine): - def __init__(self, *args, runtime_env: dict, **kwargs): + def __init__(self, *args, runtime_env: dict = {}, **kwargs): self.runtime_env = runtime_env super().__init__(*args, **kwargs) @@ -114,7 +116,8 @@ def _get_vllm_engine_config(vllm_app) -> Tuple[AsyncEngineArgs, VllmConfigs]: # If it is the hf_model_id, vllm automatically downloads the correct model. **dict( model=vllm_app.engine_config.actual_hf_model_id, - worker_use_ray=True, + # worker_use_ray=True, + worker_use_ray=False, engine_use_ray=False, tensor_parallel_size=vllm_app.placement_config.world_size, max_model_len=vllm_app.engine_config.max_total_tokens, @@ -123,8 +126,10 @@ def _get_vllm_engine_config(vllm_app) -> Tuple[AsyncEngineArgs, VllmConfigs]: **vllm_app.engine_config.get_initialization_kwargs(), ) ) - configs = async_engine_args.create_engine_configs() - return async_engine_args, configs + config = async_engine_args.create_engine_config() + vllm_configs = (config.cache_config, config.model_config, config.parallel_config, + config.scheduler_config, config.vision_language_config, config.speculative_config) + return async_engine_args, vllm_configs class AviaryAsyncLLMEngine(AsyncLLMEngine): @@ -174,16 +179,19 @@ def from_llm_app( ) # Create the async LLM engine. - engine = cls( - engine_args.worker_use_ray, - engine_args.engine_use_ray, - *engine_configs, - None, - placement_group, - runtime_env=runtime_env, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - max_log_len=engine_args.max_log_len, - start_engine_loop=True, - ) + # engine = cls( + # engine_args.worker_use_ray, + # engine_args.engine_use_ray, + # *engine_configs, + # None, + # placement_group, + # runtime_env=runtime_env, + # log_requests=not engine_args.disable_log_requests, + # log_stats=not engine_args.disable_log_stats, + # max_log_len=engine_args.max_log_len, + # start_engine_loop=True, + # ) + + engine = cls.from_engine_args(engine_args, start_engine_loop=True) + return engine From b09ff4e8f3bef162168354b41f2f310d4bce84f0 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Thu, 25 Apr 2024 05:49:26 +0000 Subject: [PATCH 03/11] add models --- .../cpu/meta-llama--Llama-2-7b-chat-hf.yaml | 41 +++++++++++++++++++ requirements-backend.txt | 2 +- .../cpu/meta-llama--Llama-2-7b-chat-hf.yaml | 7 ++++ 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml create mode 100644 serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml diff --git a/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml b/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml new file mode 100644 index 00000000..1b85a7e0 --- /dev/null +++ b/models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml @@ -0,0 +1,41 @@ +deployment_config: + autoscaling_config: + min_replicas: 1 + initial_replicas: 1 + max_replicas: 8 + target_num_ongoing_requests_per_replica: 24 + metrics_interval_s: 10.0 + look_back_period_s: 30.0 + smoothing_factor: 0.6 + downscale_delay_s: 300.0 + upscale_delay_s: 15.0 + max_concurrent_queries: 64 + ray_actor_options: {} +engine_config: + model_id: meta-llama/Llama-2-7b-chat-hf + hf_model_id: meta-llama/Llama-2-7b-chat-hf + type: VLLMEngine + engine_kwargs: + device: "cpu" + dtype: "float32" + trust_remote_code: true + max_num_batched_tokens: 4096 + max_num_seqs: 64 + gpu_memory_utilization: 0.95 + max_total_tokens: 4096 + generation: + prompt_format: + system: "<>\n{instruction}\n<>\n\n" + assistant: " {instruction} " + trailing_assistant: "" + user: "[INST] {system}{instruction} [/INST]" + system_in_user: true + default_system_message: "" + stopping_sequences: [] +scaling_config: + num_workers: 1 + num_gpus_per_worker: 0 + num_cpus_per_worker: 32 + placement_strategy: "STRICT_PACK" + resources_per_worker: {} + diff --git a/requirements-backend.txt b/requirements-backend.txt index fbd459f5..5bbd8058 100644 --- a/requirements-backend.txt +++ b/requirements-backend.txt @@ -11,8 +11,8 @@ Jinja2 numexpr>=2.7.3 hf_transfer evaluate +# vllm is installed seperately from source for now # vllm>=0.2.0,<0.2.6 -vllm==0.4.0.post1+cpu numpy<1.24 ninja protobuf<3.21.0 diff --git a/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml b/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml new file mode 100644 index 00000000..5a597802 --- /dev/null +++ b/serve_configs/cpu/meta-llama--Llama-2-7b-chat-hf.yaml @@ -0,0 +1,7 @@ +applications: +- name: ray-llm + route_prefix: / + import_path: rayllm.backend:router_application + args: + models: + - "./models/continuous_batching/cpu/meta-llama--Llama-2-7b-chat-hf.yaml" From f79a2bc315307622cba331c6b5dfbadc523d8bc2 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Sat, 27 Apr 2024 12:34:33 +0000 Subject: [PATCH 04/11] update models to pydantic v2 and fix AutoscalingConfig model Signed-off-by: Wu, Xiaochang --- rayllm/backend/server/config_models.py | 86 ++++++++++++++++++++++++++ rayllm/backend/server/models.py | 56 ++++++++--------- rayllm/common/models.py | 56 ++++++++--------- 3 files changed, 142 insertions(+), 56 deletions(-) create mode 100644 rayllm/backend/server/config_models.py diff --git a/rayllm/backend/server/config_models.py b/rayllm/backend/server/config_models.py new file mode 100644 index 00000000..645b1d90 --- /dev/null +++ b/rayllm/backend/server/config_models.py @@ -0,0 +1,86 @@ +from typing import Optional +from pydantic import ( + BaseModel, + Field, + NonNegativeFloat, + NonNegativeInt, + PositiveFloat, + PositiveInt, + field_validator, +) + +# Adapted from ray.serve.config.AutoscalingConfig +# Port it here as the original AutoscalingConfig model is pydantic V1 +class AutoscalingConfig(BaseModel): + """Config for the Serve Autoscaler.""" + + # Please keep these options in sync with those in + # `src/ray/protobuf/serve.proto`. + + # Publicly exposed options + min_replicas: NonNegativeInt = 1 + initial_replicas: Optional[NonNegativeInt] = None + max_replicas: PositiveInt = 1 + + # DEPRECATED: replaced by target_ongoing_requests + target_num_ongoing_requests_per_replica: PositiveFloat = Field( + default=1.0, + description="[DEPRECATED] Please use `target_ongoing_requests` instead.", + ) + # Will default to 1.0 in the future. + target_ongoing_requests: Optional[PositiveFloat] = None + + # How often to scrape for metrics + metrics_interval_s: PositiveFloat = 10.0 + # Time window to average over for metrics. + look_back_period_s: PositiveFloat = 30.0 + + # DEPRECATED + smoothing_factor: PositiveFloat = 1.0 + # DEPRECATED: replaced by `downscaling_factor` + upscale_smoothing_factor: Optional[PositiveFloat] = Field( + default=None, description="[DEPRECATED] Please use `upscaling_factor` instead." + ) + # DEPRECATED: replaced by `upscaling_factor` + downscale_smoothing_factor: Optional[PositiveFloat] = Field( + default=None, + description="[DEPRECATED] Please use `downscaling_factor` instead.", + ) + + # Multiplicative "gain" factor to limit scaling decisions + upscaling_factor: Optional[PositiveFloat] = None + downscaling_factor: Optional[PositiveFloat] = None + + # How frequently to make autoscaling decisions + # loop_period_s: float = CONTROL_LOOP_PERIOD_S + # How long to wait before scaling down replicas + downscale_delay_s: NonNegativeFloat = 600.0 + # How long to wait before scaling up replicas + upscale_delay_s: NonNegativeFloat = 30.0 + + @field_validator("max_replicas") + def replicas_settings_valid(cls, max_replicas, values): + min_replicas = values.data.get("min_replicas") + initial_replicas = values.data.get("initial_replicas") + if min_replicas is not None and max_replicas < min_replicas: + raise ValueError( + f"max_replicas ({max_replicas}) must be greater than " + f"or equal to min_replicas ({min_replicas})!" + ) + + if initial_replicas is not None: + if initial_replicas < min_replicas: + raise ValueError( + f"min_replicas ({min_replicas}) must be less than " + f"or equal to initial_replicas ({initial_replicas})!" + ) + elif initial_replicas > max_replicas: + raise ValueError( + f"max_replicas ({max_replicas}) must be greater than " + f"or equal to initial_replicas ({initial_replicas})!" + ) + + return max_replicas + + def __init__(self, **kwargs): + super().__init__(**kwargs) diff --git a/rayllm/backend/server/models.py b/rayllm/backend/server/models.py index 9cd696f0..44dcb6c6 100644 --- a/rayllm/backend/server/models.py +++ b/rayllm/backend/server/models.py @@ -11,6 +11,7 @@ Literal, Optional, Protocol, + Self, Set, Tuple, Type, @@ -20,17 +21,16 @@ import yaml from markdown_it import MarkdownIt -from pydantic.v1 import ( +from pydantic import ( BaseModel, - Extra, Field, PrivateAttr, conlist, - root_validator, - validator, + field_validator, + model_validator, ) from ray.air import ScalingConfig as AIRScalingConfig -from ray.serve.config import AutoscalingConfig +from rayllm.backend.server.config_models import AutoscalingConfig from ray.util.placement_group import ( PlacementGroup, get_current_placement_group, @@ -103,7 +103,7 @@ class BaseModelExtended(BaseModel): def parse_yaml(cls: Type[ModelT], file, **kwargs) -> ModelT: kwargs.setdefault("Loader", yaml.SafeLoader) dict_args = yaml.load(file, **kwargs) - return cls.parse_obj(dict_args) + return cls.model_validate(dict_args) def yaml( self, @@ -123,7 +123,7 @@ def yaml( """ return yaml.dump( json.loads( - self.json( + self.model_dump_json( include=include, exclude=exclude, by_alias=by_alias, @@ -200,18 +200,18 @@ class AviaryModelResponse(ComputedPropertyMixin, BaseModelExtended): finish_reason: Optional[str] = None error: Optional[ErrorResponse] = None - @root_validator - def text_or_error_or_finish_reason(cls, values): + @model_validator(mode='after') + def text_or_error_or_finish_reason(self) -> Self: if ( - values.get("generated_text") is None - and values.get("embedding_outputs") is None - and values.get("error") is None - and values.get("finish_reason") is None + self.generated_text is None + and self.embedding_outputs is None + and self.error is None + and self.finish_reason is None ): raise ValueError( "Either 'generated_text' or 'embedding_outputs' or 'error' or 'finish_reason' must be set" ) - return values + return self @classmethod def merge_stream(cls, *responses: "AviaryModelResponse") -> "AviaryModelResponse": @@ -370,7 +370,7 @@ class GCSMirrorConfig(BaseModelExtended): bucket_uri: str extra_files: List[ExtraFiles] = Field(default_factory=list) - @validator("bucket_uri") + @field_validator("bucket_uri") def check_uri_format(cls, value: str): if not value.startswith("gs://"): raise ValueError( @@ -385,11 +385,11 @@ class GenerationConfig(BaseModelExtended): generate_kwargs: Dict[str, Any] = {} stopping_sequences: Optional[List[Union[str, int, List[Union[str, int]]]]] = None - @validator("prompt_format") + @field_validator("prompt_format") def default_prompt_format(cls, prompt_format): return prompt_format if prompt_format is not None else DisabledPromptFormat() - @validator("stopping_sequences") + @field_validator("stopping_sequences") def check_stopping_sequences(cls, value): def try_int(x): if isinstance(x, list): @@ -425,13 +425,13 @@ class ModelType(str, Enum): class EngineConfig(BaseModelExtended): class Config: use_enum_values = True - extra = Extra.forbid + extra = 'forbid' model_id: str hf_model_id: Optional[str] = None type: EngineType model_type: ModelType - tokenizer_id: Optional[str] + tokenizer_id: Optional[str] = None s3_mirror_config: Optional[S3MirrorConfig] = None gcs_mirror_config: Optional[GCSMirrorConfig] = None @@ -445,7 +445,7 @@ class Config: # These will be copied to the runtime env env_vars_to_propogate: List[str] = list(ENV_VARS_TO_PROPAGATE) - @validator("gcs_mirror_config") + @field_validator("gcs_mirror_config") def check_only_one_mirror_config_specified(cls, value, values): gcs_config = value s3_config = values["s3_mirror_config"] @@ -540,9 +540,9 @@ class SamplingParams(BaseModelExtended): def dict(self, **kwargs): if kwargs.get("exclude", None) is None: kwargs["exclude"] = self._ignored_fields - return super().dict(**kwargs) + return super().model_dump(**kwargs) - @validator("stop", always=True) + @field_validator("stop") def validate_stopping_sequences(cls, values): if not values: return values @@ -603,7 +603,7 @@ class ChatCompletions(BaseModelExtended): """ model: str - messages: conlist(Message, min_items=1) + messages: conlist(Message, min_length=1) stream: bool = False echo: Optional[bool] = False user: Optional[str] = None @@ -640,7 +640,7 @@ class Embeddings(BaseModelExtended): """ model: str - input: Union[str, conlist(str, min_items=1)] + input: Union[str, conlist(str, min_length=1)] user: Optional[str] = None @@ -652,7 +652,7 @@ class ScalingConfig(BaseModelExtended): resources_per_worker: Optional[Dict[str, float]] = None pg_timeout_s: float = 600 - @validator("num_gpus_per_worker") + @field_validator("num_gpus_per_worker") def validate_num_gpus_per_worker(cls, value): if value > 1: raise ValueError( @@ -706,7 +706,7 @@ def model_id(self): return self.engine_config.model_id def short_metadata(self): - return self.dict( + return self.model_dump( include={ "model_id": True, "engine_config": { @@ -787,9 +787,9 @@ class HasModelId(Protocol): model_id: str -class ChatCompletionsParams(ChatCompletions, SamplingParams, extra=Extra.allow): +class ChatCompletionsParams(ChatCompletions, SamplingParams, extra='allow'): pass -class CompletionsParams(Completions, SamplingParams, extra=Extra.allow): +class CompletionsParams(Completions, SamplingParams, extra='allow'): max_tokens: Optional[int] = 16 diff --git a/rayllm/common/models.py b/rayllm/common/models.py index ad637359..7dc57ae9 100644 --- a/rayllm/common/models.py +++ b/rayllm/common/models.py @@ -1,7 +1,7 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Self, TypeVar, Union from fastapi import HTTPException, status -from pydantic.v1 import BaseModel, root_validator, validator +from pydantic import BaseModel, field_validator, model_validator if TYPE_CHECKING: from rayllm.backend.server.models import AviaryModelResponse @@ -127,7 +127,7 @@ class ToolCall(BaseModel): id: str def __str__(self): - return str(self.dict()) + return str(self.model_dump()) class Function(BaseModel): @@ -181,37 +181,37 @@ def __str__(self): # the object if getattr(self, "tool_calls", None): return str(self.content) - return str(self.dict()) + return str(self.model_dump()) - @root_validator(skip_on_failure=True) - def check_fields(cls, values): - if values["role"] in ["system", "user"]: - if not isinstance(values.get("content"), str): + @model_validator(mode='after') + def check_fields(self) -> Self: + if self.role in ["system", "user"]: + if not isinstance(self.content, str): raise ValueError("content must be a string") - if values["role"] == "tool": - if not isinstance(values.get("tool_call_id"), str): + if self.role == "tool": + if not isinstance(self.tool_call_id, str): raise ValueError("tool_call_id must be a str") # content should either be a dict with errors or with results - if not isinstance(values.get("content"), str): + if not isinstance(self.content, str): raise ValueError( "content must be a str with results or errors for " "the tool call" ) - if values["role"] == "assistant": - if values.get("tool_calls"): + if self.role == "assistant": + if self.tool_calls: # passing a regular assistant message - if not isinstance(values.get("tool_calls"), list): + if not isinstance(self.tool_calls, list): raise ValueError("tool_calls must be a list") - for tool_call in values["tool_calls"]: + for tool_call in self.tool_calls: if not isinstance(tool_call, ToolCall): raise TypeError("Tool calls must be of type ToolCall") else: # passing a regular assistant message if ( - not isinstance(values.get("content"), str) - or values.get("content") == "" + not isinstance(self.content, str) + or self.content == "" ): raise ValueError("content must be a string or None") - return values + return self class DeltaRole(BaseModel): @@ -229,7 +229,7 @@ def __str__(self): if self.tool_calls: return str(self.tool_calls) else: - return str(self.dict()) + return str(self.model_dump()) class DeltaEOS(BaseModel): @@ -289,7 +289,7 @@ class Prompt(BaseModel): tools: Optional[List[Tool]] = None tool_choice: Union[Literal["auto", "none"], ToolChoice] = "auto" - @validator("prompt") + @field_validator("prompt") def check_prompt(cls, value): if isinstance(value, list) and not value: raise ValueError("Messages cannot be an empty list.") @@ -338,34 +338,34 @@ class PromptFormat(AbstractPromptFormat): add_system_tags_even_if_message_is_empty: bool = False strip_whitespace: bool = True - @validator("system") + @field_validator("system") def check_system(cls, value): assert value and ( "{instruction}" in value ), "system must be a string containing '{instruction}'" return value - @validator("assistant") + @field_validator("assistant") def check_assistant(cls, value): assert ( value and "{instruction}" in value ), "assistant must be a string containing '{instruction}'" return value - @validator("user") + @field_validator("user") def check_user(cls, value): assert value and ( "{instruction}" in value ), "user must be a string containing '{instruction}'" return value - @root_validator(skip_on_failure=True) - def check_user_system_in_user(cls, values): - if values["system_in_user"]: + @model_validator(mode='after') + def check_user_system_in_user(self) -> Self: + if self.system_in_user: assert ( - "{system}" in values["user"] + "{system}" in self.user ), "If system_in_user=True, user must contain '{system}'" - return values + return self def generate_prompt(self, messages: Union[Prompt, List[Message]]) -> str: if isinstance(messages, Prompt): From e5fefc199eaeba5bc59bf35eef48b66cfc6e18aa Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 29 Apr 2024 08:33:51 +0000 Subject: [PATCH 05/11] apply bump-pydantic tool Signed-off-by: Wu, Xiaochang --- .../backend/llm/embedding/embedding_models.py | 4 ++-- rayllm/backend/llm/trtllm/trtllm_models.py | 5 ++--- rayllm/backend/server/models.py | 22 ++++++++----------- rayllm/common/llm_event.py | 14 ++++++------ rayllm/common/models.py | 18 +++++++-------- 5 files changed, 28 insertions(+), 35 deletions(-) diff --git a/rayllm/backend/llm/embedding/embedding_models.py b/rayllm/backend/llm/embedding/embedding_models.py index ce8c1b02..5816b962 100644 --- a/rayllm/backend/llm/embedding/embedding_models.py +++ b/rayllm/backend/llm/embedding/embedding_models.py @@ -11,6 +11,7 @@ ModelType, S3MirrorConfig, ) +from pydantic import ConfigDict logger = logging.getLogger(__name__) @@ -20,8 +21,7 @@ class EmbeddingOptimize(str, Enum): class EmbeddingEngineConfig(EngineConfig): - class Config: - use_enum_values = True + model_config = ConfigDict(use_enum_values=True) type: EngineType = EngineType.EmbeddingEngine model_type: ModelType = ModelType.embedding diff --git a/rayllm/backend/llm/trtllm/trtllm_models.py b/rayllm/backend/llm/trtllm/trtllm_models.py index 6243aed3..79da1d61 100644 --- a/rayllm/backend/llm/trtllm/trtllm_models.py +++ b/rayllm/backend/llm/trtllm/trtllm_models.py @@ -10,6 +10,7 @@ S3MirrorConfig, SamplingParams, ) +from pydantic import ConfigDict try: from tensorrt_llm.libs import trt_llm_engine_py as trt_py @@ -23,9 +24,7 @@ class TRTLLMGPTServeConfig(BaseModelExtended): max_tokens_in_paged_kv_cache: int = None kv_cache_free_gpu_mem_fraction: float = None enable_trt_overlap: bool = None - - class Config: - arbitrary_types_allowed = True + model_config = ConfigDict(arbitrary_types_allowed=True) @classmethod def from_engine_config( diff --git a/rayllm/backend/server/models.py b/rayllm/backend/server/models.py index 44dcb6c6..0e630e37 100644 --- a/rayllm/backend/server/models.py +++ b/rayllm/backend/server/models.py @@ -22,10 +22,9 @@ import yaml from markdown_it import MarkdownIt from pydantic import ( - BaseModel, + ConfigDict, BaseModel, Field, PrivateAttr, - conlist, field_validator, model_validator, ) @@ -58,6 +57,7 @@ ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT, MAX_NUM_STOPPING_SEQUENCES, ) +from typing_extensions import Annotated T = TypeVar("T") ModelT = TypeVar("ModelT", bound=BaseModel) @@ -351,7 +351,7 @@ def unpack(self) -> Tuple["AviaryModelResponse", ...]: class S3AWSCredentials(BaseModelExtended): create_aws_credentials_url: str - auth_token_env_variable: Optional[str] + auth_token_env_variable: Optional[str] = None class ExtraFiles(BaseModelExtended): @@ -423,9 +423,7 @@ class ModelType(str, Enum): class EngineConfig(BaseModelExtended): - class Config: - use_enum_values = True - extra = 'forbid' + model_config = ConfigDict(use_enum_values=True, extra='forbid') model_id: str hf_model_id: Optional[str] = None @@ -489,9 +487,7 @@ def get_vllm_load_s3_path(self) -> Optional[str]: class SchedulingMetadata(BaseModelExtended): request_id: Union[str, List[str]] priority: Union[QueuePriority, List[QueuePriority]] - - class Config: - arbitrary_types_allowed = True + model_config = ConfigDict(arbitrary_types_allowed=True) class SamplingParams(BaseModelExtended): @@ -583,7 +579,7 @@ def merge_generation_params( class GenerationRequest(BaseModelExtended): prompt: Union[str, List[int], List[str]] request_id: Union[str, List[str]] - sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] + sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None scheduling_metadata: Union[SchedulingMetadata, List[SchedulingMetadata]] @@ -603,7 +599,7 @@ class ChatCompletions(BaseModelExtended): """ model: str - messages: conlist(Message, min_length=1) + messages: Annotated[List[Message], Field(min_length=1)] stream: bool = False echo: Optional[bool] = False user: Optional[str] = None @@ -640,7 +636,7 @@ class Embeddings(BaseModelExtended): """ model: str - input: Union[str, conlist(str, min_length=1)] + input: Union[str, Annotated[List[str], Field(min_length=1)]] user: Optional[str] = None @@ -689,7 +685,7 @@ class ServeMultiplexConfig(BaseModelExtended): class DeploymentConfig(BaseModelExtended): - autoscaling_config: Optional[AutoscalingConfig] + autoscaling_config: Optional[AutoscalingConfig] = None max_concurrent_queries: Optional[int] = None ray_actor_options: Optional[Dict[str, Any]] = None diff --git a/rayllm/common/llm_event.py b/rayllm/common/llm_event.py index b7fff4f7..4e42d50f 100644 --- a/rayllm/common/llm_event.py +++ b/rayllm/common/llm_event.py @@ -20,8 +20,8 @@ class Vote(BaseModel): class LlmResponse(BaseModel): model_id: str text: str - engine_config: Optional[Dict] - gen_stats: Optional[Dict] + engine_config: Optional[Dict] = None + gen_stats: Optional[Dict] = None class LlmEvent(BaseModel): @@ -30,7 +30,7 @@ class LlmEvent(BaseModel): project_name: str # Identifier for a session - session_id: Optional[str] + session_id: Optional[str] = None # unique string representing this event instance_id: str @@ -41,13 +41,13 @@ class LlmEvent(BaseModel): # Vote is a dictionary by llm and the votes # that model got. Typically, this is 1. - votes: Optional[List[Vote]] - vote_comments: Optional[Dict[str, str]] + votes: Optional[List[Vote]] = None + vote_comments: Optional[Dict[str, str]] = None # Key: llm # Value: list of flags - flag: Optional[Dict[str, List[Flag]]] + flag: Optional[Dict[str, List[Flag]]] = None # Key: llm # Value: Comment for each llm - flag_comments: Optional[Dict[str, str]] + flag_comments: Optional[Dict[str, str]] = None diff --git a/rayllm/common/models.py b/rayllm/common/models.py index 7dc57ae9..a6c00a88 100644 --- a/rayllm/common/models.py +++ b/rayllm/common/models.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Self, TypeVar, Union from fastapi import HTTPException, status -from pydantic import BaseModel, field_validator, model_validator +from pydantic import ConfigDict, BaseModel, field_validator, model_validator if TYPE_CHECKING: from rayllm.backend.server.models import AviaryModelResponse @@ -44,7 +44,7 @@ class TextChoice(BaseModel): text: str index: int logprobs: dict - finish_reason: Optional[str] + finish_reason: Optional[str] = None class Usage(BaseModel): @@ -79,7 +79,7 @@ class Completion(BaseModel): created: int model: str choices: List[TextChoice] - usage: Optional[Usage] + usage: Optional[Usage] = None @classmethod def create( @@ -113,7 +113,7 @@ class EmbeddingsOutput(BaseModel): object: str created: int model: str - usage: Optional[EmbeddingsUsage] + usage: Optional[EmbeddingsUsage] = None class FunctionCall(BaseModel): @@ -233,8 +233,7 @@ def __str__(self): class DeltaEOS(BaseModel): - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") class ChoiceLogProbs(BaseModel): @@ -251,7 +250,7 @@ class MessageChoices(BaseModel): class DeltaChoices(BaseModel): delta: Union[DeltaRole, DeltaContent, DeltaEOS] index: int - finish_reason: Optional[str] + finish_reason: Optional[str] = None logprobs: Optional[ChoiceLogProbs] = None @@ -261,7 +260,7 @@ class ChatCompletion(BaseModel): created: int model: str choices: List[Union[MessageChoices, DeltaChoices]] - usage: Optional[Usage] + usage: Optional[Usage] = None @classmethod def create( @@ -320,8 +319,7 @@ class ErrorResponse(BaseModel): class AbstractPromptFormat(BaseModel): - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") def generate_prompt(self, messages: Union[Prompt, List[Message]]) -> str: raise NotImplementedError() From cca3f681857576212600317694c805493ce805ec Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 29 Apr 2024 08:35:32 +0000 Subject: [PATCH 06/11] parse_obj => model_validate Signed-off-by: Wu, Xiaochang --- rayllm/backend/server/models.py | 2 +- rayllm/backend/server/routers/router_app.py | 8 ++++---- rayllm/backend/server/run.py | 2 +- rayllm/backend/server/trtllm/trtllm_deployment.py | 2 +- rayllm/backend/server/utils.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/rayllm/backend/server/models.py b/rayllm/backend/server/models.py index 0e630e37..9d84c218 100644 --- a/rayllm/backend/server/models.py +++ b/rayllm/backend/server/models.py @@ -573,7 +573,7 @@ def merge_generation_params( generation.stopping_sequences or [] ) - return cls.parse_obj(generate_kwargs) + return cls.model_validate(generate_kwargs) class GenerationRequest(BaseModelExtended): diff --git a/rayllm/backend/server/routers/router_app.py b/rayllm/backend/server/routers/router_app.py index bdcd9fda..6ef6ae30 100644 --- a/rayllm/backend/server/routers/router_app.py +++ b/rayllm/backend/server/routers/router_app.py @@ -136,7 +136,7 @@ async def _completions_wrapper( ) all_results.pop() had_error = True - yield AviaryModelResponse.parse_obj(subresult_dict) + yield AviaryModelResponse.model_validate(subresult_dict) # Return early in case of an error break choices = [ @@ -213,7 +213,7 @@ async def _chat_completions_wrapper( subresult_dict["finish_reason"] = None all_results.pop() had_error = True - yield AviaryModelResponse.parse_obj(subresult_dict) + yield AviaryModelResponse.model_validate(subresult_dict) # Return early in case of an error break else: @@ -240,7 +240,7 @@ async def _chat_completions_wrapper( if subresult_dict["logprobs"]: logprobs = ChoiceLogProbs( content=[ - LogProbs.parse_obj(logprob) + LogProbs.model_validate(logprob) for logprob in subresult_dict["logprobs"] ] ) @@ -469,7 +469,7 @@ async def chat( logprobs = results.logprobs if logprobs: logprobs = ChoiceLogProbs( - content=[LogProbs.parse_obj(logprob) for logprob in logprobs] + content=[LogProbs.model_validate(logprob) for logprob in logprobs] ) else: logprobs = None diff --git a/rayllm/backend/server/run.py b/rayllm/backend/server/run.py index a80a09aa..072bd119 100644 --- a/rayllm/backend/server/run.py +++ b/rayllm/backend/server/run.py @@ -177,7 +177,7 @@ def router_deployment( def router_application(args): ray._private.usage.usage_lib.record_library_usage("ray-llm") - router_args = RouterArgs.parse_obj(args) + router_args = RouterArgs.model_validate(args) vllm_apps = [] embedding_apps = [] trtllm_apps = [] diff --git a/rayllm/backend/server/trtllm/trtllm_deployment.py b/rayllm/backend/server/trtllm/trtllm_deployment.py index 8b816012..e31af5d9 100644 --- a/rayllm/backend/server/trtllm/trtllm_deployment.py +++ b/rayllm/backend/server/trtllm/trtllm_deployment.py @@ -30,7 +30,7 @@ async def stream( prompt: Prompt, priority=None, ): - sample_params = TRTLLMSamplingParams.parse_obj(prompt.parameters) + sample_params = TRTLLMSamplingParams.model_validate(prompt.parameters) logger.info(f"Received request {request_id}") prompt_text = ( diff --git a/rayllm/backend/server/utils.py b/rayllm/backend/server/utils.py index 1601c994..5840397a 100644 --- a/rayllm/backend/server/utils.py +++ b/rayllm/backend/server/utils.py @@ -71,7 +71,7 @@ def parse_args( else: raise else: - parsed_models = [llm_app_cls.parse_obj(raw_model)] + parsed_models = [llm_app_cls.model_validate(raw_model)] models += parsed_models return [model for model in models if model.enabled] From c5788a64dd5e9da4f46033a1b262738c6d81d08f Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 6 May 2024 02:51:43 +0000 Subject: [PATCH 07/11] Refactor Signed-off-by: Wu, Xiaochang --- rayllm/backend/llm/vllm/vllm_compatibility.py | 44 ++++--------------- rayllm/backend/llm/vllm/vllm_engine.py | 6 ++- 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/rayllm/backend/llm/vllm/vllm_compatibility.py b/rayllm/backend/llm/vllm/vllm_compatibility.py index 76a6b69f..f4dd8fa0 100644 --- a/rayllm/backend/llm/vllm/vllm_compatibility.py +++ b/rayllm/backend/llm/vllm/vllm_compatibility.py @@ -1,17 +1,10 @@ import logging import time -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, List, Optional, Type, Union import ray from ray.util.placement_group import PlacementGroup from transformers.dynamic_module_utils import init_hf_modules -from vllm.config import CacheConfig as VllmCacheConfig -from vllm.config import ModelConfig as VllmModelConfig -from vllm.config import ParallelConfig as VllmParallelConfig -from vllm.config import SchedulerConfig as VllmSchedulerConfig -from vllm.config import EngineConfig as VllmEngineConfig -from vllm.config import VisionLanguageConfig as VllmVisionLanguageConfig -from vllm.config import SpeculativeConfig as VllmSpeculativeConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine, AsyncStream, _AsyncLLMEngine @@ -35,10 +28,6 @@ logger = logging.getLogger(__name__) -VllmConfigs = Tuple[ - VllmCacheConfig, VllmModelConfig, VllmParallelConfig, VllmSchedulerConfig, VllmVisionLanguageConfig, VllmSpeculativeConfig, -] - class AviaryLLMEngine(_AsyncLLMEngine): def __init__(self, *args, runtime_env: dict = {}, **kwargs): self.runtime_env = runtime_env @@ -108,16 +97,17 @@ def _record_system_stats(self, *args, **kwargs): return last_stats -def _get_vllm_engine_config(vllm_app) -> Tuple[AsyncEngineArgs, VllmConfigs]: - # Generate engine arguements and engine configs +def _get_vllm_engine_config(vllm_app) -> AsyncEngineArgs: + device = vllm_app.engine_config.engine_kwargs.get("device", "gpu") + # Generate engine arguements and engine configs async_engine_args = AsyncEngineArgs( # This is the local path on disk, or the hf model id # If it is the hf_model_id, vllm automatically downloads the correct model. **dict( model=vllm_app.engine_config.actual_hf_model_id, - # worker_use_ray=True, - worker_use_ray=False, + # vLLM for CPU doesn't support Ray workers for model parallelism yet + worker_use_ray=True if device == "gpu" else False, engine_use_ray=False, tensor_parallel_size=vllm_app.placement_config.world_size, max_model_len=vllm_app.engine_config.max_total_tokens, @@ -126,10 +116,8 @@ def _get_vllm_engine_config(vllm_app) -> Tuple[AsyncEngineArgs, VllmConfigs]: **vllm_app.engine_config.get_initialization_kwargs(), ) ) - config = async_engine_args.create_engine_config() - vllm_configs = (config.cache_config, config.model_config, config.parallel_config, - config.scheduler_config, config.vision_language_config, config.speculative_config) - return async_engine_args, vllm_configs + + return async_engine_args class AviaryAsyncLLMEngine(AsyncLLMEngine): @@ -172,26 +160,12 @@ def from_llm_app( # torch to have access to CUDA devices. We use a remote task # with `num_gpus` set here, so the type check happens in an environment # with `CUDA_VISIBLE_DEVICES` set. - engine_args, engine_configs = ray.get( + engine_args = ray.get( ray.remote(_get_vllm_engine_config) .options(**scaling_config) .remote(vllm_app) ) - # Create the async LLM engine. - # engine = cls( - # engine_args.worker_use_ray, - # engine_args.engine_use_ray, - # *engine_configs, - # None, - # placement_group, - # runtime_env=runtime_env, - # log_requests=not engine_args.disable_log_requests, - # log_stats=not engine_args.disable_log_stats, - # max_log_len=engine_args.max_log_len, - # start_engine_loop=True, - # ) - engine = cls.from_engine_args(engine_args, start_engine_loop=True) return engine diff --git a/rayllm/backend/llm/vllm/vllm_engine.py b/rayllm/backend/llm/vllm/vllm_engine.py index 04ef8734..617784b6 100644 --- a/rayllm/backend/llm/vllm/vllm_engine.py +++ b/rayllm/backend/llm/vllm/vllm_engine.py @@ -59,8 +59,10 @@ def __init__( self.llm_app = llm_app.copy(deep=True) self.engine_config = llm_app.engine_config self.placement_config = llm_app.placement_config - # if not (self.placement_config.scaling_config.num_gpus_per_worker > 0): - # raise ValueError("The VLLM Engine Requires > 0 GPUs to run.") + + device = self.engine_config.engine_kwargs.get("device", "gpu") + if device == "gpu" and not (self.placement_config.scaling_config.num_gpus_per_worker > 0): + raise ValueError("The VLLM Engine Requires > 0 GPUs to run.") self.node_initializer = node_initializer or LLMNodeInitializer( local_node_tokenizer_only=True From 26580170973ff409525b596d2b9814998582ae9d Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 6 May 2024 03:08:05 +0000 Subject: [PATCH 08/11] Update backend requirements Signed-off-by: Wu, Xiaochang --- requirements-backend.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements-backend.txt b/requirements-backend.txt index 5bbd8058..ea4367b6 100644 --- a/requirements-backend.txt +++ b/requirements-backend.txt @@ -11,13 +11,12 @@ Jinja2 numexpr>=2.7.3 hf_transfer evaluate -# vllm is installed seperately from source for now -# vllm>=0.2.0,<0.2.6 +vllm>=0.4.1 numpy<1.24 ninja protobuf<3.21.0 safetensors -pydantic~=1.10.0 +pydantic>=2.0 einops markdown-it-py[plugins] fastapi-versioning From a786c12654ebf2e56f0698a9811cf163d6be81a6 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 6 May 2024 03:29:39 +0000 Subject: [PATCH 09/11] fix Signed-off-by: Wu, Xiaochang --- rayllm/backend/server/models.py | 3 +-- rayllm/common/models.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/rayllm/backend/server/models.py b/rayllm/backend/server/models.py index 9d84c218..a56e8086 100644 --- a/rayllm/backend/server/models.py +++ b/rayllm/backend/server/models.py @@ -11,7 +11,6 @@ Literal, Optional, Protocol, - Self, Set, Tuple, Type, @@ -201,7 +200,7 @@ class AviaryModelResponse(ComputedPropertyMixin, BaseModelExtended): error: Optional[ErrorResponse] = None @model_validator(mode='after') - def text_or_error_or_finish_reason(self) -> Self: + def text_or_error_or_finish_reason(self): if ( self.generated_text is None and self.embedding_outputs is None diff --git a/rayllm/common/models.py b/rayllm/common/models.py index a6c00a88..775eaf14 100644 --- a/rayllm/common/models.py +++ b/rayllm/common/models.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Self, TypeVar, Union +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypeVar, Union from fastapi import HTTPException, status from pydantic import ConfigDict, BaseModel, field_validator, model_validator @@ -184,7 +184,7 @@ def __str__(self): return str(self.model_dump()) @model_validator(mode='after') - def check_fields(self) -> Self: + def check_fields(self): if self.role in ["system", "user"]: if not isinstance(self.content, str): raise ValueError("content must be a string") @@ -358,7 +358,7 @@ def check_user(cls, value): return value @model_validator(mode='after') - def check_user_system_in_user(self) -> Self: + def check_user_system_in_user(self): if self.system_in_user: assert ( "{system}" in self.user From 19e9542be3abecd1e8a9bce001180c406efdbc4e Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 6 May 2024 03:38:38 +0000 Subject: [PATCH 10/11] fix Signed-off-by: Wu, Xiaochang --- requirements-backend.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-backend.txt b/requirements-backend.txt index ea4367b6..890e74f1 100644 --- a/requirements-backend.txt +++ b/requirements-backend.txt @@ -21,6 +21,7 @@ einops markdown-it-py[plugins] fastapi-versioning scipy +redis opentelemetry-api>=1.15.0 opentelemetry-sdk opentelemetry-exporter-otlp From 605ae4a0c64ad0932af0ff6d01e38cf5f61259d0 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 6 May 2024 04:07:26 +0000 Subject: [PATCH 11/11] nit Signed-off-by: Wu, Xiaochang --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f4d55bdb..93e0200d 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,6 @@ "test": required_dev, "docs": required_docs, }, - # dependency_links=["https://download.pytorch.org/whl/cu118"], - dependency_links=["https://download.pytorch.org/whl/cpu"], + dependency_links=["https://download.pytorch.org/whl/cu118"], python_requires=">=3.8", )