diff --git a/models/Google--GcpGPT b/models/Google--GcpGPT
new file mode 100644
index 00000000..c130340c
--- /dev/null
+++ b/models/Google--GcpGPT
@@ -0,0 +1,51 @@
+# Serving configuration for the google/flan-t5-base model (see model_id).
+# Sections: deployment autoscaling, engine/generation settings, worker scaling.
+deployment_config:
+  autoscaling_config:
+    # Replica bounds: min 1, initial 1, max 8.
+    min_replicas: 1
+    initial_replicas: 1
+    max_replicas: 8
+    target_num_ongoing_requests_per_replica: 24
+    metrics_interval_s: 10.0
+    look_back_period_s: 30.0
+    smoothing_factor: 0.5
+    # Downscale delay (300 s) is much longer than upscale delay (15 s).
+    downscale_delay_s: 300.0
+    upscale_delay_s: 15.0
+  max_concurrent_queries: 64
+  ray_actor_options:
+    resources:
+      # Same fractional A10 value (0.01) as scaling_config.resources_per_worker.
+      accelerator_type_a10: 0.01
+engine_config:
+  model_id: google/flan-t5-base
+  hf_model_id: google/flan-t5-base
+  type: VLLMEngine
+  engine_kwargs:
+    # NOTE(review): presumably permits running code from the model repo - confirm it is required.
+    trust_remote_code: true
+    max_num_batched_tokens: 4096
+    max_num_seqs: 64
+    gpu_memory_utilization: 0.95
+  max_total_tokens: 2048
+  generation:
+    prompt_format:
+      system: "{instruction}\n"
+      assistant: "### Response:\n{instruction}\n"
+      trailing_assistant: "### Response:\n"
+      user: "### Instruction:\n{instruction}\n"
+      # NOTE(review): default_system_message was declared twice in a row here.
+      # Duplicate keys are invalid YAML; parsers that accept them typically keep
+      # the last value, so the effective "" is retained. The shadowed first value
+      # was: "Below is an instruction that describes a task. Write a response
+      # that appropriately completes the request." Restore it if it was intended.
+      default_system_message: ""
+    stopping_sequences: ["### Response:", "### End"]
+scaling_config:
+  num_workers: 1
+  num_gpus_per_worker: 1
+  num_cpus_per_worker: 8
+  placement_strategy: "STRICT_PACK"
+  resources_per_worker:
+    accelerator_type_a10: 0.01