Skip to content

Commit

Permalink
Remove max_running_time_per_request
Browse files Browse the repository at this point in the history
  • Loading branch information
cthiriet committed Jul 12, 2024
1 parent 12fb124 commit 0443c65
Show file tree
Hide file tree
Showing 4 changed files with 4 additions and 42 deletions.
3 changes: 1 addition & 2 deletions sagemaker/configs/llama3-70b.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
{
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"image": "vllm:0.5.0.post1-1",
"image": "vllm:0.5.1-1",
"sagemaker_instance_type": "ml.p4d.24xlarge",
"env_vars": {
"PIPELINE_PARALLEL_SIZE": "1",
"TENSOR_PARALLEL_SIZE": "8",
"MAX_RUNNING_TIME_PER_REQUEST": "180",
"DISABLE_CUSTOM_ALL_REDUCE": "true"
}
}
3 changes: 1 addition & 2 deletions sagemaker/configs/llama3-8b.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
{
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"image": "vllm:0.5.0.post1-1",
"image": "vllm:0.5.1-1",
"sagemaker_instance_type": "ml.g5.2xlarge",
"env_vars": {
"PIPELINE_PARALLEL_SIZE": "1",
"TENSOR_PARALLEL_SIZE": "1",
"MAX_RUNNING_TIME_PER_REQUEST": "180",
"DISABLE_CUSTOM_ALL_REDUCE": "true"
}
}
13 changes: 1 addition & 12 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,6 @@ class AsyncEngineArgs(EngineArgs):
engine_use_ray: bool = False
disable_log_requests: bool = False
max_log_len: Optional[int] = None
max_running_time_per_request: Optional[int] = None

@staticmethod
def add_cli_args(parser: FlexibleArgumentParser,
Expand All @@ -814,16 +813,6 @@ def add_cli_args(parser: FlexibleArgumentParser,
help='Max number of prompt characters or prompt '
'ID numbers being printed in log.'
'\n\nDefault: Unlimited')
parser.add_argument(
"--max-running-time-per-request",
type=int,
default=int(os.getenv('MAX_RUNNING_TIME_PER_REQUEST'))
if os.getenv('MAX_RUNNING_TIME_PER_REQUEST') else
AsyncEngineArgs.max_running_time_per_request,
help=
('Maximum running time (in seconds) allowed for each request. '
'If the request takes longer than this time, it will be terminated. '
'If not specified, the request will not be terminated.'))
return parser


Expand All @@ -834,4 +823,4 @@ def _engine_args_parser():

def _async_engine_args_parser():
return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(),
async_args_only=True)
async_args_only=True)
27 changes: 1 addition & 26 deletions vllm/engine/async_llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,6 @@ def __init__(self,
log_requests: bool = True,
max_log_len: Optional[int] = None,
start_engine_loop: bool = True,
max_running_time_per_request: Optional[int] = None,
**kwargs) -> None:
self.worker_use_ray = worker_use_ray
self.engine_use_ray = engine_use_ray
Expand All @@ -366,7 +365,6 @@ def __init__(self,
# collected
self._background_loop_unshielded: Optional[asyncio.Task] = None
self.start_engine_loop = start_engine_loop
self.max_running_time_per_request = max_running_time_per_request
self._errored_with: Optional[BaseException] = None

# Lazy initialized fields
Expand Down Expand Up @@ -440,8 +438,6 @@ def from_engine_args(
max_log_len=engine_args.max_log_len,
start_engine_loop=start_engine_loop,
usage_context=usage_context,
max_running_time_per_request=engine_args.
max_running_time_per_request,
)
return engine

Expand Down Expand Up @@ -515,27 +511,6 @@ async def engine_step(self, virtual_engine: int) -> bool:
Returns True if there are in-progress requests."""

if self.max_running_time_per_request:
# Fix the current time.
now = time.time()

requests_to_be_aborted_due_to_timeout = []
for seq_group in self.engine.scheduler.running:
if seq_group.metrics.first_scheduled_time is None:
continue
time_in_the_queue = now - seq_group.metrics.first_scheduled_time
            # if the sequence group has been in the running queue for more than `max_running_time_per_request` seconds
# abort the sequence group.
if time_in_the_queue > self.max_running_time_per_request:
logger.warning(
f"Request {seq_group.request_id} has been in the "
f"queue for {time_in_the_queue:.2f} seconds. Abort.")
requests_to_be_aborted_due_to_timeout.append(
seq_group.request_id)

for req_id in requests_to_be_aborted_due_to_timeout:
await self.abort(req_id)

new_requests, finished_requests = (
self._request_tracker.get_new_and_finished_requests())

Expand Down Expand Up @@ -959,4 +934,4 @@ async def is_tracing_enabled(self) -> bool:
return await self.engine.is_tracing_enabled.remote( # type: ignore
)
else:
return self.engine.is_tracing_enabled()
return self.engine.is_tracing_enabled()

0 comments on commit 0443c65

Please sign in to comment.