diff --git a/requirements-common.txt b/requirements-common.txt
index 3b8d473c1fe7a..746ba8d31552f 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -18,7 +18,7 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0 # Required for DBRX tokenizer
 lm-format-enforcer == 0.10.3
-outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+outlines >= 0.0.46, < 0.1 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
 pyzmq
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 1c8f6cccb3e9a..c527f23ced1f4 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -21,12 +21,12 @@
 from typing import Callable, DefaultDict, Dict, List, Union
 
 import torch
-from outlines.caching import cache
 from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write
 from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
 
+from vllm.hpu.utils import with_mark_steps
 
 class BaseLogitsProcessor:
 
@@ -34,6 +34,7 @@ def __init__(self, guide: Guide):
         self._guide: Guide = guide
         self._fsm_state: DefaultDict[int, int] = defaultdict(int)
 
+    @with_mark_steps
     def __call__(self, input_ids: List[int],
                  scores: torch.Tensor) -> torch.Tensor:
         """Use the FSM to bias the logits before sampling the next token."""
@@ -61,14 +62,14 @@ def __call__(self, input_ids: List[int],
                           -math.inf,
                           device=scores.device)
         mask[allowed_tokens] = 0
-        scores.add_(mask)
+        scores = scores.add(mask)
         return scores
 
 
 class RegexLogitsProcessor(BaseLogitsProcessor):
 
     @classmethod
-    @cache()
+    @lru_cache(maxsize=32)
     def _get_guide(cls, regex_string: str,
                    tokenizer: PreTrainedTokenizerBase) -> Guide:
         tokenizer = _adapt_tokenizer(tokenizer)
@@ -127,7 +128,7 @@ def __init__(self, schema: Union[str, Dict, BaseModel],
 class CFGLogitsProcessor(BaseLogitsProcessor):
 
     @classmethod
-    @cache()
+    @lru_cache(maxsize=32)
     def _get_guide(cls, cfg: str, tokenizer: PreTrainedTokenizerBase) -> Guide:
         tokenizer = _adapt_tokenizer(tokenizer)
         return CFGGuide(cfg, tokenizer)
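
Note on the new decorator: with_mark_steps is imported from vllm/hpu/utils.py, whose
body is not part of this patch. A rough, hypothetical sketch of what a decorator with
that name could look like, assuming it simply brackets the wrapped call with HPU
lazy-mode graph boundaries via habana_frameworks.torch.core.mark_step():

# Hypothetical sketch only -- not the contents of vllm/hpu/utils.py.
# Assumes a Gaudi/HPU environment where habana_frameworks.torch is importable.
import functools

import habana_frameworks.torch.core as htcore


def with_mark_steps(fn):
    """Wrap fn so HPU lazy mode flushes accumulated ops around the call."""

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        htcore.mark_step()            # close the graph built before the call
        result = fn(*args, **kwargs)
        htcore.mark_step()            # close the graph built by the call itself
        return result

    return wrapper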
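
Note on scores.add_(mask) -> scores = scores.add(mask): Tensor.add_ mutates the logits
tensor (and anything aliasing its storage) in place, while Tensor.add returns a new
tensor and leaves the original buffer untouched, which is generally friendlier to
lazily-executed or graph-captured backends. A small CPU-only illustration of the
aliasing difference:

import torch

scores = torch.zeros(4)
mask = torch.tensor([0.0, float("-inf"), 0.0, float("-inf")])

# In-place: the original storage is modified, so any alias sees the change.
alias = scores
scores.add_(mask)
assert alias[1] == float("-inf")

# Out-of-place: a new tensor is returned; the original buffer is untouched.
scores = torch.zeros(4)
alias = scores
scores = scores.add(mask)
assert alias[1] == 0.0 and scores[1] == float("-inf")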
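
Note on @cache() -> @lru_cache(maxsize=32): outlines' cache() decorator memoizes
through outlines' own caching layer, whereas functools.lru_cache keeps at most 32
compiled guides per processor class in process memory. A toy example (GuideFactory
and its argument are made up for illustration) showing that the stacked
@classmethod / @lru_cache pair reuses the first result for repeated, equal arguments:

from functools import lru_cache


class GuideFactory:
    calls = 0

    @classmethod
    @lru_cache(maxsize=32)
    def _get_guide(cls, pattern: str) -> str:
        # Stands in for the expensive RegexGuide/CFGGuide construction.
        cls.calls += 1
        return f"guide-for:{pattern}"


a = GuideFactory._get_guide(r"\d+")
b = GuideFactory._get_guide(r"\d+")   # served from the LRU cache
assert a is b and GuideFactory.calls == 1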