HabanaAI · tae-su-kim · Aug 5, 2024 · Aug 23, 2024 · Sep 9, 2024 · Sep 23, 2024
diff --git a/requirements-common.txt b/requirements-common.txt
@@ -18,7 +18,7 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0 # Required for DBRX tokenizer
 lm-format-enforcer == 0.10.3
-outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+outlines >= 0.0.46, < 0.1 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
 pyzmq
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -27,13 +27,15 @@
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
 
+from vllm.hpu.utils import with_mark_steps
 
 class BaseLogitsProcessor:
 
  def __init__(self, guide: Guide):
  self._guide: Guide = guide
  self._fsm_state: DefaultDict[int, int] = defaultdict(int)
 
+ @with_mark_steps
  def __call__(self, input_ids: List[int],
  scores: torch.Tensor) -> torch.Tensor:
  """Use the FSM to bias the logits before sampling the next token."""
@@ -57,18 +59,16 @@ def __call__(self, input_ids: List[int],
  raise TypeError(
  f"Unsupported instruction type {type(instruction)}")
 
- mask = torch.full((scores.shape[-1], ),
- -math.inf,
- device=scores.device)
+ mask = torch.ones((scores.shape[-1], ), device=scores.device, dtype=torch.bool)
  mask[allowed_tokens] = 0
- scores.add_(mask)
+ scores.masked_fill_(mask, -math.inf)
  return scores
 
 
 class RegexLogitsProcessor(BaseLogitsProcessor):
 
  @classmethod
- @cache()
+ @lru_cache(maxsize=32)
  def _get_guide(cls, regex_string: str,
  tokenizer: PreTrainedTokenizerBase) -> Guide:
  tokenizer = _adapt_tokenizer(tokenizer)
@@ -127,7 +127,7 @@ def __init__(self, schema: Union[str, Dict, BaseModel],
 class CFGLogitsProcessor(BaseLogitsProcessor):
 
  @classmethod
- @cache()
+ @lru_cache(maxsize=32)
  def _get_guide(cls, cfg: str, tokenizer: PreTrainedTokenizerBase) -> Guide:
  tokenizer = _adapt_tokenizer(tokenizer)
  return CFGGuide(cfg, tokenizer)