@@ -1,13 +1,11 @@
 from __future__ import annotations

-import copy
+import copy, enum
 import threading
-from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar, get_args
+from typing import TYPE_CHECKING, Optional, TypeVar

-from transformers import PreTrainedTokenizer
 import xgrammar as xgr

 from vllm.config import ModelConfig
@@ -17,8 +15,7 @@
 from .grammar import Grammar

 if TYPE_CHECKING:
-    from transformers import PreTrainedTokenizer
-    from typing_extensions import LiteralString
+    from typing_extensions import Self

     from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup

@@ -31,18 +28,11 @@

 @dataclass
 class GrammarCache:
-    value: Grammar | None
+    value: Optional[Grammar]
     event: threading.Event


-T = TypeVar("T", bound=str)
-
-
-class GuidedDecodingManager(ABC, Generic[T]):
-
-    @abstractmethod
-    def initialize_cache(self, key: GuidedDecodingKey) -> Grammar:
-        ...
+class GuidedDecodingManager:

     def flush(self):
         with self._lock:
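Here `GrammarCache` pairs a possibly not-yet-compiled grammar with a `threading.Event`, so the first request for a key can compile while later requests for the same key block on the event instead of recompiling. A minimal sketch of that pattern, with assumed semantics (the cache internals are elided from this diff; `_Entry`, `get_or_compile`, and the string payload are illustrative stand-ins for the real `Grammar` machinery):

```python
import threading
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Entry:
    value: Optional[str]  # stand-in for a compiled Grammar
    event: threading.Event


_cache: dict[str, _Entry] = {}
_lock = threading.Lock()


def get_or_compile(key: str) -> Optional[str]:
    with _lock:
        entry = _cache.get(key)
        if entry is None:
            # First caller becomes the owner and compiles outside the lock.
            entry = _cache[key] = _Entry(None, threading.Event())
            owner = True
        else:
            owner = False
    if owner:
        entry.value = f"compiled({key})"  # placeholder for real compilation
        entry.event.set()
    else:
        entry.event.wait()  # block until the owning thread finishes
    return entry.value
```

Compiling outside the lock keeps the critical section down to a dictionary probe, which matters once grammar compilation costs milliseconds per key.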
@@ -84,68 +74,21 @@ def collect(self, request: Request):
             return True
         return False

-    @classmethod
-    def from_backend(cls,
-                     backend: LiteralString = "xgrammar",
-                     /,
-                     *,
-                     tokenizer_group: BaseTokenizerGroup,
-                     model_config: ModelConfig) -> GuidedDecodingManager[T]:
-        manager_cls = cls._registry.get(backend)
-        if manager_cls is None:
-            raise ValueError(
-                f"Backend '{backend}' not found in registry. Available backends: {list(cls._registry)}"
-            )
-        return manager_cls(tokenizer_group=tokenizer_group,
-                           model_config=model_config)
-
-    _registry: dict[str, type[GuidedDecodingManager[T]]] = {}
-    _backend: T
-
-    def __init__(self, *, tokenizer_group: BaseTokenizerGroup,
+    def __init__(self, *, backend: str, tokenizer_group: BaseTokenizerGroup,
                  model_config: ModelConfig):
+        self._backend = backend
         self.model_config = model_config
         self.tokenizer = tokenizer_group.get_lora_tokenizer(None)
         self.grammar_cache: dict[GuidedDecodingKey, GrammarCache] = {}
         self.executor = ThreadPoolExecutor()
         self._lock = threading.Lock()
-
-    def __init_subclass__(cls, **kwargs: Any):
-        if not hasattr(cls, '__orig_bases__'):
-            raise TypeError(
-                f"{cls.__qualname__} must be subclass of GuidedDecodingManager"
-            )
-
-        backend = None
-        for base in cls.__orig_bases__:
-            if (origin := get_args(base)) and issubclass(
-                    base.__origin__, GuidedDecodingManager):
-                backend = get_args(origin[0])[0]
-                break
-
-        if backend is None:
-            raise TypeError(
-                f"Class {cls.__qualname__} must specify backend as a Literal type"
-            )
-
-        if backend in cls._registry:
-            name = cls._registry[backend].__qualname__
-            raise ValueError(
-                f"Backend '{backend}' is already registered to {name}")
-
-        # Set the backend value from the Literal type
-        cls._backend = backend
-        cls._registry[backend] = cls

-
-class XGrammarManager(GuidedDecodingManager[Literal["xgrammar"]]):
-    # cache GrammarCompiler instances based on given tokenizer
-    _compiler_cache: dict[str, xgr.GrammarCompiler] = {}
-    _compiler: xgr.GrammarCompiler | None = None
-
-    def initialize_cache(self, key: GuidedDecodingKey) -> XGrammar:
+    def initialize_cache(self, key: GuidedDecodingKey) -> Grammar:
         request_type, grammar_spec = key
-        compiler = XGrammarManager.get_compiler(self.tokenizer)
+        tokenizer_info = xgr.TokenizerInfo.from_huggingface(
+            self.tokenizer,
+            vocab_size=self.model_config.hf_text_config.vocab_size)
+        compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8)
         if request_type == "json":
             if type(grammar_spec) is not str:
                 ctx = compiler.compile_builtin_json_grammar()
@@ -155,35 +98,6 @@ def initialize_cache(self, key: GuidedDecodingKey) -> XGrammar:
                 ctx = compiler.compile_grammar(grammar_spec)
         else:
             raise ValueError("grammar is not of valid supported types.")
-        return Grammar.from_backend(
-            self._backend,
-            matcher=xgr.GrammarMatcher(ctx),
-            vocab_size=self.model_config.hf_text_config.vocab_size,
-            ctx=ctx)
-
-    def flush(self):
-        super().flush()
-        if self._compiler: self._compiler.clear_cache()
-        for compiler in self._compiler_cache.values():
-            compiler.clear_cache()
-        self._compiler_cache.clear()
-
-    @classmethod
-    def get_compiler(
-            cls,
-            tokenizer: PreTrainedTokenizer,
-            *,
-            max_threads: int = 8,
-            # passthrough to TokenizerInfo
-            vocab_size: int | None = None,
-            stop_token_ids: list[int] | int | None = None
-    ) -> xgr.GrammarCompiler:
-        cache_key = str(hash(tokenizer))
-        if cache_key not in cls._compiler_cache:
-            tokenizer_info = xgr.TokenizerInfo.from_huggingface(
-                tokenizer,
-                stop_token_ids=stop_token_ids,
-                vocab_size=vocab_size)
-            cls._compiler_cache[cache_key] = xgr.GrammarCompiler(
-                tokenizer_info, max_threads=max_threads)
-        return cls._compiler_cache[cache_key]
+        return Grammar(matcher=xgr.GrammarMatcher(ctx),
+                       vocab_size=self.model_config.hf_text_config.vocab_size,
+                       ctx=ctx)
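With the registry and `__init_subclass__` machinery gone, callers pick a backend by passing a plain string to the constructor instead of going through `from_backend`. A rough usage sketch under assumed surroundings (the `tokenizer_group` and `model_config` objects would come from engine setup elsewhere in vLLM; the `("json", ...)` key shape follows the `request_type, grammar_spec = key` unpacking above):

```python
# Hypothetical wiring; in vLLM these objects come from the engine setup.
manager = GuidedDecodingManager(
    backend="xgrammar",
    tokenizer_group=tokenizer_group,  # a BaseTokenizerGroup instance
    model_config=model_config,        # a ModelConfig instance
)

# Compile (or fetch) a grammar for a JSON-schema request.
grammar = manager.initialize_cache(("json", '{"type": "object"}'))
```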