diff --git a/vllm/config.py b/vllm/config.py index 41a30efea039..16e9ef409d9d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import ast import copy import enum @@ -22,17 +24,12 @@ import torch from pydantic import BaseModel, Field, PrivateAttr from torch.distributed import ProcessGroup, ReduceOp -from transformers import PretrainedConfig import vllm.envs as envs from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, - get_quantization_config) -from vllm.model_executor.models import ModelRegistry -from vllm.platforms import CpuArchEnum, current_platform +from vllm.platforms import CpuArchEnum from vllm.sampling_params import GuidedDecodingParams -from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, @@ -40,13 +37,15 @@ try_get_generation_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect -from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, - get_cpu_memory, get_open_port, is_torch_equal_or_newer, - random_uuid, resolve_obj_by_qualname) +from vllm.utils import (GiB_bytes, LayerBlockType, LazyLoader, + cuda_device_count_stateless, get_cpu_memory, + get_open_port, is_torch_equal_or_newer, random_uuid, + resolve_obj_by_qualname) if TYPE_CHECKING: from _typeshed import DataclassInstance from ray.util.placement_group import PlacementGroup + from transformers import PretrainedConfig from vllm.executor.executor_base import ExecutorBase from vllm.model_executor.layers.quantization.base_config import ( @@ -54,10 +53,16 @@ from vllm.model_executor.model_loader.loader import BaseModelLoader ConfigType = type[DataclassInstance] + HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], + PretrainedConfig]] else: - QuantizationConfig = None + HfOverrides = None ConfigType = type +me_quant = LazyLoader("model_executor", globals(), + "vllm.model_executor.layers.quantization") +me_models = LazyLoader("model_executor", globals(), + "vllm.model_executor.models") logger = init_logger(__name__) ConfigT = TypeVar("ConfigT", bound=ConfigType) @@ -89,9 +94,6 @@ for task in tasks } -HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], - PretrainedConfig]] - class SupportsHash(Protocol): @@ -365,7 +367,7 @@ def __init__( mm_processor_kwargs: Optional[dict[str, Any]] = None, disable_mm_preprocessor_cache: bool = False, override_neuron_config: Optional[dict[str, Any]] = None, - override_pooler_config: Optional["PoolerConfig"] = None, + override_pooler_config: Optional[PoolerConfig] = None, logits_processor_pattern: Optional[str] = None, generation_config: str = "auto", enable_sleep_mode: bool = False, @@ -548,7 +550,7 @@ def __init__( @property def registry(self): - return ModelRegistry + return me_models.ModelRegistry @property def architectures(self) -> list[str]: @@ -581,7 +583,7 @@ def maybe_pull_model_tokenizer_for_s3(self, model: str, def _init_multimodal_config( self, limit_mm_per_prompt: Optional[dict[str, int]] - ) -> Optional["MultiModalConfig"]: + ) -> Optional[MultiModalConfig]: if self.registry.is_multimodal_model(self.architectures): return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {}) @@ -597,8 +599,8 @@ def _get_encoder_config(self): def _init_pooler_config( self, - override_pooler_config: Optional["PoolerConfig"], - ) -> Optional["PoolerConfig"]: + override_pooler_config: Optional[PoolerConfig], + ) -> Optional[PoolerConfig]: if self.runner_type == "pooling": user_config = override_pooler_config or PoolerConfig() @@ -749,7 +751,8 @@ def _parse_quant_hf_config(self): return quant_cfg def _verify_quantization(self) -> None: - supported_quantization = QUANTIZATION_METHODS + supported_quantization = me_quant.QUANTIZATION_METHODS + optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed_tensors", @@ -766,8 +769,8 @@ def _verify_quantization(self) -> None: quant_method = quant_cfg.get("quant_method", "").lower() # Detect which checkpoint is it - for name in QUANTIZATION_METHODS: - method = get_quantization_config(name) + for name in me_quant.QUANTIZATION_METHODS: + method = me_quant.get_quantization_config(name) quantization_override = method.override_quantization_method( quant_cfg, self.quantization) if quantization_override: @@ -799,6 +802,8 @@ def _verify_quantization(self) -> None: "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: + from vllm.platforms import current_platform + if self.max_seq_len_to_capture is None: self.max_seq_len_to_capture = self.max_model_len self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, @@ -885,7 +890,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, def verify_with_parallel_config( self, - parallel_config: "ParallelConfig", + parallel_config: ParallelConfig, ) -> None: if parallel_config.distributed_executor_backend == "external_launcher": @@ -1038,7 +1043,7 @@ def get_total_num_kv_heads(self) -> int: # equal to the number of attention heads. return self.hf_text_config.num_attention_heads - def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: + def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: """Returns the number of KV heads per GPU.""" if self.use_mla: # When using MLA during decode it becomes MQA @@ -1052,13 +1057,12 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) - def get_num_attention_heads(self, - parallel_config: "ParallelConfig") -> int: + def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int: num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) return num_heads // parallel_config.tensor_parallel_size def get_layers_start_end_indices( - self, parallel_config: "ParallelConfig") -> tuple[int, int]: + self, parallel_config: ParallelConfig) -> tuple[int, int]: from vllm.distributed.utils import get_pp_indices if self.hf_text_config.model_type == "deepseek_mtp": total_num_hidden_layers = getattr(self.hf_text_config, @@ -1073,13 +1077,13 @@ def get_layers_start_end_indices( start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) return start, end - def get_num_layers(self, parallel_config: "ParallelConfig") -> int: + def get_num_layers(self, parallel_config: ParallelConfig) -> int: start, end = self.get_layers_start_end_indices(parallel_config) return end - start def get_num_layers_by_block_type( self, - parallel_config: "ParallelConfig", + parallel_config: ParallelConfig, block_type: LayerBlockType = LayerBlockType.attention, ) -> int: # This function relies on 'layers_block_type' in hf_config, @@ -1132,7 +1136,7 @@ def get_num_layers_by_block_type( return sum(t == 1 for t in attn_type_list[start:end]) - def get_multimodal_config(self) -> "MultiModalConfig": + def get_multimodal_config(self) -> MultiModalConfig: """ Get the multimodal configuration of the model. @@ -1241,7 +1245,7 @@ def runner_type(self) -> RunnerType: @property def is_v1_compatible(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) - return ModelRegistry.is_v1_compatible(architectures) + return me_models.ModelRegistry.is_v1_compatible(architectures) @property def is_matryoshka(self) -> bool: @@ -1392,7 +1396,7 @@ def _verify_prefix_caching(self) -> None: def verify_with_parallel_config( self, - parallel_config: "ParallelConfig", + parallel_config: ParallelConfig, ) -> None: total_cpu_memory = get_cpu_memory() # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel @@ -1460,7 +1464,7 @@ class LoadConfig: """Configuration for loading the model weights.""" load_format: Union[str, LoadFormat, - "BaseModelLoader"] = LoadFormat.AUTO.value + BaseModelLoader] = LoadFormat.AUTO.value """The format of the model weights to load:\n - "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.\n @@ -1582,11 +1586,11 @@ def data_parallel_rank_local(self, value: int) -> None: ray_workers_use_nsight: bool = False """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" - placement_group: Optional["PlacementGroup"] = None + placement_group: Optional[PlacementGroup] = None """ray distributed model workers placement group.""" distributed_executor_backend: Optional[Union[DistributedExecutorBackend, - type["ExecutorBase"]]] = None + type[ExecutorBase]]] = None """Backend to use for distributed model workers, either "ray" or "mp" (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size is less than @@ -1629,7 +1633,7 @@ def get_next_dp_init_port(self) -> int: self.data_parallel_master_port += 1 return answer - def stateless_init_dp_group(self) -> "ProcessGroup": + def stateless_init_dp_group(self) -> ProcessGroup: from vllm.distributed.utils import ( stateless_init_torch_distributed_process_group) @@ -1644,7 +1648,7 @@ def stateless_init_dp_group(self) -> "ProcessGroup": return dp_group @staticmethod - def has_unfinished_dp(dp_group: "ProcessGroup", + def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool: tensor = torch.tensor([has_unfinished], dtype=torch.int32, @@ -2227,7 +2231,7 @@ def compute_hash(self) -> str: return hash_str @classmethod - def from_dict(cls, dict_value: dict) -> "SpeculativeConfig": + def from_dict(cls, dict_value: dict) -> SpeculativeConfig: """Parse the CLI value for the speculative config.""" return cls(**dict_value) @@ -2819,7 +2823,7 @@ def compute_hash(self) -> str: return hash_str @staticmethod - def from_json(json_str: str) -> "PoolerConfig": + def from_json(json_str: str) -> PoolerConfig: return PoolerConfig(**json.loads(json_str)) @@ -3176,6 +3180,7 @@ def compute_hash(self) -> str: return hash_str def __post_init__(self): + from vllm.tracing import is_otel_available, otel_import_error_traceback if not is_otel_available() and self.otlp_traces_endpoint is not None: raise ValueError( "OpenTelemetry is not available. Unable to configure " @@ -3239,7 +3244,7 @@ def compute_hash(self) -> str: return hash_str @classmethod - def from_cli(cls, cli_value: str) -> "KVTransferConfig": + def from_cli(cls, cli_value: str) -> KVTransferConfig: """Parse the CLI value for the kv cache transfer config.""" return KVTransferConfig.model_validate_json(cli_value) @@ -3476,7 +3481,7 @@ def __repr__(self) -> str: __str__ = __repr__ @classmethod - def from_cli(cls, cli_value: str) -> "CompilationConfig": + def from_cli(cls, cli_value: str) -> CompilationConfig: """Parse the CLI value for the compilation config.""" if cli_value in ["0", "1", "2", "3"]: return cls(level=int(cli_value)) @@ -3528,7 +3533,7 @@ def model_post_init(self, __context: Any) -> None: self.static_forward_context = {} self.compilation_time = 0.0 - def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: + def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -3744,9 +3749,7 @@ def _get_quantization_config( """Get the quantization config.""" from vllm.platforms import current_platform if model_config.quantization is not None: - from vllm.model_executor.model_loader.weight_utils import ( - get_quant_config) - quant_config = get_quant_config(model_config, load_config) + quant_config = me_quant.get_quant_config(model_config, load_config) capability_tuple = current_platform.get_device_capability() if capability_tuple is not None: @@ -3770,7 +3773,7 @@ def with_hf_config( self, hf_config: PretrainedConfig, architectures: Optional[list[str]] = None, - ) -> "VllmConfig": + ) -> VllmConfig: if architectures is not None: hf_config = copy.deepcopy(hf_config) hf_config.architectures = architectures diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9cb2aa797be5..73c6a6854c99 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + # yapf: disable import argparse import dataclasses @@ -7,8 +9,8 @@ import re import threading from dataclasses import MISSING, dataclass, fields -from typing import (Any, Callable, Dict, List, Literal, Optional, Type, - TypeVar, Union, cast, get_args, get_origin) +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Literal, + Optional, Type, TypeVar, Union, cast, get_args, get_origin) import torch from typing_extensions import TypeIs @@ -26,9 +28,7 @@ SchedulerConfig, SchedulerPolicy, SpeculativeConfig, TaskOption, TokenizerPoolConfig, VllmConfig, get_attr_docs, get_field) -from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.plugins import load_general_plugins from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 @@ -38,6 +38,9 @@ # yapf: enable +if TYPE_CHECKING: + from vllm.executor.executor_base import ExecutorBase + logger = init_logger(__name__) ALLOWED_DETAILED_TRACE_MODULES = ["model", "worker", "all"] @@ -73,7 +76,7 @@ def optional_float(val: str) -> Optional[float]: def nullable_kvs(val: str) -> Optional[dict[str, int]]: """NOTE: This function is deprecated, args should be passed as JSON strings instead. - + Parses a string containing comma separate key [str] to value [int] pairs into a dictionary. @@ -303,7 +306,9 @@ def can_be_type(cls: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]: def is_custom_type(cls: TypeHint) -> bool: """Check if the class is a custom type.""" - return cls.__module__ != "builtins" + if isinstance(cls, type): + return cls.__module__ != "builtins" + return True def get_kwargs(cls: ConfigType) -> dict[str, Any]: cls_docs = get_attr_docs(cls) @@ -610,6 +615,9 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: action='store_true', help='Disable logging statistics.') # Quantization settings. + from vllm.model_executor.layers.quantization import ( + QUANTIZATION_METHODS) + parser.add_argument('--quantization', '-q', type=optional_str, @@ -1071,7 +1079,7 @@ def create_speculative_config( target_parallel_config: ParallelConfig, enable_chunked_prefill: bool, disable_log_stats: bool, - ) -> Optional["SpeculativeConfig"]: + ) -> Optional[SpeculativeConfig]: """Initializes and returns a SpeculativeConfig object based on `speculative_config`. @@ -1698,7 +1706,7 @@ def _warn_or_fallback(feature_name: str) -> bool: def human_readable_int(value): """Parse human-readable integers like '1k', '2M', etc. Including decimal values with decimal multipliers. - + Examples: - '1k' -> 1,000 - '1K' -> 1,024 diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index bd2c3357cdc0..dd34a3e6c95e 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import asyncio import json from abc import ABC, abstractmethod @@ -7,11 +9,10 @@ from collections.abc import Awaitable, Iterable from functools import cache, lru_cache, partial from pathlib import Path -from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union, - cast) +from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, Optional, + TypeVar, Union, cast) import jinja2.nodes -import transformers.utils.chat_template_utils as hf_chat_utils # yapf conflicts with isort for this block # yapf: disable from openai.types.chat import (ChatCompletionAssistantMessageParam, @@ -29,8 +30,6 @@ InputAudio) from pydantic import TypeAdapter # yapf: enable -from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast, - ProcessorMixin) # pydantic needs the TypedDict from typing_extensions from typing_extensions import Required, TypeAlias, TypedDict @@ -39,7 +38,12 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.utils import MediaConnector from vllm.transformers_utils.processor import cached_get_processor -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + + from vllm.transformers_utils.tokenizers import MistralTokenizer logger = init_logger(__name__) @@ -280,6 +284,7 @@ def _iter_nodes_assign_content_item(root: jinja2.nodes.Node): def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]: try: + import transformers.utils.chat_template_utils as hf_chat_utils jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template) return jinja_compiled.environment.parse(chat_template) except Exception: @@ -338,6 +343,7 @@ def resolve_hf_chat_template( # 2nd priority: AutoProcessor chat template, unless tool calling is enabled if tools is None: try: + from transformers import ProcessorMixin processor = cached_get_processor( tokenizer.name_or_path, processor_cls=(PreTrainedTokenizer, PreTrainedTokenizerFast, @@ -369,6 +375,7 @@ def _resolve_chat_template_content_format( *, trust_remote_code: bool, ) -> _ChatTemplateContentFormat: + from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): hf_chat_template = resolve_hf_chat_template( tokenizer, @@ -575,7 +582,7 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]: return self._placeholder_str(modality, current_count) @abstractmethod - def create_parser(self) -> "BaseMultiModalContentParser": + def create_parser(self) -> BaseMultiModalContentParser: raise NotImplementedError @@ -604,7 +611,7 @@ def all_mm_data(self) -> Optional[MultiModalDataDict]: mm_inputs["video"] = items_by_modality["video"] # A list of videos return mm_inputs - def create_parser(self) -> "BaseMultiModalContentParser": + def create_parser(self) -> BaseMultiModalContentParser: return MultiModalContentParser(self) @@ -637,7 +644,7 @@ async def all_mm_data(self) -> Optional[MultiModalDataDict]: mm_inputs["video"] = items_by_modality["video"] # A list of videos return mm_inputs - def create_parser(self) -> "BaseMultiModalContentParser": + def create_parser(self) -> BaseMultiModalContentParser: return AsyncMultiModalContentParser(self) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 38a541a408fa..dca39f2bc08f 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,10 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import itertools import warnings from collections.abc import Sequence from contextlib import contextmanager -from typing import Any, Callable, ClassVar, Optional, Union, cast, overload +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union, + cast, overload) import cloudpickle import torch.nn as nn @@ -13,16 +16,7 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) -from vllm.config import CompilationConfig -from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig, - TaskOption) from vllm.engine.llm_engine import LLMEngine -from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, - ChatTemplateContentFormatOption, - apply_hf_chat_template, - apply_mistral_chat_template, - parse_chat_messages, - resolve_chat_template_content_format) from vllm.entrypoints.score_utils import (_cosine_similarity, _validate_score_input_lens) from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt @@ -44,6 +38,11 @@ from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs, is_list_of) +if TYPE_CHECKING: + from vllm.engine.arg_utils import HfOverrides, PoolerConfig, TaskOption + from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ChatTemplateContentFormatOption) + logger = init_logger(__name__) _R = TypeVar("_R", default=Any) @@ -117,7 +116,7 @@ class LLM: disable_async_output_proc: Disable async output processing. This may result in lower performance. hf_token: The token to use as HTTP bearer authorization for remote files - . If `True`, will use the token generated when running + . If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). hf_overrides: If a dictionary, contains arguments to be forwarded to the HuggingFace config. If a callable, it is called to update the @@ -194,6 +193,7 @@ def __init__( Note: if enforce_eager is unset (enforce_eager is None) it defaults to False. ''' + from vllm.engine.arg_utils import EngineArgs if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True @@ -207,6 +207,7 @@ def __init__( if compilation_config is not None: if isinstance(compilation_config, (int, dict)): + from vllm.config import CompilationConfig compilation_config_instance = CompilationConfig.from_cli( str(compilation_config)) else: @@ -701,6 +702,10 @@ def chat( A list of ``RequestOutput`` objects containing the generated responses in the same order as the input messages. """ + from vllm.entrypoints.chat_utils import ( + ChatCompletionMessageParam, apply_hf_chat_template, + apply_mistral_chat_template, parse_chat_messages, + resolve_chat_template_content_format) list_of_messages: list[list[ChatCompletionMessageParam]] # Handle multi and single conversations diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 73b4288cbb0d..b4667b2650f7 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -2,7 +2,7 @@ import asyncio import time from collections.abc import AsyncGenerator, Mapping -from typing import Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union from fastapi import Request @@ -23,11 +23,13 @@ from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, - PreTrainedTokenizer, - PreTrainedTokenizerFast) +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import make_async, merge_async_iterators +if TYPE_CHECKING: + from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer, + PreTrainedTokenizerFast) + logger = init_logger(__name__) @@ -48,7 +50,7 @@ def __init__( async def _embedding_score( self, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + tokenizer: Union["PreTrainedTokenizer", "PreTrainedTokenizerFast"], texts_1: list[str], texts_2: list[str], request: Union[RerankRequest, ScoreRequest], diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 53411a27b41e..d4f067989d56 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -1,15 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Union +from typing import TYPE_CHECKING, Union from torch.nn import CosineSimilarity from vllm.outputs import PoolingRequestOutput -from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer, - PreTrainedTokenizerFast) + +if TYPE_CHECKING: + from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer, + PreTrainedTokenizerFast) def _cosine_similarity( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + tokenizer: Union["PreTrainedTokenizer", "PreTrainedTokenizerFast"], embed_1: list[PoolingRequestOutput], embed_2: list[PoolingRequestOutput], ) -> list[PoolingRequestOutput]: diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 58796e5d7326..31fc6f75e275 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -109,6 +109,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: """Initialize the KV cache by invoking the underlying worker. """ # NOTE: This is logged in the executor because there can be >1 workers. + logger.info("# %s blocks: %d, # CPU blocks: %d", vllm.platforms.current_platform.device_name, num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 37cc07bfbb36..12842d3cd397 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -11,7 +11,6 @@ from vllm.config import ParallelConfig from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import get_ip from vllm.worker.worker_base import WorkerWrapperBase @@ -109,7 +108,7 @@ def setup_device_if_necessary(self): # We can remove this API after it is fixed in compiled graph. assert self.worker is not None, "Worker is not initialized" if not self.compiled_dag_cuda_device_set: - if current_platform.is_tpu(): + if vllm.platforms.current_platform.is_tpu(): # Not needed pass else: diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 0579893e5d76..e97ab325e6e9 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -8,12 +8,9 @@ Protocol, Union) from torch import nn -from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger -from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, resolve_mm_processor_kwargs) @@ -21,16 +18,19 @@ from .parse import split_enc_dec_inputs if TYPE_CHECKING: + from transformers import BatchFeature, PretrainedConfig, ProcessorMixin + from vllm.config import ModelConfig from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict, MultiModalRegistry) from vllm.sequence import SequenceData + from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) _T = TypeVar("_T") -_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig) -_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) +_C = TypeVar("_C", bound="PretrainedConfig", default="PretrainedConfig") +_P = TypeVar("_P", bound="ProcessorMixin", default="ProcessorMixin") @dataclass(frozen=True) @@ -45,7 +45,7 @@ class InputContext: def get_hf_config( self, - typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig, + typ: Optional[Union[type[_C], tuple[type[_C], ...]]] = None, /, ) -> _C: """ @@ -56,6 +56,11 @@ def get_hf_config( Raises: TypeError: If the configuration is not of the specified type. """ + + if typ is None: + from transformers import PretrainedConfig + typ = PretrainedConfig + hf_config = self.model_config.hf_config if not isinstance(hf_config, typ): raise TypeError("Invalid type of HuggingFace config. " @@ -85,7 +90,7 @@ def get_mm_config(self): def get_hf_processor( self, - typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin, + typ: Optional[Union[type[_P], tuple[type[_P], ...]]] = None, /, **kwargs: object, ) -> _P: @@ -97,6 +102,12 @@ def get_hf_processor( Raises: TypeError: If the processor is not of the specified type. """ + + if typ is None: + from transformers import ProcessorMixin + typ = ProcessorMixin + from vllm.transformers_utils.processor import ( + cached_processor_from_config) return cached_processor_from_config( self.model_config, processor_cls=typ, @@ -124,15 +135,19 @@ def init_processor( @dataclass(frozen=True) class InputProcessingContext(InputContext): - tokenizer: AnyTokenizer + tokenizer: "AnyTokenizer" """The tokenizer used to tokenize the inputs.""" def get_hf_processor( self, - typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin, + typ: Union[type[_P], tuple[type[_P], ...], None] = None, /, **kwargs: object, ) -> _P: + + if typ is None: + from transformers import ProcessorMixin + typ = ProcessorMixin return super().get_hf_processor( typ, tokenizer=self.tokenizer, @@ -141,10 +156,10 @@ def get_hf_processor( def call_hf_processor( self, - hf_processor: ProcessorMixin, + hf_processor: "ProcessorMixin", data: Mapping[str, object], kwargs: Mapping[str, object] = {}, - ) -> BatchFeature: + ) -> "BatchFeature": """ Call :code:`hf_processor` on the prompt :code:`data` (text, image, audio...) with configurable options :code:`kwargs`. diff --git a/vllm/model_executor/guided_decoding/reasoner/__init__.py b/vllm/model_executor/guided_decoding/reasoner/__init__.py index ab6e47c007d2..297e5b29518a 100644 --- a/vllm/model_executor/guided_decoding/reasoner/__init__.py +++ b/vllm/model_executor/guided_decoding/reasoner/__init__.py @@ -2,13 +2,16 @@ from __future__ import annotations -from transformers import PreTrainedTokenizer +from typing import TYPE_CHECKING from vllm.logger import init_logger from vllm.model_executor.guided_decoding.reasoner.deepseek_reasoner import ( # noqa: E501 DeepSeekReasoner) from vllm.model_executor.guided_decoding.reasoner.reasoner import Reasoner +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 54fd43fc6592..50b9a909ab7b 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -7,8 +7,6 @@ import torch.jit import torch.nn as nn -from vllm.platforms import current_platform - class SpecDecodeBaseSampler(nn.Module): """Base class for samplers used for Speculative Decoding verification @@ -37,6 +35,7 @@ def __init__(self, strict_mode: bool = False): def init_gpu_tensors(self, device: Union[int, str]) -> None: assert self.num_accepted_tokens is None if isinstance(device, int): + from vllm.platforms import current_platform device = f"{current_platform.device_type}:{device}" elif not isinstance(device, str): raise ValueError(f"Device must be int or str, get {type(device)}") diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 6855808e8e44..8296811e52c1 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -13,13 +13,14 @@ import torch import torch.types from PIL.Image import Image -from transformers import BatchFeature from typing_extensions import NotRequired, TypeAlias from vllm.jsontree import JSONTree, json_map_leaves from vllm.utils import full_groupby, is_list_of if TYPE_CHECKING: + from transformers import BatchFeature + from .hasher import MultiModalHashDict _T = TypeVar("_T") @@ -599,7 +600,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod def from_hf_inputs( - hf_inputs: BatchFeature, + hf_inputs: "BatchFeature", config_by_key: Mapping[str, MultiModalFieldConfig], ): # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 9707b9cfcf8b..d1c36a92dae9 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -9,7 +9,6 @@ import numpy as np import torch from PIL.Image import Image -from transformers import BatchFeature from typing_extensions import TypeAlias, TypeGuard, assert_never from vllm.utils import is_list_of @@ -149,7 +148,7 @@ def __init__( self.fields_config = fields_config self.required_fields = required_fields - + from transformers import BatchFeature self._kwargs = MultiModalKwargs.from_hf_inputs( BatchFeature(dict(data)), fields_config, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 16358d1a5ee4..dda66807b94c 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -13,7 +13,6 @@ TypeVar, Union, cast) import torch -from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import assert_never from vllm.inputs import InputProcessingContext @@ -31,6 +30,8 @@ MultiModalDataParser) if TYPE_CHECKING: + from transformers import BatchFeature, PretrainedConfig, ProcessorMixin + from .profiling import BaseDummyInputsBuilder logger = init_logger(__name__) @@ -1013,10 +1014,10 @@ def model_id(self) -> str: def get_tokenizer(self) -> AnyTokenizer: return self.ctx.tokenizer - def get_hf_config(self) -> PretrainedConfig: + def get_hf_config(self) -> "PretrainedConfig": return self.ctx.get_hf_config() - def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> "ProcessorMixin": """ Subclasses can override this method to handle specific kwargs from model config or user inputs. @@ -1128,7 +1129,7 @@ def _to_mm_items( @abstractmethod def _get_mm_fields_config( self, - hf_inputs: BatchFeature, + hf_inputs: "BatchFeature", hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: """Given the HF-processed data, output the metadata of each field.""" @@ -1185,7 +1186,7 @@ def _call_hf_processor( # This refers to the data to be passed to HF processor. mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], - ) -> BatchFeature: + ) -> "BatchFeature": """ Call the HF processor on the prompt text and associated multi-modal data. @@ -1676,7 +1677,7 @@ def create_encoder_prompt( mm_data: MultiModalDataDict, ) -> Union[str, list[int]]: """ - Create input prompt for the encoder. HF processor will be applied on + Create input prompt for the encoder. HF processor will be applied on this prompt during profiling and generation. """ raise NotImplementedError diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 454167a0dc95..f30c161481e4 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -1,17 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import os from abc import abstractmethod from collections.abc import Sequence from functools import cached_property -from typing import Callable, Optional, Union +from typing import TYPE_CHECKING, Callable, Optional, Union -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import import_from_path, is_list_of +if TYPE_CHECKING: + from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) + from vllm.transformers_utils.tokenizer import AnyTokenizer + logger = init_logger(__name__) diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py index 1c283c092a28..0f125c6b2052 100644 --- a/vllm/reasoning/deepseek_r1_reasoning_parser.py +++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py @@ -1,15 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 -from collections.abc import Sequence -from typing import Optional, Union +from __future__ import annotations -from transformers import PreTrainedTokenizerBase +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional, Union -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + + from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) + logger = init_logger(__name__) @@ -72,6 +76,8 @@ def extract_reasoning_content_streaming( - 'abc' goes to reasoning_content - 'xyz' goes to content """ + from vllm.entrypoints.openai.protocol import DeltaMessage + # Skip single special tokens if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ self.start_token_id, self.end_token_id diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 0dae02d33fec..f72592f0a6b5 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -1,16 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import re from collections.abc import Sequence -from typing import Optional, Union - -from transformers import PreTrainedTokenizerBase +from typing import TYPE_CHECKING, Optional, Union -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + + from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) + logger = init_logger(__name__) @@ -138,7 +142,7 @@ def _is_reasoning_start_substr(self, text: str) -> bool: Args: text (str): Text to check for leading substr. - + Returns: bool: True if any of the possible reasoning start seqs match. """ @@ -151,7 +155,7 @@ def _is_response_start_substr(self, text: str) -> bool: Args: text (str): Text to check for leading substr. - + Returns: bool: True if any of the possible response start seqs match. """ @@ -174,6 +178,8 @@ def _get_delta_message_with_no_reasoning_bounds( Returns: DeltaMessage: Message containing the parsed content. """ + from vllm.entrypoints.openai.protocol import DeltaMessage + prev_longest_length = len(current_text) - len(delta_text) is_substr = self._is_reasoning_start_substr(current_text) was_substr = self._is_reasoning_start_substr( @@ -213,6 +219,8 @@ def _get_delta_message_with_no_response_bounds( Returns: DeltaMessage: Message containing the parsed content. """ + from vllm.entrypoints.openai.protocol import DeltaMessage + # If we have no reasoning content or explicitly end with the start of # response sequence, we are in transition to the response; need to be # careful here, since the final token (:) will match the reasoning @@ -286,6 +294,8 @@ def _get_delta_message_with_both_bounds( Returns: DeltaMessage: Message containing the parsed content. """ + from vllm.entrypoints.openai.protocol import DeltaMessage + # Always have content; take length to the end delta_content = delta_text[-len(response_content):] reasoning_end_idx = len(delta_text) - (len(response_content) + diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index c9f9af45044e..f2699d047b15 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,88 +1,82 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import enum +import importlib import json import os import time from functools import cache from pathlib import Path -from typing import Any, Callable, Dict, Literal, Optional, Type, Union - -import huggingface_hub -from huggingface_hub import hf_hub_download -from huggingface_hub import list_repo_files as hf_list_repo_files -from huggingface_hub import try_to_load_from_cache -from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, - HFValidationError, LocalEntryNotFoundError, - RepositoryNotFoundError, - RevisionNotFoundError) +from typing import (TYPE_CHECKING, Any, Callable, Dict, Literal, Optional, + Type, Union) + from torch import nn -from transformers import GenerationConfig, PretrainedConfig -from transformers.models.auto.image_processing_auto import ( - get_image_processor_config) -from transformers.models.auto.modeling_auto import ( - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) -from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -# yapf conflicts with isort for this block -# yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, - DbrxConfig, DeepseekVLV2Config, - EAGLEConfig, ExaoneConfig, - H2OVLChatConfig, - InternVLChatConfig, JAISConfig, - KimiVLConfig, MedusaConfig, - MllamaConfig, MLPSpeculatorConfig, - MPTConfig, NemotronConfig, - NVLM_D_Config, RWConfig, - SkyworkR1VChatConfig, SolarConfig, - Telechat2Config, UltravoxConfig) -# yapf: enable from vllm.transformers_utils.utils import check_gguf_file -from vllm.utils import resolve_obj_by_qualname - -if VLLM_USE_MODELSCOPE: - from modelscope import AutoConfig -else: - from transformers import AutoConfig +from vllm.utils import LazyLoader, resolve_obj_by_qualname MISTRAL_CONFIG_NAME = "params.json" HF_TOKEN = os.getenv('HF_TOKEN', None) logger = init_logger(__name__) -_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = { - "mllama": MllamaConfig -} - -_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { - "chatglm": ChatGLMConfig, - "cohere2": Cohere2Config, - "dbrx": DbrxConfig, - "deepseek_vl_v2": DeepseekVLV2Config, - "kimi_vl": KimiVLConfig, - "mpt": MPTConfig, - "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) - "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) - "jais": JAISConfig, - "mlp_speculator": MLPSpeculatorConfig, - "medusa": MedusaConfig, - "eagle": EAGLEConfig, - "exaone": ExaoneConfig, - "h2ovl_chat": H2OVLChatConfig, - "internvl_chat": InternVLChatConfig, - "nemotron": NemotronConfig, - "NVLM_D": NVLM_D_Config, - "solar": SolarConfig, - "skywork_chat": SkyworkR1VChatConfig, - "telechat": Telechat2Config, - "ultravox": UltravoxConfig, +if TYPE_CHECKING: + import huggingface_hub as hfhub + import huggingface_hub.utils as hfhub_utils + from transformers.configuration_utils import PretrainedConfig + from transformers.generation.configuration_utils import GenerationConfig +else: + hfhub = LazyLoader("hfhub", globals(), "huggingface_hub") + hfhub_utils = LazyLoader("hfhub_utils", globals(), "huggingface_hub.utils") + +_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, str] = {"mllama": "MllamaConfig"} + +_CONFIG_REGISTRY: Dict[str, str] = { + "chatglm": "ChatGLMConfig", + "cohere2": "Cohere2Config", + "dbrx": "DbrxConfig", + "deepseek_vl_v2": "DeepseekVLV2Config", + "kimi_vl": "KimiVLConfig", + "mpt": "MPTConfig", + "RefinedWeb": "RWConfig", # For tiiuae/falcon-40b(-instruct) + "RefinedWebModel": "RWConfig", # For tiiuae/falcon-7b(-instruct) + "jais": "JAISConfig", + "mlp_speculator": "MLPSpeculatorConfig", + "medusa": "MedusaConfig", + "eagle": "EAGLEConfig", + "exaone": "ExaoneConfig", + "h2ovl_chat": "H2OVLChatConfig", + "internvl_chat": "InternVLChatConfig", + "nemotron": "NemotronConfig", + "NVLM_D": "NVLM_D_Config", + "olmo2": "Olmo2Config", + "solar": "SolarConfig", + "skywork_chat": "SkyworkR1VChatConfig", + "telechat": "Telechat2Config", + "ultravox": "UltravoxConfig", **_CONFIG_REGISTRY_OVERRIDE_HF } +def get_config_class(key: str) -> Type: + config_class_name = _CONFIG_REGISTRY[key] + module_path = "vllm.transformers_utils.configs" + + try: + module = importlib.import_module(module_path) + config_class = getattr(module, config_class_name) + except (ModuleNotFoundError, AttributeError) as e: + raise ValueError( + f"Failed to import config class '{config_class_name}' " + f"from module '{module_path}'.") from e + + return config_class + + class ConfigFormat(str, enum.Enum): AUTO = "auto" HF = "hf" @@ -131,11 +125,11 @@ def lookup_files() -> list[str]: return modelscope_list_repo_files(repo_id, revision=revision, token=token) - return hf_list_repo_files(repo_id, - revision=revision, - repo_type=repo_type, - token=token) - except huggingface_hub.errors.OfflineModeIsEnabled: + return hfhub.list_repo_files(repo_id, + revision=revision, + repo_type=repo_type, + token=token) + except hfhub.errors.OfflineModeIsEnabled: # Don't raise in offline mode, # all we know is that we don't have this # file cached. @@ -166,9 +160,9 @@ def file_or_path_exists(model: Union[str, Path], config_name: str, return (local_path / config_name).is_file() # Offline mode support: Check if config file is cached already - cached_filepath = try_to_load_from_cache(repo_id=model, - filename=config_name, - revision=revision) + cached_filepath = hfhub.try_to_load_from_cache(repo_id=model, + filename=config_name, + revision=revision) if isinstance(cached_filepath, str): # The config file exists in cache- we can continue trying to load return True @@ -272,6 +266,7 @@ def get_config( if config_format == ConfigFormat.AUTO: try: + from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME if is_gguf or file_or_path_exists( model, HF_CONFIG_NAME, revision=revision): config_format = ConfigFormat.HF @@ -300,6 +295,7 @@ def get_config( raise ValueError(error_message) from e if config_format == ConfigFormat.HF: + from transformers import PretrainedConfig config_dict, _ = PretrainedConfig.get_config_dict( model, revision=revision, @@ -311,7 +307,7 @@ def get_config( # Use custom model class if it's in our registry model_type = config_dict.get("model_type") if model_type in _CONFIG_REGISTRY: - config_class = _CONFIG_REGISTRY[model_type] + config_class = get_config_class(model_type) config = config_class.from_pretrained( model, revision=revision, @@ -321,6 +317,10 @@ def get_config( ) else: try: + if VLLM_USE_MODELSCOPE: + from modelscope import AutoConfig + else: + from transformers import AutoConfig config = AutoConfig.from_pretrained( model, trust_remote_code=trust_remote_code, @@ -357,6 +357,8 @@ def get_config( # Special architecture mapping check for GGUF models if is_gguf: + from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: raise RuntimeError( f"Can't get gguf config for {config.model_type}.") @@ -379,12 +381,12 @@ def try_get_local_file(model: Union[str, Path], return file_path else: try: - cached_filepath = try_to_load_from_cache(repo_id=model, - filename=file_name, - revision=revision) + cached_filepath = hfhub.try_to_load_from_cache(repo_id=model, + filename=file_name, + revision=revision) if isinstance(cached_filepath, str): return Path(cached_filepath) - except HFValidationError: + except hfhub_utils.HFValidationError: ... return None @@ -412,14 +414,18 @@ def get_hf_file_to_dict(file_name: str, if file_path is None: try: - hf_hub_file = hf_hub_download(model, file_name, revision=revision) - except huggingface_hub.errors.OfflineModeIsEnabled: + hf_hub_file = hfhub.hf_hub_download(model, + file_name, + revision=revision) + except hfhub.errors.OfflineModeIsEnabled: return None - except (RepositoryNotFoundError, RevisionNotFoundError, - EntryNotFoundError, LocalEntryNotFoundError) as e: + except (hfhub_utils.RepositoryNotFoundError, + hfhub_utils.RevisionNotFoundError, + hfhub_utils.EntryNotFoundError, + hfhub_utils.LocalEntryNotFoundError) as e: logger.debug("File or repository not found in hf_hub_download", e) return None - except HfHubHTTPError as e: + except hfhub_utils.HfHubHTTPError as e: logger.warning( "Cannot connect to Hugging Face Hub. Skipping file " "download for '%s':", @@ -724,6 +730,7 @@ def recurse_elems(elem: Any): config_dict = recurse_elems(config_dict) # transform to HF config format + from transformers import PretrainedConfig if config_type == "multimodal": config_dict["text_config"] = PretrainedConfig( **config_dict["text_config"]) @@ -745,6 +752,8 @@ def get_hf_image_processor_config( # Separate model folder from file path for GGUF models if check_gguf_file(model): model = Path(model).parent + from transformers.models.auto.image_processing_auto import ( + get_image_processor_config) return get_image_processor_config(model, token=hf_token, revision=revision, @@ -775,6 +784,7 @@ def try_get_generation_config( trust_remote_code: bool, revision: Optional[str] = None, ) -> Optional[GenerationConfig]: + from transformers import GenerationConfig try: return GenerationConfig.from_pretrained( model, diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index ed2f4b076ded..71855ad36041 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,15 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 from functools import lru_cache -from typing import TYPE_CHECKING, Any, Union, cast +from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast -from transformers.processing_utils import ProcessorMixin from typing_extensions import TypeVar if TYPE_CHECKING: + from transformers.processing_utils import ProcessorMixin + from vllm.config import ModelConfig -_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) +_P = TypeVar("_P", bound="ProcessorMixin", default="ProcessorMixin") class HashableDict(dict): @@ -54,13 +55,16 @@ def get_processor( processor_name: str, *args: Any, trust_remote_code: bool = False, - processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin, + processor_cls: Optional[Union[type[_P], tuple[type[_P], ...]]] = None, **kwargs: Any, ) -> _P: """Load a processor for the given model name via HuggingFace.""" # don't put this import at the top level # it will call torch.cuda.device_count() from transformers import AutoProcessor + from transformers.processing_utils import ProcessorMixin + if processor_cls is None: + processor_cls = ProcessorMixin processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or isinstance(processor_cls, tuple) else processor_cls) @@ -95,14 +99,19 @@ def get_processor( return processor -cached_get_processor = lru_cache(get_processor) +cached_get_processor: Callable = lru_cache(get_processor) def cached_processor_from_config( model_config: "ModelConfig", - processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin, + processor_cls: Optional[Union[type[_P], tuple[type[_P], ...]]] = None, **kwargs: Any, ) -> _P: + + from transformers.processing_utils import ProcessorMixin + if processor_cls is None: + processor_cls = ProcessorMixin + return cached_get_processor( model_config.model, trust_remote_code=model_config.trust_remote_code, diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 1bfb50328338..97bfc17236c4 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -9,25 +9,25 @@ from typing import TYPE_CHECKING, Any, Optional, Union import huggingface_hub -from transformers import (AutoTokenizer, PreTrainedTokenizer, - PreTrainedTokenizerFast) from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer_base import (TokenizerBase, - TokenizerRegistry) +from vllm.transformers_utils.tokenizer_base import TokenizerRegistry from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import make_async if TYPE_CHECKING: + from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + from vllm.config import ModelConfig + from vllm.transformers_utils.tokenizer_base import TokenizerBase logger = init_logger(__name__) -AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast, - TokenizerBase] +AnyTokenizer = Union["PreTrainedTokenizer", "PreTrainedTokenizerFast", + "TokenizerBase"] def decode_tokens( @@ -124,12 +124,12 @@ def __len__(self): return tokenizer -def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None: +def patch_padding_side(tokenizer: "PreTrainedTokenizer") -> None: """Patch _pad method to accept `padding_side` for older tokenizers.""" orig_pad = tokenizer._pad def _pad( - self: PreTrainedTokenizer, + self: "PreTrainedTokenizer", *args, padding_side: Optional[str] = None, **kwargs, @@ -215,6 +215,7 @@ def get_tokenizer( **kwargs) else: try: + from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, *args, @@ -241,9 +242,10 @@ def get_tokenizer( # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324 if type(tokenizer).__name__ in ("ChatGLMTokenizer", "ChatGLM4Tokenizer"): + from transformers import PreTrainedTokenizer assert isinstance(tokenizer, PreTrainedTokenizer) patch_padding_side(tokenizer) - + from transformers import PreTrainedTokenizerFast if not isinstance(tokenizer, PreTrainedTokenizerFast): logger.warning( "Using a slow tokenizer. This might cause a significant " diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 58a114fa3a32..9cbfcf0d7c8b 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast import huggingface_hub -from huggingface_hub import HfApi, hf_hub_download from vllm.logger import init_logger from vllm.transformers_utils.tokenizer_base import TokenizerBase @@ -236,6 +235,7 @@ def from_pretrained(cls, @staticmethod def _download_mistral_tokenizer_from_hf(tokenizer_name: str, revision: Optional[str]) -> str: + from huggingface_hub import HfApi, hf_hub_download try: hf_api = HfApi() files = hf_api.list_repo_files(repo_id=tokenizer_name,