diff --git a/vllm/config.py b/vllm/config.py
index 41a30efea039..16e9ef409d9d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import ast
 import copy
 import enum
@@ -22,17 +24,12 @@
 import torch
 from pydantic import BaseModel, Field, PrivateAttr
 from torch.distributed import ProcessGroup, ReduceOp
-from transformers import PretrainedConfig
 
 import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
-                                                     get_quantization_config)
-from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import CpuArchEnum, current_platform
+from vllm.platforms import CpuArchEnum
 from vllm.sampling_params import GuidedDecodingParams
-from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
@@ -40,13 +37,15 @@
     try_get_generation_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
-from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, get_open_port, is_torch_equal_or_newer,
-                        random_uuid, resolve_obj_by_qualname)
+from vllm.utils import (GiB_bytes, LayerBlockType, LazyLoader,
+                        cuda_device_count_stateless, get_cpu_memory,
+                        get_open_port, is_torch_equal_or_newer, random_uuid,
+                        resolve_obj_by_qualname)
 
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
     from ray.util.placement_group import PlacementGroup
+    from transformers import PretrainedConfig
 
     from vllm.executor.executor_base import ExecutorBase
     from vllm.model_executor.layers.quantization.base_config import (
@@ -54,10 +53,16 @@
     from vllm.model_executor.model_loader.loader import BaseModelLoader
 
     ConfigType = type[DataclassInstance]
+    HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
+                                                 PretrainedConfig]]
 else:
-    QuantizationConfig = None
+    HfOverrides = Union[dict[str, Any], Callable[[Any], Any]]  # runtime alias
     ConfigType = type
 
+me_quant = LazyLoader("me_quant", globals(),
+                      "vllm.model_executor.layers.quantization")
+me_models = LazyLoader("me_models", globals(),
+                       "vllm.model_executor.models")
 logger = init_logger(__name__)
 
 ConfigT = TypeVar("ConfigT", bound=ConfigType)
@@ -89,9 +94,6 @@
     for task in tasks
 }
 
-HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
-                                             PretrainedConfig]]
-
 
 class SupportsHash(Protocol):
 
@@ -365,7 +367,7 @@ def __init__(
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
         disable_mm_preprocessor_cache: bool = False,
         override_neuron_config: Optional[dict[str, Any]] = None,
-        override_pooler_config: Optional["PoolerConfig"] = None,
+        override_pooler_config: Optional[PoolerConfig] = None,
         logits_processor_pattern: Optional[str] = None,
         generation_config: str = "auto",
         enable_sleep_mode: bool = False,
@@ -548,7 +550,7 @@ def __init__(
 
     @property
     def registry(self):
-        return ModelRegistry
+        return me_models.ModelRegistry
 
     @property
     def architectures(self) -> list[str]:
@@ -581,7 +583,7 @@ def maybe_pull_model_tokenizer_for_s3(self, model: str,
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[dict[str, int]]
-    ) -> Optional["MultiModalConfig"]:
+    ) -> Optional[MultiModalConfig]:
         if self.registry.is_multimodal_model(self.architectures):
             return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
 
@@ -597,8 +599,8 @@ def _get_encoder_config(self):
 
     def _init_pooler_config(
         self,
-        override_pooler_config: Optional["PoolerConfig"],
-    ) -> Optional["PoolerConfig"]:
+        override_pooler_config: Optional[PoolerConfig],
+    ) -> Optional[PoolerConfig]:
 
         if self.runner_type == "pooling":
             user_config = override_pooler_config or PoolerConfig()
@@ -749,7 +751,8 @@ def _parse_quant_hf_config(self):
         return quant_cfg
 
     def _verify_quantization(self) -> None:
-        supported_quantization = QUANTIZATION_METHODS
+        supported_quantization = me_quant.QUANTIZATION_METHODS
+
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
             "awq_marlin", "fbgemm_fp8", "compressed_tensors",
@@ -766,8 +769,8 @@ def _verify_quantization(self) -> None:
             quant_method = quant_cfg.get("quant_method", "").lower()
 
             # Detect which checkpoint is it
-            for name in QUANTIZATION_METHODS:
-                method = get_quantization_config(name)
+            for name in me_quant.QUANTIZATION_METHODS:
+                method = me_quant.get_quantization_config(name)
                 quantization_override = method.override_quantization_method(
                     quant_cfg, self.quantization)
                 if quantization_override:
@@ -799,6 +802,8 @@ def _verify_quantization(self) -> None:
                     "non-quantized models.", self.quantization)
 
     def _verify_cuda_graph(self) -> None:
+        from vllm.platforms import current_platform
+
         if self.max_seq_len_to_capture is None:
             self.max_seq_len_to_capture = self.max_model_len
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
@@ -885,7 +890,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
 
     def verify_with_parallel_config(
         self,
-        parallel_config: "ParallelConfig",
+        parallel_config: ParallelConfig,
     ) -> None:
 
         if parallel_config.distributed_executor_backend == "external_launcher":
@@ -1038,7 +1043,7 @@ def get_total_num_kv_heads(self) -> int:
         # equal to the number of attention heads.
         return self.hf_text_config.num_attention_heads
 
-    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
+    def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
         """Returns the number of KV heads per GPU."""
         if self.use_mla:
             # When using MLA during decode it becomes MQA
@@ -1052,13 +1057,12 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
         return max(1,
                    total_num_kv_heads // parallel_config.tensor_parallel_size)
 
-    def get_num_attention_heads(self,
-                                parallel_config: "ParallelConfig") -> int:
+    def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int:
         num_heads = getattr(self.hf_text_config, "num_attention_heads", 0)
         return num_heads // parallel_config.tensor_parallel_size
 
     def get_layers_start_end_indices(
-            self, parallel_config: "ParallelConfig") -> tuple[int, int]:
+            self, parallel_config: ParallelConfig) -> tuple[int, int]:
         from vllm.distributed.utils import get_pp_indices
         if self.hf_text_config.model_type == "deepseek_mtp":
             total_num_hidden_layers = getattr(self.hf_text_config,
@@ -1073,13 +1077,13 @@ def get_layers_start_end_indices(
         start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size)
         return start, end
 
-    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
+    def get_num_layers(self, parallel_config: ParallelConfig) -> int:
         start, end = self.get_layers_start_end_indices(parallel_config)
         return end - start
 
     def get_num_layers_by_block_type(
         self,
-        parallel_config: "ParallelConfig",
+        parallel_config: ParallelConfig,
         block_type: LayerBlockType = LayerBlockType.attention,
     ) -> int:
         # This function relies on 'layers_block_type' in hf_config,
@@ -1132,7 +1136,7 @@ def get_num_layers_by_block_type(
 
             return sum(t == 1 for t in attn_type_list[start:end])
 
-    def get_multimodal_config(self) -> "MultiModalConfig":
+    def get_multimodal_config(self) -> MultiModalConfig:
         """
         Get the multimodal configuration of the model.
 
@@ -1241,7 +1245,7 @@ def runner_type(self) -> RunnerType:
     @property
     def is_v1_compatible(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.is_v1_compatible(architectures)
+        return me_models.ModelRegistry.is_v1_compatible(architectures)
 
     @property
     def is_matryoshka(self) -> bool:
@@ -1392,7 +1396,7 @@ def _verify_prefix_caching(self) -> None:
 
     def verify_with_parallel_config(
         self,
-        parallel_config: "ParallelConfig",
+        parallel_config: ParallelConfig,
     ) -> None:
         total_cpu_memory = get_cpu_memory()
         # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
@@ -1460,7 +1464,7 @@ class LoadConfig:
     """Configuration for loading the model weights."""
 
     load_format: Union[str, LoadFormat,
-                       "BaseModelLoader"] = LoadFormat.AUTO.value
+                       BaseModelLoader] = LoadFormat.AUTO.value
     """The format of the model weights to load:\n
     - "auto" will try to load the weights in the safetensors format and fall
     back to the pytorch bin format if safetensors format is not available.\n
@@ -1582,11 +1586,11 @@ def data_parallel_rank_local(self, value: int) -> None:
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
 
-    placement_group: Optional["PlacementGroup"] = None
+    placement_group: Optional[PlacementGroup] = None
     """ray distributed model workers placement group."""
 
     distributed_executor_backend: Optional[Union[DistributedExecutorBackend,
-                                                 type["ExecutorBase"]]] = None
+                                                 type[ExecutorBase]]] = None
     """Backend to use for distributed model
     workers, either "ray" or "mp" (multiprocessing). If the product
     of pipeline_parallel_size and tensor_parallel_size is less than
@@ -1629,7 +1633,7 @@ def get_next_dp_init_port(self) -> int:
         self.data_parallel_master_port += 1
         return answer
 
-    def stateless_init_dp_group(self) -> "ProcessGroup":
+    def stateless_init_dp_group(self) -> ProcessGroup:
         from vllm.distributed.utils import (
             stateless_init_torch_distributed_process_group)
 
@@ -1644,7 +1648,7 @@ def stateless_init_dp_group(self) -> "ProcessGroup":
         return dp_group
 
     @staticmethod
-    def has_unfinished_dp(dp_group: "ProcessGroup",
+    def has_unfinished_dp(dp_group: ProcessGroup,
                           has_unfinished: bool) -> bool:
         tensor = torch.tensor([has_unfinished],
                               dtype=torch.int32,
@@ -2227,7 +2231,7 @@ def compute_hash(self) -> str:
         return hash_str
 
     @classmethod
-    def from_dict(cls, dict_value: dict) -> "SpeculativeConfig":
+    def from_dict(cls, dict_value: dict) -> SpeculativeConfig:
         """Parse the CLI value for the speculative config."""
         return cls(**dict_value)
 
@@ -2819,7 +2823,7 @@ def compute_hash(self) -> str:
         return hash_str
 
     @staticmethod
-    def from_json(json_str: str) -> "PoolerConfig":
+    def from_json(json_str: str) -> PoolerConfig:
         return PoolerConfig(**json.loads(json_str))
 
 
@@ -3176,6 +3180,7 @@ def compute_hash(self) -> str:
         return hash_str
 
     def __post_init__(self):
+        from vllm.tracing import is_otel_available, otel_import_error_traceback
         if not is_otel_available() and self.otlp_traces_endpoint is not None:
             raise ValueError(
                 "OpenTelemetry is not available. Unable to configure "
@@ -3239,7 +3244,7 @@ def compute_hash(self) -> str:
         return hash_str
 
     @classmethod
-    def from_cli(cls, cli_value: str) -> "KVTransferConfig":
+    def from_cli(cls, cli_value: str) -> KVTransferConfig:
         """Parse the CLI value for the kv cache transfer config."""
         return KVTransferConfig.model_validate_json(cli_value)
 
@@ -3476,7 +3481,7 @@ def __repr__(self) -> str:
     __str__ = __repr__
 
     @classmethod
-    def from_cli(cls, cli_value: str) -> "CompilationConfig":
+    def from_cli(cls, cli_value: str) -> CompilationConfig:
         """Parse the CLI value for the compilation config."""
         if cli_value in ["0", "1", "2", "3"]:
             return cls(level=int(cli_value))
@@ -3528,7 +3533,7 @@ def model_post_init(self, __context: Any) -> None:
         self.static_forward_context = {}
         self.compilation_time = 0.0
 
-    def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
+    def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]:
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")
 
@@ -3744,9 +3749,11 @@ def _get_quantization_config(
         """Get the quantization config."""
         from vllm.platforms import current_platform
         if model_config.quantization is not None:
+            # get_quant_config comes from model_loader.weight_utils, not from
+            # the layers.quantization package behind me_quant; import it here.
             from vllm.model_executor.model_loader.weight_utils import (
                 get_quant_config)
             quant_config = get_quant_config(model_config, load_config)
             capability_tuple = current_platform.get_device_capability()
 
             if capability_tuple is not None:
@@ -3770,7 +3777,7 @@ def with_hf_config(
         self,
         hf_config: PretrainedConfig,
         architectures: Optional[list[str]] = None,
-    ) -> "VllmConfig":
+    ) -> VllmConfig:
         if architectures is not None:
             hf_config = copy.deepcopy(hf_config)
             hf_config.architectures = architectures
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 9cb2aa797be5..73c6a6854c99 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 # yapf: disable
 import argparse
 import dataclasses
@@ -7,8 +9,8 @@
 import re
 import threading
 from dataclasses import MISSING, dataclass, fields
-from typing import (Any, Callable, Dict, List, Literal, Optional, Type,
-                    TypeVar, Union, cast, get_args, get_origin)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Literal,
+                    Optional, Type, TypeVar, Union, cast, get_args, get_origin)
 
 import torch
 from typing_extensions import TypeIs
@@ -26,9 +28,7 @@
                          SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
                          TaskOption, TokenizerPoolConfig, VllmConfig,
                          get_attr_docs, get_field)
-from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.plugins import load_general_plugins
 from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
@@ -38,6 +38,9 @@
 
 # yapf: enable
 
+if TYPE_CHECKING:
+    from vllm.executor.executor_base import ExecutorBase
+
 logger = init_logger(__name__)
 
 ALLOWED_DETAILED_TRACE_MODULES = ["model", "worker", "all"]
@@ -73,7 +76,7 @@ def optional_float(val: str) -> Optional[float]:
 def nullable_kvs(val: str) -> Optional[dict[str, int]]:
     """NOTE: This function is deprecated, args should be passed as JSON
     strings instead.
-    
+
     Parses a string containing comma separate key [str] to value [int]
     pairs into a dictionary.
 
@@ -303,7 +306,9 @@ def can_be_type(cls: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
 
         def is_custom_type(cls: TypeHint) -> bool:
             """Check if the class is a custom type."""
-            return cls.__module__ != "builtins"
+            if isinstance(cls, type):
+                return cls.__module__ != "builtins"
+            return True
 
         def get_kwargs(cls: ConfigType) -> dict[str, Any]:
             cls_docs = get_attr_docs(cls)
@@ -610,6 +615,9 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
                             action='store_true',
                             help='Disable logging statistics.')
         # Quantization settings.
+        from vllm.model_executor.layers.quantization import (
+            QUANTIZATION_METHODS)
+
         parser.add_argument('--quantization',
                             '-q',
                             type=optional_str,
@@ -1071,7 +1079,7 @@ def create_speculative_config(
         target_parallel_config: ParallelConfig,
         enable_chunked_prefill: bool,
         disable_log_stats: bool,
-    ) -> Optional["SpeculativeConfig"]:
+    ) -> Optional[SpeculativeConfig]:
         """Initializes and returns a SpeculativeConfig object based on
         `speculative_config`.
 
@@ -1698,7 +1706,7 @@ def _warn_or_fallback(feature_name: str) -> bool:
 def human_readable_int(value):
     """Parse human-readable integers like '1k', '2M', etc.
     Including decimal values with decimal multipliers.
-    
+
     Examples:
     - '1k' -> 1,000
     - '1K' -> 1,024
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index bd2c3357cdc0..dd34a3e6c95e 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import asyncio
 import json
 from abc import ABC, abstractmethod
@@ -7,11 +9,10 @@
 from collections.abc import Awaitable, Iterable
 from functools import cache, lru_cache, partial
 from pathlib import Path
-from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
-                    cast)
+from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, Optional,
+                    TypeVar, Union, cast)
 
 import jinja2.nodes
-import transformers.utils.chat_template_utils as hf_chat_utils
 # yapf conflicts with isort for this block
 # yapf: disable
 from openai.types.chat import (ChatCompletionAssistantMessageParam,
@@ -29,8 +30,6 @@
     InputAudio)
 from pydantic import TypeAdapter
 # yapf: enable
-from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
-                          ProcessorMixin)
 # pydantic needs the TypedDict from typing_extensions
 from typing_extensions import Required, TypeAlias, TypedDict
 
@@ -39,7 +38,12 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
 from vllm.transformers_utils.processor import cached_get_processor
-from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+
+    from vllm.transformers_utils.tokenizers import MistralTokenizer
 
 logger = init_logger(__name__)
 
@@ -280,6 +284,7 @@ def _iter_nodes_assign_content_item(root: jinja2.nodes.Node):
 
 def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]:
     try:
+        import transformers.utils.chat_template_utils as hf_chat_utils
         jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
         return jinja_compiled.environment.parse(chat_template)
     except Exception:
@@ -338,6 +343,10 @@ def resolve_hf_chat_template(
     # 2nd priority: AutoProcessor chat template, unless tool calling is enabled
     if tools is None:
         try:
+            # The tokenizer classes are only imported under TYPE_CHECKING at
+            # module level, so bind them here for the runtime tuple below.
+            from transformers import (PreTrainedTokenizer,
+                                      PreTrainedTokenizerFast, ProcessorMixin)
             processor = cached_get_processor(
                 tokenizer.name_or_path,
                 processor_cls=(PreTrainedTokenizer, PreTrainedTokenizerFast,
@@ -369,6 +378,7 @@ def _resolve_chat_template_content_format(
     *,
     trust_remote_code: bool,
 ) -> _ChatTemplateContentFormat:
+    from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
     if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
         hf_chat_template = resolve_hf_chat_template(
             tokenizer,
@@ -575,7 +585,7 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
         return self._placeholder_str(modality, current_count)
 
     @abstractmethod
-    def create_parser(self) -> "BaseMultiModalContentParser":
+    def create_parser(self) -> BaseMultiModalContentParser:
         raise NotImplementedError
 
 
@@ -604,7 +614,7 @@ def all_mm_data(self) -> Optional[MultiModalDataDict]:
             mm_inputs["video"] = items_by_modality["video"] # A list of videos
         return mm_inputs
 
-    def create_parser(self) -> "BaseMultiModalContentParser":
+    def create_parser(self) -> BaseMultiModalContentParser:
         return MultiModalContentParser(self)
 
 
@@ -637,7 +647,7 @@ async def all_mm_data(self) -> Optional[MultiModalDataDict]:
             mm_inputs["video"] = items_by_modality["video"] # A list of videos
         return mm_inputs
 
-    def create_parser(self) -> "BaseMultiModalContentParser":
+    def create_parser(self) -> BaseMultiModalContentParser:
         return AsyncMultiModalContentParser(self)
 
 
 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 38a541a408fa..dca39f2bc08f 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,10 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import itertools
 import warnings
 from collections.abc import Sequence
 from contextlib import contextmanager
-from typing import Any, Callable, ClassVar, Optional, Union, cast, overload
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union,
+                    cast, overload)
 
 import cloudpickle
 import torch.nn as nn
@@ -13,16 +16,7 @@
 
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                               BeamSearchSequence, get_beam_search_score)
-from vllm.config import CompilationConfig
-from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
-                                   TaskOption)
 from vllm.engine.llm_engine import LLMEngine
-from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
-                                         ChatTemplateContentFormatOption,
-                                         apply_hf_chat_template,
-                                         apply_mistral_chat_template,
-                                         parse_chat_messages,
-                                         resolve_chat_template_content_format)
 from vllm.entrypoints.score_utils import (_cosine_similarity,
                                           _validate_score_input_lens)
 from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt
@@ -44,6 +38,11 @@
 from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs,
                         is_list_of)
 
+if TYPE_CHECKING:
+    from vllm.engine.arg_utils import HfOverrides, PoolerConfig, TaskOption
+    from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                             ChatTemplateContentFormatOption)
+
 logger = init_logger(__name__)
 
 _R = TypeVar("_R", default=Any)
@@ -117,7 +116,7 @@ class LLM:
         disable_async_output_proc: Disable async output processing.
             This may result in lower performance.
         hf_token: The token to use as HTTP bearer authorization for remote files
-            . If `True`, will use the token generated when running 
+            . If `True`, will use the token generated when running
             `huggingface-cli login` (stored in `~/.huggingface`).
         hf_overrides: If a dictionary, contains arguments to be forwarded to the
             HuggingFace config. If a callable, it is called to update the
@@ -194,6 +193,7 @@ def __init__(
         Note: if enforce_eager is unset (enforce_eager is None)
         it defaults to False.
         '''
+        from vllm.engine.arg_utils import EngineArgs
 
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
@@ -207,6 +207,7 @@ def __init__(
 
         if compilation_config is not None:
             if isinstance(compilation_config, (int, dict)):
+                from vllm.config import CompilationConfig
                 compilation_config_instance = CompilationConfig.from_cli(
                     str(compilation_config))
             else:
@@ -701,6 +702,10 @@ def chat(
             A list of ``RequestOutput`` objects containing the generated
             responses in the same order as the input messages.
         """
+        from vllm.entrypoints.chat_utils import (
+            ChatCompletionMessageParam, apply_hf_chat_template,
+            apply_mistral_chat_template, parse_chat_messages,
+            resolve_chat_template_content_format)
         list_of_messages: list[list[ChatCompletionMessageParam]]
 
         # Handle multi and single conversations
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 73b4288cbb0d..b4667b2650f7 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -2,7 +2,7 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, Mapping
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 from fastapi import Request
 
@@ -23,11 +23,13 @@
 from vllm.lora.request import LoRARequest
 from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
-                                               PreTrainedTokenizer,
-                                               PreTrainedTokenizerFast)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import make_async, merge_async_iterators
 
+if TYPE_CHECKING:
+    from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer,
+                                                   PreTrainedTokenizerFast)
+
 logger = init_logger(__name__)
 
 
@@ -48,7 +50,7 @@ def __init__(
 
     async def _embedding_score(
         self,
-        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+        tokenizer: Union["PreTrainedTokenizer", "PreTrainedTokenizerFast"],
         texts_1: list[str],
         texts_2: list[str],
         request: Union[RerankRequest, ScoreRequest],
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 53411a27b41e..d4f067989d56 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -1,15 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Union
+from typing import TYPE_CHECKING, Union
 
 from torch.nn import CosineSimilarity
 
 from vllm.outputs import PoolingRequestOutput
-from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer,
-                                               PreTrainedTokenizerFast)
+
+if TYPE_CHECKING:
+    from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer,
+                                                   PreTrainedTokenizerFast)
 
 
 def _cosine_similarity(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: Union["PreTrainedTokenizer", "PreTrainedTokenizerFast"],
     embed_1: list[PoolingRequestOutput],
     embed_2: list[PoolingRequestOutput],
 ) -> list[PoolingRequestOutput]:
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 58796e5d7326..31fc6f75e275 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -109,6 +109,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         """Initialize the KV cache by invoking the underlying worker.
         """
         # NOTE: This is logged in the executor because there can be >1 workers.
+
         logger.info("# %s blocks: %d, # CPU blocks: %d",
                     vllm.platforms.current_platform.device_name,
                     num_gpu_blocks, num_cpu_blocks)
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 37cc07bfbb36..12842d3cd397 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -11,7 +11,6 @@
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -109,7 +108,7 @@ def setup_device_if_necessary(self):
             # We can remove this API after it is fixed in compiled graph.
             assert self.worker is not None, "Worker is not initialized"
             if not self.compiled_dag_cuda_device_set:
-                if current_platform.is_tpu():
+                if vllm.platforms.current_platform.is_tpu():
                     # Not needed
                     pass
                 else:
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 0579893e5d76..e97ab325e6e9 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -8,12 +8,9 @@
                     Protocol, Union)
 
 from torch import nn
-from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 from typing_extensions import TypeVar, assert_never
 
 from vllm.logger import init_logger
-from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
                         resolve_mm_processor_kwargs)
 
@@ -21,16 +18,19 @@
 from .parse import split_enc_dec_inputs
 
 if TYPE_CHECKING:
+    from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
+
     from vllm.config import ModelConfig
     from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
                                  MultiModalRegistry)
     from vllm.sequence import SequenceData
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 logger = init_logger(__name__)
 
 _T = TypeVar("_T")
-_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
-_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+_C = TypeVar("_C", bound="PretrainedConfig", default="PretrainedConfig")
+_P = TypeVar("_P", bound="ProcessorMixin", default="ProcessorMixin")
 
 
 @dataclass(frozen=True)
@@ -45,7 +45,7 @@ class InputContext:
 
     def get_hf_config(
         self,
-        typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig,
+        typ: Optional[Union[type[_C], tuple[type[_C], ...]]] = None,
         /,
     ) -> _C:
         """
@@ -56,6 +56,11 @@ def get_hf_config(
         Raises:
             TypeError: If the configuration is not of the specified type.
         """
+
+        if typ is None:
+            from transformers import PretrainedConfig
+            typ = PretrainedConfig
+
         hf_config = self.model_config.hf_config
         if not isinstance(hf_config, typ):
             raise TypeError("Invalid type of HuggingFace config. "
@@ -85,7 +90,7 @@ def get_mm_config(self):
 
     def get_hf_processor(
         self,
-        typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+        typ: Optional[Union[type[_P], tuple[type[_P], ...]]] = None,
         /,
         **kwargs: object,
     ) -> _P:
@@ -97,6 +102,12 @@ def get_hf_processor(
         Raises:
             TypeError: If the processor is not of the specified type.
         """
+
+        if typ is None:
+            from transformers import ProcessorMixin
+            typ = ProcessorMixin
+        from vllm.transformers_utils.processor import (
+            cached_processor_from_config)
         return cached_processor_from_config(
             self.model_config,
             processor_cls=typ,
@@ -124,15 +135,19 @@ def init_processor(
 
 @dataclass(frozen=True)
 class InputProcessingContext(InputContext):
-    tokenizer: AnyTokenizer
+    tokenizer: "AnyTokenizer"
     """The tokenizer used to tokenize the inputs."""
 
     def get_hf_processor(
         self,
-        typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+        typ: Optional[Union[type[_P], tuple[type[_P], ...]]] = None,
         /,
         **kwargs: object,
     ) -> _P:
+
+        if typ is None:
+            from transformers import ProcessorMixin
+            typ = ProcessorMixin
         return super().get_hf_processor(
             typ,
             tokenizer=self.tokenizer,
@@ -141,10 +156,10 @@ def get_hf_processor(
 
     def call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
+        hf_processor: "ProcessorMixin",
         data: Mapping[str, object],
         kwargs: Mapping[str, object] = {},
-    ) -> BatchFeature:
+    ) -> "BatchFeature":
         """
         Call :code:`hf_processor` on the prompt :code:`data`
         (text, image, audio...) with configurable options :code:`kwargs`.
diff --git a/vllm/model_executor/guided_decoding/reasoner/__init__.py b/vllm/model_executor/guided_decoding/reasoner/__init__.py
index ab6e47c007d2..297e5b29518a 100644
--- a/vllm/model_executor/guided_decoding/reasoner/__init__.py
+++ b/vllm/model_executor/guided_decoding/reasoner/__init__.py
@@ -2,13 +2,16 @@
 
 from __future__ import annotations
 
-from transformers import PreTrainedTokenizer
+from typing import TYPE_CHECKING
 
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding.reasoner.deepseek_reasoner import (  # noqa: E501
     DeepSeekReasoner)
 from vllm.model_executor.guided_decoding.reasoner.reasoner import Reasoner
 
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+
 logger = init_logger(__name__)
 
 
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py
index 54fd43fc6592..50b9a909ab7b 100644
--- a/vllm/model_executor/layers/spec_decode_base_sampler.py
+++ b/vllm/model_executor/layers/spec_decode_base_sampler.py
@@ -7,8 +7,6 @@
 import torch.jit
 import torch.nn as nn
 
-from vllm.platforms import current_platform
-
 
 class SpecDecodeBaseSampler(nn.Module):
     """Base class for samplers used for Speculative Decoding verification
@@ -37,6 +35,7 @@ def __init__(self, strict_mode: bool = False):
     def init_gpu_tensors(self, device: Union[int, str]) -> None:
         assert self.num_accepted_tokens is None
         if isinstance(device, int):
+            from vllm.platforms import current_platform
             device = f"{current_platform.device_type}:{device}"
         elif not isinstance(device, str):
             raise ValueError(f"Device must be int or str, get {type(device)}")
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 6855808e8e44..8296811e52c1 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -13,13 +13,14 @@
 import torch
 import torch.types
 from PIL.Image import Image
-from transformers import BatchFeature
 from typing_extensions import NotRequired, TypeAlias
 
 from vllm.jsontree import JSONTree, json_map_leaves
 from vllm.utils import full_groupby, is_list_of
 
 if TYPE_CHECKING:
+    from transformers import BatchFeature
+
     from .hasher import MultiModalHashDict
 
 _T = TypeVar("_T")
@@ -599,7 +600,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
 
     @staticmethod
     def from_hf_inputs(
-        hf_inputs: BatchFeature,
+        hf_inputs: "BatchFeature",
         config_by_key: Mapping[str, MultiModalFieldConfig],
     ):
         # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key`
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 9707b9cfcf8b..d1c36a92dae9 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -9,7 +9,6 @@
 import numpy as np
 import torch
 from PIL.Image import Image
-from transformers import BatchFeature
 from typing_extensions import TypeAlias, TypeGuard, assert_never
 
 from vllm.utils import is_list_of
@@ -149,7 +148,7 @@ def __init__(
 
         self.fields_config = fields_config
         self.required_fields = required_fields
-
+        from transformers import BatchFeature
         self._kwargs = MultiModalKwargs.from_hf_inputs(
             BatchFeature(dict(data)),
             fields_config,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 16358d1a5ee4..dda66807b94c 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -13,7 +13,6 @@
                     TypeVar, Union, cast)
 
 import torch
-from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 from typing_extensions import assert_never
 
 from vllm.inputs import InputProcessingContext
@@ -31,6 +30,8 @@
                     MultiModalDataParser)
 
 if TYPE_CHECKING:
+    from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
+
     from .profiling import BaseDummyInputsBuilder
 
 logger = init_logger(__name__)
@@ -1013,10 +1014,10 @@ def model_id(self) -> str:
     def get_tokenizer(self) -> AnyTokenizer:
         return self.ctx.tokenizer
 
-    def get_hf_config(self) -> PretrainedConfig:
+    def get_hf_config(self) -> "PretrainedConfig":
         return self.ctx.get_hf_config()
 
-    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
+    def get_hf_processor(self, **kwargs: object) -> "ProcessorMixin":
         """
         Subclasses can override this method to handle
         specific kwargs from model config or user inputs.
@@ -1128,7 +1129,7 @@ def _to_mm_items(
     @abstractmethod
     def _get_mm_fields_config(
         self,
-        hf_inputs: BatchFeature,
+        hf_inputs: "BatchFeature",
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         """Given the HF-processed data, output the metadata of each field."""
@@ -1185,7 +1186,7 @@ def _call_hf_processor(
         # This refers to the data to be passed to HF processor.
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
+    ) -> "BatchFeature":
         """
         Call the HF processor on the prompt text and
         associated multi-modal data.
@@ -1676,7 +1677,7 @@ def create_encoder_prompt(
         mm_data: MultiModalDataDict,
     ) -> Union[str, list[int]]:
         """
-        Create input prompt for the encoder. HF processor will be applied on 
+        Create input prompt for the encoder. HF processor will be applied on
         this prompt during profiling and generation.
         """
         raise NotImplementedError
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 454167a0dc95..f30c161481e4 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -1,17 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import os
 from abc import abstractmethod
 from collections.abc import Sequence
 from functools import cached_property
-from typing import Callable, Optional, Union
+from typing import TYPE_CHECKING, Callable, Optional, Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
 from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import import_from_path, is_list_of
 
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                                  DeltaMessage)
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+
 logger = init_logger(__name__)
 
 
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index 1c283c092a28..0f125c6b2052 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -1,15 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from collections.abc import Sequence
-from typing import Optional, Union
+from __future__ import annotations
 
-from transformers import PreTrainedTokenizerBase
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizerBase
+
+    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                                  DeltaMessage)
+
 logger = init_logger(__name__)
 
 
@@ -72,6 +76,8 @@ def extract_reasoning_content_streaming(
         - 'abc' goes to reasoning_content
         - 'xyz' goes to content
         """
+        from vllm.entrypoints.openai.protocol import DeltaMessage
+
         # Skip single special tokens
         if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
                 self.start_token_id, self.end_token_id
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 0dae02d33fec..f72592f0a6b5 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -1,16 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import re
 from collections.abc import Sequence
-from typing import Optional, Union
-
-from transformers import PreTrainedTokenizerBase
+from typing import TYPE_CHECKING, Optional, Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizerBase
+
+    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                                  DeltaMessage)
+
 logger = init_logger(__name__)
 
 
@@ -138,7 +142,7 @@ def _is_reasoning_start_substr(self, text: str) -> bool:
 
         Args:
             text (str): Text to check for leading substr.
-        
+
         Returns:
             bool: True if any of the possible reasoning start seqs match.
         """
@@ -151,7 +155,7 @@ def _is_response_start_substr(self, text: str) -> bool:
 
         Args:
             text (str): Text to check for leading substr.
-        
+
         Returns:
             bool: True if any of the possible response start seqs match.
         """
@@ -174,6 +178,8 @@ def _get_delta_message_with_no_reasoning_bounds(
         Returns:
             DeltaMessage: Message containing the parsed content.
         """
+        from vllm.entrypoints.openai.protocol import DeltaMessage
+
         prev_longest_length = len(current_text) - len(delta_text)
         is_substr = self._is_reasoning_start_substr(current_text)
         was_substr = self._is_reasoning_start_substr(
@@ -213,6 +219,8 @@ def _get_delta_message_with_no_response_bounds(
         Returns:
             DeltaMessage: Message containing the parsed content.
         """
+        from vllm.entrypoints.openai.protocol import DeltaMessage
+
         # If we have no reasoning content or explicitly end with the start of
         # response sequence, we are in transition to the response; need to be
         # careful here, since the final token (:) will match the reasoning
@@ -286,6 +294,8 @@ def _get_delta_message_with_both_bounds(
         Returns:
             DeltaMessage: Message containing the parsed content.
         """
+        from vllm.entrypoints.openai.protocol import DeltaMessage
+
         # Always have content; take length to the end
         delta_content = delta_text[-len(response_content):]
         reasoning_end_idx = len(delta_text) - (len(response_content) +
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index c9f9af45044e..f2699d047b15 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,88 +1,99 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import enum
+import importlib
 import json
 import os
 import time
 from functools import cache
 from pathlib import Path
-from typing import Any, Callable, Dict, Literal, Optional, Type, Union
-
-import huggingface_hub
-from huggingface_hub import hf_hub_download
-from huggingface_hub import list_repo_files as hf_list_repo_files
-from huggingface_hub import try_to_load_from_cache
-from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
-                                   HFValidationError, LocalEntryNotFoundError,
-                                   RepositoryNotFoundError,
-                                   RevisionNotFoundError)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Literal, Optional,
+                    Type, Union)
+
 from torch import nn
-from transformers import GenerationConfig, PretrainedConfig
-from transformers.models.auto.image_processing_auto import (
-    get_image_processor_config)
-from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
-from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
-                                             DbrxConfig, DeepseekVLV2Config,
-                                             EAGLEConfig, ExaoneConfig,
-                                             H2OVLChatConfig,
-                                             InternVLChatConfig, JAISConfig,
-                                             KimiVLConfig, MedusaConfig,
-                                             MllamaConfig, MLPSpeculatorConfig,
-                                             MPTConfig, NemotronConfig,
-                                             NVLM_D_Config, RWConfig,
-                                             SkyworkR1VChatConfig, SolarConfig,
-                                             Telechat2Config, UltravoxConfig)
-# yapf: enable
 from vllm.transformers_utils.utils import check_gguf_file
-from vllm.utils import resolve_obj_by_qualname
-
-if VLLM_USE_MODELSCOPE:
-    from modelscope import AutoConfig
-else:
-    from transformers import AutoConfig
+from vllm.utils import LazyLoader, resolve_obj_by_qualname
 
 MISTRAL_CONFIG_NAME = "params.json"
 HF_TOKEN = os.getenv('HF_TOKEN', None)
 
 logger = init_logger(__name__)
 
-_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = {
-    "mllama": MllamaConfig
-}
-
-_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
-    "chatglm": ChatGLMConfig,
-    "cohere2": Cohere2Config,
-    "dbrx": DbrxConfig,
-    "deepseek_vl_v2": DeepseekVLV2Config,
-    "kimi_vl": KimiVLConfig,
-    "mpt": MPTConfig,
-    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
-    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
-    "jais": JAISConfig,
-    "mlp_speculator": MLPSpeculatorConfig,
-    "medusa": MedusaConfig,
-    "eagle": EAGLEConfig,
-    "exaone": ExaoneConfig,
-    "h2ovl_chat": H2OVLChatConfig,
-    "internvl_chat": InternVLChatConfig,
-    "nemotron": NemotronConfig,
-    "NVLM_D": NVLM_D_Config,
-    "solar": SolarConfig,
-    "skywork_chat": SkyworkR1VChatConfig,
-    "telechat": Telechat2Config,
-    "ultravox": UltravoxConfig,
+if TYPE_CHECKING:
+    import huggingface_hub as hfhub
+    import huggingface_hub.utils as hfhub_utils
+    from transformers.configuration_utils import PretrainedConfig
+    from transformers.generation.configuration_utils import GenerationConfig
+else:
+    hfhub = LazyLoader("hfhub", globals(), "huggingface_hub")
+    hfhub_utils = LazyLoader("hfhub_utils", globals(), "huggingface_hub.utils")
+
+_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, str] = {"mllama": "MllamaConfig"}
+
+_CONFIG_REGISTRY: Dict[str, str] = {
+    "chatglm": "ChatGLMConfig",
+    "cohere2": "Cohere2Config",
+    "dbrx": "DbrxConfig",
+    "deepseek_vl_v2": "DeepseekVLV2Config",
+    "kimi_vl": "KimiVLConfig",
+    "mpt": "MPTConfig",
+    "RefinedWeb": "RWConfig",  # For tiiuae/falcon-40b(-instruct)
+    "RefinedWebModel": "RWConfig",  # For tiiuae/falcon-7b(-instruct)
+    "jais": "JAISConfig",
+    "mlp_speculator": "MLPSpeculatorConfig",
+    "medusa": "MedusaConfig",
+    "eagle": "EAGLEConfig",
+    "exaone": "ExaoneConfig",
+    "h2ovl_chat": "H2OVLChatConfig",
+    "internvl_chat": "InternVLChatConfig",
+    "nemotron": "NemotronConfig",
+    "NVLM_D": "NVLM_D_Config",
+    "solar": "SolarConfig",
+    "skywork_chat": "SkyworkR1VChatConfig",
+    "telechat": "Telechat2Config",
+    "ultravox": "UltravoxConfig",
     **_CONFIG_REGISTRY_OVERRIDE_HF
 }
 
 
+def get_config_class(key: str) -> Type[PretrainedConfig]:
+    """Resolve a `_CONFIG_REGISTRY` entry to its config class.
+
+    The registry stores class names as strings so that importing this module
+    does not import `vllm.transformers_utils.configs`; the package is
+    imported here, on first use.
+
+    Args:
+        key: A `model_type` key present in `_CONFIG_REGISTRY`.
+
+    Returns:
+        The class with the registered name, looked up on
+        `vllm.transformers_utils.configs`.
+
+    Raises:
+        KeyError: If `key` is not in `_CONFIG_REGISTRY`.
+        ValueError: If the package is not found or does not define
+            the registered class name.
+    """
+    config_class_name = _CONFIG_REGISTRY[key]
+    module_path = "vllm.transformers_utils.configs"
+
+    try:
+        module = importlib.import_module(module_path)
+        config_class = getattr(module, config_class_name)
+    except (ModuleNotFoundError, AttributeError) as e:
+        raise ValueError(
+            f"Failed to import config class '{config_class_name}' "
+            f"from module '{module_path}'.") from e
+
+    return config_class
+
+
 class ConfigFormat(str, enum.Enum):
     AUTO = "auto"
     HF = "hf"
@@ -131,11 +125,11 @@ def lookup_files() -> list[str]:
                 return modelscope_list_repo_files(repo_id,
                                                   revision=revision,
                                                   token=token)
-            return hf_list_repo_files(repo_id,
-                                      revision=revision,
-                                      repo_type=repo_type,
-                                      token=token)
-        except huggingface_hub.errors.OfflineModeIsEnabled:
+            return hfhub.list_repo_files(repo_id,
+                                         revision=revision,
+                                         repo_type=repo_type,
+                                         token=token)
+        except hfhub.errors.OfflineModeIsEnabled:
             # Don't raise in offline mode,
             # all we know is that we don't have this
             # file cached.
@@ -166,9 +160,9 @@ def file_or_path_exists(model: Union[str, Path], config_name: str,
         return (local_path / config_name).is_file()
 
     # Offline mode support: Check if config file is cached already
-    cached_filepath = try_to_load_from_cache(repo_id=model,
-                                             filename=config_name,
-                                             revision=revision)
+    cached_filepath = hfhub.try_to_load_from_cache(repo_id=model,
+                                                   filename=config_name,
+                                                   revision=revision)
     if isinstance(cached_filepath, str):
         # The config file exists in cache- we can continue trying to load
         return True
@@ -272,6 +266,7 @@ def get_config(
 
     if config_format == ConfigFormat.AUTO:
         try:
+            from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
             if is_gguf or file_or_path_exists(
                     model, HF_CONFIG_NAME, revision=revision):
                 config_format = ConfigFormat.HF
@@ -300,6 +295,7 @@ def get_config(
             raise ValueError(error_message) from e
 
     if config_format == ConfigFormat.HF:
+        from transformers import PretrainedConfig
         config_dict, _ = PretrainedConfig.get_config_dict(
             model,
             revision=revision,
@@ -311,7 +307,7 @@ def get_config(
         # Use custom model class if it's in our registry
         model_type = config_dict.get("model_type")
         if model_type in _CONFIG_REGISTRY:
-            config_class = _CONFIG_REGISTRY[model_type]
+            config_class = get_config_class(model_type)
             config = config_class.from_pretrained(
                 model,
                 revision=revision,
@@ -321,6 +317,10 @@ def get_config(
             )
         else:
             try:
+                if VLLM_USE_MODELSCOPE:
+                    from modelscope import AutoConfig
+                else:
+                    from transformers import AutoConfig
                 config = AutoConfig.from_pretrained(
                     model,
                     trust_remote_code=trust_remote_code,
@@ -357,6 +357,8 @@ def get_config(
 
     # Special architecture mapping check for GGUF models
     if is_gguf:
+        from transformers.models.auto.modeling_auto import (
+            MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
         if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
             raise RuntimeError(
                 f"Can't get gguf config for {config.model_type}.")
@@ -379,12 +381,12 @@ def try_get_local_file(model: Union[str, Path],
         return file_path
     else:
         try:
-            cached_filepath = try_to_load_from_cache(repo_id=model,
-                                                     filename=file_name,
-                                                     revision=revision)
+            cached_filepath = hfhub.try_to_load_from_cache(repo_id=model,
+                                                           filename=file_name,
+                                                           revision=revision)
             if isinstance(cached_filepath, str):
                 return Path(cached_filepath)
-        except HFValidationError:
+        except hfhub_utils.HFValidationError:
             ...
     return None
 
@@ -412,14 +414,19 @@ def get_hf_file_to_dict(file_name: str,
 
     if file_path is None:
         try:
-            hf_hub_file = hf_hub_download(model, file_name, revision=revision)
-        except huggingface_hub.errors.OfflineModeIsEnabled:
+            hf_hub_file = hfhub.hf_hub_download(model,
+                                                file_name,
+                                                revision=revision)
+        except hfhub.errors.OfflineModeIsEnabled:
             return None
-        except (RepositoryNotFoundError, RevisionNotFoundError,
-                EntryNotFoundError, LocalEntryNotFoundError) as e:
+        except (hfhub_utils.RepositoryNotFoundError,
+                hfhub_utils.RevisionNotFoundError,
+                hfhub_utils.EntryNotFoundError,
+                hfhub_utils.LocalEntryNotFoundError) as e:
-            logger.debug("File or repository not found in hf_hub_download", e)
+            logger.debug(
+                "File or repository not found in hf_hub_download: %s", e)
             return None
-        except HfHubHTTPError as e:
+        except hfhub_utils.HfHubHTTPError as e:
             logger.warning(
                 "Cannot connect to Hugging Face Hub. Skipping file "
                 "download for '%s':",
@@ -724,6 +730,7 @@ def recurse_elems(elem: Any):
     config_dict = recurse_elems(config_dict)
 
     # transform to HF config format
+    from transformers import PretrainedConfig
     if config_type == "multimodal":
         config_dict["text_config"] = PretrainedConfig(
             **config_dict["text_config"])
@@ -745,6 +752,8 @@ def get_hf_image_processor_config(
     # Separate model folder from file path for GGUF models
     if check_gguf_file(model):
         model = Path(model).parent
+    from transformers.models.auto.image_processing_auto import (
+        get_image_processor_config)
     return get_image_processor_config(model,
                                       token=hf_token,
                                       revision=revision,
@@ -775,6 +784,7 @@ def try_get_generation_config(
     trust_remote_code: bool,
     revision: Optional[str] = None,
 ) -> Optional[GenerationConfig]:
+    from transformers import GenerationConfig
     try:
         return GenerationConfig.from_pretrained(
             model,
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index ed2f4b076ded..71855ad36041 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -1,15 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast
 
-from transformers.processing_utils import ProcessorMixin
 from typing_extensions import TypeVar
 
 if TYPE_CHECKING:
+    from transformers.processing_utils import ProcessorMixin
+
     from vllm.config import ModelConfig
 
-_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+_P = TypeVar("_P", bound="ProcessorMixin", default="ProcessorMixin")
 
 
 class HashableDict(dict):
@@ -54,13 +55,16 @@ def get_processor(
     processor_name: str,
     *args: Any,
     trust_remote_code: bool = False,
-    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    processor_cls: Optional[Union[type[_P], tuple[type[_P], ...]]] = None,
     **kwargs: Any,
 ) -> _P:
     """Load a processor for the given model name via HuggingFace."""
     # don't put this import at the top level
     # it will call torch.cuda.device_count()
     from transformers import AutoProcessor
+    from transformers.processing_utils import ProcessorMixin
+    if processor_cls is None:
+        processor_cls = ProcessorMixin
 
     processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or
                          isinstance(processor_cls, tuple) else processor_cls)
@@ -95,14 +99,19 @@ def get_processor(
     return processor
 
 
-cached_get_processor = lru_cache(get_processor)
+cached_get_processor: Callable = lru_cache(get_processor)
 
 
 def cached_processor_from_config(
     model_config: "ModelConfig",
-    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    processor_cls: Optional[Union[type[_P], tuple[type[_P], ...]]] = None,
     **kwargs: Any,
 ) -> _P:
+
+    from transformers.processing_utils import ProcessorMixin
+    if processor_cls is None:
+        processor_cls = ProcessorMixin
+
     return cached_get_processor(
         model_config.model,
         trust_remote_code=model_config.trust_remote_code,
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 1bfb50328338..97bfc17236c4 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -9,25 +9,25 @@
 from typing import TYPE_CHECKING, Any, Optional, Union
 
 import huggingface_hub
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
-                          PreTrainedTokenizerFast)
 
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
-                                                    TokenizerRegistry)
+from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
 from vllm.transformers_utils.tokenizers import MistralTokenizer
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import make_async
 
 if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+
     from vllm.config import ModelConfig
+    from vllm.transformers_utils.tokenizer_base import TokenizerBase
 
 logger = init_logger(__name__)
 
-AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
-                     TokenizerBase]
+AnyTokenizer = Union["PreTrainedTokenizer", "PreTrainedTokenizerFast",
+                     "TokenizerBase"]
 
 
 def decode_tokens(
@@ -124,12 +124,12 @@ def __len__(self):
     return tokenizer
 
 
-def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None:
+def patch_padding_side(tokenizer: "PreTrainedTokenizer") -> None:
     """Patch _pad method to accept `padding_side` for older tokenizers."""
     orig_pad = tokenizer._pad
 
     def _pad(
-        self: PreTrainedTokenizer,
+        self: "PreTrainedTokenizer",
         *args,
         padding_side: Optional[str] = None,
         **kwargs,
@@ -215,6 +215,7 @@ def get_tokenizer(
                                                     **kwargs)
     else:
         try:
+            from transformers import AutoTokenizer
             tokenizer = AutoTokenizer.from_pretrained(
                 tokenizer_name,
                 *args,
@@ -241,9 +242,10 @@ def get_tokenizer(
         # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
         if type(tokenizer).__name__ in ("ChatGLMTokenizer",
                                         "ChatGLM4Tokenizer"):
+            from transformers import PreTrainedTokenizer
             assert isinstance(tokenizer, PreTrainedTokenizer)
             patch_padding_side(tokenizer)
-
+        from transformers import PreTrainedTokenizerFast
         if not isinstance(tokenizer, PreTrainedTokenizerFast):
             logger.warning(
                 "Using a slow tokenizer. This might cause a significant "
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 58a114fa3a32..9cbfcf0d7c8b 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -7,7 +7,6 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast
 
 import huggingface_hub
-from huggingface_hub import HfApi, hf_hub_download
 
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer_base import TokenizerBase
@@ -236,6 +235,7 @@ def from_pretrained(cls,
     @staticmethod
     def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
                                             revision: Optional[str]) -> str:
+        from huggingface_hub import HfApi, hf_hub_download
         try:
             hf_api = HfApi()
             files = hf_api.list_repo_files(repo_id=tokenizer_name,