From e0bb71615008849424bc8a649363c74f88c1b74c Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Wed, 2 Jul 2025 20:19:27 +0800
Subject: [PATCH 01/27] feat:trace v1

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 vllm/tracing.py                    |  5 ++
 vllm/v1/core/sched/scheduler.py    |  2 +-
 vllm/v1/engine/__init__.py         |  7 ++-
 vllm/v1/engine/output_processor.py | 83 ++++++++++++++++++++++++++++--
 vllm/v1/engine/processor.py        |  2 -
 vllm/v1/request.py                 |  6 ++-
 6 files changed, 94 insertions(+), 11 deletions(-)

diff --git a/vllm/tracing.py b/vllm/tracing.py
index 6a287d82be5f..7537e9901a04 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -119,6 +119,11 @@ class SpanAttributes:
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
     GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
         "gen_ai.latency.time_in_model_execute")
+    GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = \
+        "gen_ai.latency.time_in_model_prefill"
+    GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
+    GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = \
+        "gen_ai.latency.time_in_model_inference"
 
 
 def contains_trace_headers(headers: Mapping[str, str]) -> bool:
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index fe552db74e2f..23b3ace73c7b 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -860,9 +860,9 @@ def update_from_output(
                         stop_reason=request.stop_reason,
                         events=request.take_events(),
                         kv_transfer_params=kv_transfer_params,
+                        trace_headers=request.trace_headers,
                         num_cached_tokens=request.num_cached_tokens,
                     ))
-
             else:
                 # Invariant: EngineCore returns no partial prefill outputs.
                 assert not prompt_logprobs_tensors
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 921ccd708cdd..58aca430e7ee 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -3,7 +3,7 @@
 
 import enum
 import time
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from typing import Any, Optional, Union
 
 import msgspec
@@ -70,6 +70,8 @@ class EngineCoreRequest(
     current_wave: int = 0
     priority: int = 0
 
+    trace_headers: Optional[Mapping[str, str]] = None
+
 
 class EngineCoreEventType(enum.IntEnum):
     """The type of engine core request event."""
@@ -115,6 +117,7 @@ class EngineCoreOutput(
     events: Optional[list[EngineCoreEvent]] = None
     kv_transfer_params: Optional[dict[str, Any]] = None
 
+    trace_headers: Optional[Mapping[str, str]] = None
     # The number of tokens with prefix cache hits.
     num_cached_tokens: int = 0
 
@@ -141,7 +144,7 @@ class EngineCoreOutputs(
         omit_defaults=True,  # type: ignore[call-arg]
         gc=False):  # type: ignore[call-arg]
 
-    #NOTE(Nick): We could consider ways to make this more compact,
+    # NOTE(Nick): We could consider ways to make this more compact,
     # e.g. columnwise layout
 
     engine_index: int = 0
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 2bcd61d1f0aa..b8cfc2c133d8 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -2,15 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+import time
 from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Any, Optional, Union, cast
 
 import torch
 
+from vllm.config import ObservabilityConfig
 from vllm.outputs import (CompletionOutput, PoolingOutput,
                           PoolingRequestOutput, RequestOutput)
 from vllm.sampling_params import RequestOutputKind
+from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
+                          init_tracer)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
@@ -274,16 +278,26 @@ def _new_pooling_output(
 class OutputProcessor:
     """Process EngineCoreOutputs into RequestOutputs."""
 
-    def __init__(
-        self,
-        tokenizer: TokenizerGroup,
-        log_stats: bool,
-    ):
+    def __init__(self,
+                 tokenizer: TokenizerGroup,
+                 log_stats: bool,
+                 observability_config: Optional[ObservabilityConfig] = None):
         self.log_stats = log_stats
         self.tokenizer = tokenizer
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
         self.lora_states = LoRARequestStates()
+        self.observability_config = observability_config
+
+        self.tracer = None
+        if (self.observability_config is not None
+                and self.observability_config.otlp_traces_endpoint):
+            self.tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+
+    def is_tracing_enabled(self) -> bool:
+        return self.tracer is not None
 
     def get_num_unfinished_requests(self):
         return len(self.request_states)
@@ -440,6 +454,65 @@ def process_outputs(
             reqs_to_abort=reqs_to_abort,
         )
 
+    def do_tracing(self, engine_core_output: EngineCoreOutput,
+                   req_state: RequestState,
+                   iteration_stats: Optional[IterationStats]):
+        if (engine_core_output.finish_reason is None or iteration_stats is None
+                or req_state is None or req_state.stats is None
+                or self.tracer is None):
+            return
+        arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
+
+        trace_context = extract_trace_context(engine_core_output.trace_headers)
+        with self.tracer.start_as_current_span(
+                "llm_request",
+                kind=SpanKind.SERVER,
+                context=trace_context,
+                start_time=arrival_time_nano_seconds) as span:
+            metrics = req_state.stats
+            ttft = metrics.first_token_ts - metrics.arrival_time
+            e2e_time = time.time() - metrics.arrival_time
+            # Queued interval is from first QUEUED event to first SCHEDULED
+            queued_time = metrics.scheduled_ts - metrics.queued_ts
+
+            # Prefill interval is from first SCHEDULED to first NEW_TOKEN
+            # Any preemptions during prefill is included in the interval
+            prefill_time = metrics.first_token_ts - metrics.scheduled_ts
+
+            # Decode interval is from first NEW_TOKEN to last NEW_TOKEN
+            # Any preemptions during decode are included
+            decode_time = metrics.last_token_ts - metrics.first_token_ts
+
+            # Inference interval is from first SCHEDULED to last NEW_TOKEN
+            # Any preemptions during prefill or decode are included
+            inference_time = metrics.last_token_ts - metrics.scheduled_ts
+            span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
+                               self.tokenizer.tokenizer_id)
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
+                               req_state.request_id)
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
+                               req_state.max_tokens_param)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
+                               len(req_state.prompt_token_ids))
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+                               metrics.num_generation_tokens)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                               metrics.queued_ts - metrics.arrival_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                               queued_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
+                prefill_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
+                decode_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
+                inference_time)
+
     def _update_stats_from_output(self, req_state: RequestState,
                                   engine_core_output: EngineCoreOutput,
                                   engine_core_timestamp: Optional[float],
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 7e7703df2cf1..dafb4bc4a953 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -225,8 +225,6 @@ def process_inputs(
         # TODO(woosuk): Support encoder-decoder models.
         self._validate_lora(lora_request)
         self._validate_params(params, lora_request)
-        if trace_headers is not None:
-            raise ValueError("V1 does not support tracing yet.")
         if prompt_adapter_request is not None:
             raise ValueError("V1 does not support prompt_adapter_request.")
 
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 9b96f4599f92..a78099e3bf66 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -3,6 +3,7 @@
 
 import enum
 import time
+from collections.abc import Mapping
 from typing import TYPE_CHECKING, Any, Optional, Union
 
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
@@ -36,6 +37,7 @@ def __init__(
         structured_output_request: Optional["StructuredOutputRequest"] = None,
         cache_salt: Optional[str] = None,
         priority: int = 0,
+        trace_headers: Optional[Mapping[str, str]] = None,
     ) -> None:
         self.request_id = request_id
         self.client_index = client_index
@@ -98,7 +100,8 @@ def __init__(
         # they should also be updated simultaneously.
         self.output_token_ids = ConstantList(self._output_token_ids)
         self.all_token_ids = ConstantList(self._all_token_ids)
-
+        # trace_headers
+        self.trace_headers = trace_headers
         # State
         # The number of tokens with prefix cache hits.
         self.num_cached_tokens = -1
@@ -131,6 +134,7 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
                     if request.sampling_params else None,
             cache_salt=request.cache_salt,
             priority=request.priority,
+            trace_headers=request.trace_headers,
         )
 
     def append_output_token_ids(

From 440ca598cfe00d2257036ec945f51921debeacec Mon Sep 17 00:00:00 2001
From: Ye Zhang <zhysishu@gmail.com>
Date: Wed, 2 Jul 2025 20:55:34 +0800
Subject: [PATCH 02/27] fix: ttft calculation

Signed-off-by: Ye Zhang <zhysishu@gmail.com>
---
 vllm/engine/arg_utils.py           |  6 ---
 vllm/v1/engine/async_llm.py        |  9 +++-
 vllm/v1/engine/output_processor.py | 84 +++++++++++-------------------
 vllm/v1/engine/processor.py        |  1 +
 vllm/v1/metrics/stats.py           |  4 ++
 5 files changed, 42 insertions(+), 62 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 38f82e64de53..30d92bb5dca2 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1412,12 +1412,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
                                recommend_to_remove=False)
             return False
 
-        # No OTLP observability so far.
-        if (self.otlp_traces_endpoint or self.collect_detailed_traces):
-            _raise_or_fallback(feature_name="--otlp-traces-endpoint",
-                               recommend_to_remove=False)
-            return False
-
         # V1 supports N-gram, Medusa, and Eagle speculative decoding.
         is_ngram_enabled = False
         is_eagle_enabled = False
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 3754570dfaaa..715dac0e14e5 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -21,6 +21,7 @@
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
+from vllm.tracing import init_tracer
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -91,6 +92,7 @@ def __init__(
 
         self.model_config = vllm_config.model_config
         self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
         self.log_requests = log_requests
         self.log_stats = log_stats
 
@@ -118,6 +120,11 @@ def __init__(
         # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
         self.output_processor = OutputProcessor(self.tokenizer,
                                                 log_stats=self.log_stats)
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
 
         # EngineCore (starts the engine in background process).
 
@@ -539,7 +546,7 @@ async def get_tokenizer(
         return self.tokenizer.get_lora_tokenizer(lora_request)
 
     async def is_tracing_enabled(self) -> bool:
-        return False
+        return self.observability_config.otlp_traces_endpoint is not None
 
     async def do_log_stats(
         self,
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index b8cfc2c133d8..79a77c51ca3b 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -13,8 +13,7 @@
 from vllm.outputs import (CompletionOutput, PoolingOutput,
                           PoolingRequestOutput, RequestOutput)
 from vllm.sampling_params import RequestOutputKind
-from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
-                          init_tracer)
+from vllm.tracing import (Tracer, SpanAttributes, SpanKind, extract_trace_context)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
@@ -288,13 +287,7 @@ def __init__(self,
         self.parent_requests: dict[str, ParentRequest] = {}
         self.lora_states = LoRARequestStates()
         self.observability_config = observability_config
-
-        self.tracer = None
-        if (self.observability_config is not None
-                and self.observability_config.otlp_traces_endpoint):
-            self.tracer = init_tracer(
-                "vllm.llm_engine",
-                self.observability_config.otlp_traces_endpoint)
+        self.tracer: Optional[Tracer] = None
 
     def is_tracing_enabled(self) -> bool:
         return self.tracer is not None
@@ -446,7 +439,8 @@ def process_outputs(
                 # Track per-request stats
                 self._update_stats_from_finished(req_state, finish_reason,
                                                  iteration_stats)
-
+                if self.tracer:
+                    self.do_tracing(engine_core_output, req_state, iteration_stats)
         self.lora_states.update_iteration_stats(iteration_stats)
 
         return OutputProcessorOutput(
@@ -454,15 +448,14 @@ def process_outputs(
             reqs_to_abort=reqs_to_abort,
         )
 
-    def do_tracing(self, engine_core_output: EngineCoreOutput,
-                   req_state: RequestState,
-                   iteration_stats: Optional[IterationStats]):
-        if (engine_core_output.finish_reason is None or iteration_stats is None
-                or req_state is None or req_state.stats is None
-                or self.tracer is None):
-            return
-        arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
+    def do_tracing(self,
+               engine_core_output: EngineCoreOutput,
+               req_state: RequestState,
+               iteration_stats: Optional[IterationStats]) -> None:
+        assert req_state.stats is not None
+        assert iteration_stats is not None
 
+        arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
         trace_context = extract_trace_context(engine_core_output.trace_headers)
         with self.tracer.start_as_current_span(
                 "llm_request",
@@ -470,48 +463,29 @@ def do_tracing(self, engine_core_output: EngineCoreOutput,
                 context=trace_context,
                 start_time=arrival_time_nano_seconds) as span:
             metrics = req_state.stats
-            ttft = metrics.first_token_ts - metrics.arrival_time
-            e2e_time = time.time() - metrics.arrival_time
-            # Queued interval is from first QUEUED event to first SCHEDULED
+            e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
             queued_time = metrics.scheduled_ts - metrics.queued_ts
-
-            # Prefill interval is from first SCHEDULED to first NEW_TOKEN
-            # Any preemptions during prefill is included in the interval
             prefill_time = metrics.first_token_ts - metrics.scheduled_ts
-
-            # Decode interval is from first NEW_TOKEN to last NEW_TOKEN
-            # Any preemptions during decode are included
             decode_time = metrics.last_token_ts - metrics.first_token_ts
-
-            # Inference interval is from first SCHEDULED to last NEW_TOKEN
-            # Any preemptions during prefill or decode are included
             inference_time = metrics.last_token_ts - metrics.scheduled_ts
-            span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
-                               self.tokenizer.tokenizer_id)
-            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
-                               req_state.request_id)
-            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
-                               req_state.max_tokens_param)
-            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
-                               len(req_state.prompt_token_ids))
-            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
-                               metrics.num_generation_tokens)
-            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
-                               metrics.queued_ts - metrics.arrival_time)
-            span.set_attribute(
-                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, metrics.first_token_latency)
             span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
-            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
-                               queued_time)
-            span.set_attribute(
-                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
-                prefill_time)
-            span.set_attribute(
-                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
-                decode_time)
-            span.set_attribute(
-                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
-                inference_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, len(req_state.prompt_token_ids))
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, metrics.num_generation_tokens)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, prefill_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, decode_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, inference_time)
+
+            # meta
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id)
+            if req_state.parent_req and req_state.parent_req.sampling_params:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.parent_req.sampling_params.top_p)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
+                                   req_state.parent_req.sampling_params.max_tokens)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
+                                   req_state.parent_req.sampling_params.temperature)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.parent_req.sampling_params.n)
 
     def _update_stats_from_output(self, req_state: RequestState,
                                   engine_core_output: EngineCoreOutput,
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index dafb4bc4a953..5ca4aa75f31b 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -344,6 +344,7 @@ def process_inputs(
             cache_salt=decoder_inputs.get("cache_salt"),
             priority=priority,
             data_parallel_rank=data_parallel_rank,
+            trace_headers=trace_headers,
         )
 
     def _validate_model_inputs(self,
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 1eb10ccb6c49..8d8cd1663d6f 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -64,6 +64,9 @@ class RequestStateStats:
     first_token_ts: float = 0.0
     last_token_ts: float = 0.0
 
+    # first token latency
+    first_token_latency: float = 0.0
+
 
 @dataclass
 class FinishedRequestStats:
@@ -112,6 +115,7 @@ def update_from_output(self, output: "EngineCoreOutput",
 
             first_token_latency = self._time_since(req_stats.arrival_time)
             self.time_to_first_tokens_iter.append(first_token_latency)
+            req_stats.first_token_latency = first_token_latency
 
         req_stats.num_generation_tokens += num_new_generation_tokens
 

From 8afb03e170d50b2f5a72324e57b111a8ffa09f3e Mon Sep 17 00:00:00 2001
From: Ye Zhang <zhysishu@gmail.com>
Date: Mon, 4 Aug 2025 20:07:38 +0800
Subject: [PATCH 03/27] fix: merge error by accident

Signed-off-by: Ye Zhang <zhysishu@gmail.com>
---
 vllm/v1/engine/output_processor.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 79a77c51ca3b..cabb0d99b529 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -279,19 +279,14 @@ class OutputProcessor:
 
     def __init__(self,
                  tokenizer: TokenizerGroup,
-                 log_stats: bool,
-                 observability_config: Optional[ObservabilityConfig] = None):
+                 log_stats: bool):
         self.log_stats = log_stats
         self.tokenizer = tokenizer
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
         self.lora_states = LoRARequestStates()
-        self.observability_config = observability_config
         self.tracer: Optional[Tracer] = None
 
-    def is_tracing_enabled(self) -> bool:
-        return self.tracer is not None
-
     def get_num_unfinished_requests(self):
         return len(self.request_states)
 

From b5c27ed9dbae31beecae71245b93c8849a32327f Mon Sep 17 00:00:00 2001
From: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com>
Date: Thu, 7 Aug 2025 09:50:20 +0800
Subject: [PATCH 04/27] Update vllm/v1/engine/async_llm.py

Co-authored-by: Benjamin Bartels <benjamin@bartels.dev>
Signed-off-by: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com>
---
 vllm/v1/engine/async_llm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 4a99e7588c24..2fbc9306c0cc 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -22,6 +22,7 @@
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tracing import init_tracer
+from vllm.tasks import SupportedTask
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import AnyTokenizer

From 7b1de1c7d5ddd18bd79c966ab8c50cbaac938ed5 Mon Sep 17 00:00:00 2001
From: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com>
Date: Thu, 7 Aug 2025 18:18:55 +0800
Subject: [PATCH 05/27] Update vllm/v1/engine/output_processor.py

Co-authored-by: Benjamin Bartels <benjamin@bartels.dev>
Signed-off-by: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com>
---
 vllm/v1/engine/output_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 9274a3d1605a..74501623f15c 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -451,6 +451,7 @@ def do_tracing(self,
                iteration_stats: Optional[IterationStats]) -> None:
         assert req_state.stats is not None
         assert iteration_stats is not None
+        assert self.tracer is not None
 
         arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
         trace_context = extract_trace_context(engine_core_output.trace_headers)

From cdf0d9ffd5ccac71e883baa842bd6bf991181313 Mon Sep 17 00:00:00 2001
From: Ye Zhang <zhysishu@gmail.com>
Date: Thu, 7 Aug 2025 20:16:02 +0800
Subject: [PATCH 06/27] fix: gen meta directly from
 enginecorequest.sampling_params

Signed-off-by: Ye Zhang <zhysishu@gmail.com>
---
 vllm/v1/engine/output_processor.py | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index cabb0d99b529..6fc0fbed8e43 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -96,6 +96,9 @@ def __init__(
         arrival_time: float,
         queue: Optional[RequestOutputCollector],
         log_stats: bool,
+        top_p: Optional[float] = None,
+        n: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         self.request_id = request_id
         self.parent_req = parent_req
@@ -108,6 +111,9 @@ def __init__(
         self.logprobs_processor = logprobs_processor
         self.detokenizer = detokenizer
         self.max_tokens_param = max_tokens_param
+        self.top_p = top_p
+        self.n = n
+        self.temperature = temperature
         self.is_prefilling = True
         self.queue = queue
 
@@ -139,10 +145,16 @@ def from_new_request(
                 request=request,
             )
             max_tokens_param = sampling_params.max_tokens
+            top_p = sampling_params.top_p
+            n = sampling_params.n
+            temperature = sampling_params.temperature
         else:
             logprobs_processor = None
             detokenizer = None
             max_tokens_param = None
+            top_p = None
+            n = None
+            temperature = None
             assert request.pooling_params is not None
             output_kind = request.pooling_params.output_kind
 
@@ -158,6 +170,9 @@ def from_new_request(
             logprobs_processor=logprobs_processor,
             detokenizer=detokenizer,
             max_tokens_param=max_tokens_param,
+            top_p=top_p,
+            n=n,
+            temperature=temperature,
             arrival_time=request.arrival_time,
             queue=queue,
             log_stats=log_stats,
@@ -474,13 +489,16 @@ def do_tracing(self,
 
             # meta
             span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id)
-            if req_state.parent_req and req_state.parent_req.sampling_params:
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.parent_req.sampling_params.top_p)
+            if req_state.top_p:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p)
+            if req_state.max_tokens_param:
                 span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
-                                   req_state.parent_req.sampling_params.max_tokens)
+                                   req_state.max_tokens_param)
+            if req_state.temperature:
                 span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
-                                   req_state.parent_req.sampling_params.temperature)
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.parent_req.sampling_params.n)
+                                   req_state.temperature)
+            if req_state.n:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.n)
 
     def _update_stats_from_output(self, req_state: RequestState,
                                   engine_core_output: EngineCoreOutput,

From 8e3887cac5d6374eb0c44b1d5e27433a604399a0 Mon Sep 17 00:00:00 2001
From: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com>
Date: Fri, 15 Aug 2025 17:10:48 +0800
Subject: [PATCH 07/27] Update vllm/v1/engine/processor.py

Co-authored-by: Benjamin Bartels <benjamin@bartels.dev>
Signed-off-by: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com>
---
 vllm/v1/engine/processor.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 120fb29a27c6..55f1408e4be4 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -240,8 +240,6 @@ def process_inputs(
         # TODO(woosuk): Support encoder-decoder models.
         self._validate_lora(lora_request)
         self._validate_params(params, lora_request)
-        if prompt_adapter_request is not None:
-            raise ValueError("V1 does not support prompt_adapter_request.")
 
         data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
         if data_parallel_rank is not None and not (0 <= data_parallel_rank <

From 6bea3fa384c02fdbeb84ec8edc7314e5c674f8a1 Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Fri, 15 Aug 2025 18:05:39 +0800
Subject: [PATCH 08/27] fix:pre-commit

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 vllm/v1/engine/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index cbe649e02e4f..55fd04064010 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -3,7 +3,7 @@
 
 import enum
 import time
-from collections.abc import Mapping, Sequence
+from collections.abc import Mapping
 from typing import Any, Optional, Union
 
 import msgspec

From dd8c2a09e143d24430ee815ebb50672b91947efb Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Wed, 20 Aug 2025 14:18:59 +0800
Subject: [PATCH 09/27] fix:pre-commit

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 vllm/v1/engine/__init__.py         |  2 +-
 vllm/v1/engine/async_llm.py        |  2 +-
 vllm/v1/engine/output_processor.py | 71 ++++++++++++++++++------------
 3 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 9cfc4efffe7f..9183c6af0810 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -3,7 +3,7 @@
 
 import enum
 import time
-from collections.abc import Sequence,Mapping
+from collections.abc import Mapping, Sequence
 from typing import Any, Optional, Union
 
 import msgspec
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index e0ee2a14cc4b..d041dbfd9027 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -24,8 +24,8 @@
 from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
-from vllm.tracing import init_tracer
 from vllm.tasks import SupportedTask
+from vllm.tracing import init_tracer
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 871b274cf798..5132237427b7 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -2,18 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
-import time
 from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Any, Optional, Union, cast
 
 import torch
 
-from vllm.config import ObservabilityConfig
 from vllm.outputs import (CompletionOutput, PoolingOutput,
                           PoolingRequestOutput, RequestOutput)
 from vllm.sampling_params import RequestOutputKind
-from vllm.tracing import (Tracer, SpanAttributes, SpanKind, extract_trace_context)
+from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
+                          extract_trace_context)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
@@ -74,7 +73,6 @@ def get_nowait(
 
 @dataclass
 class OutputProcessorOutput:
-
     request_outputs: list[Union[RequestOutput, PoolingRequestOutput]]
     reqs_to_abort: list[str]
 
@@ -277,9 +275,7 @@ def _new_pooling_output(
 class OutputProcessor:
     """Process EngineCoreOutputs into RequestOutputs."""
 
-    def __init__(self,
-                 tokenizer: TokenizerGroup,
-                 log_stats: bool):
+    def __init__(self, tokenizer: TokenizerGroup, log_stats: bool):
         self.log_stats = log_stats
         self.tokenizer = tokenizer
         self.request_states: dict[str, RequestState] = {}
@@ -444,7 +440,8 @@ def process_outputs(
                 self._update_stats_from_finished(req_state, finish_reason,
                                                  iteration_stats)
                 if self.tracer:
-                    self.do_tracing(engine_core_output, req_state, iteration_stats)
+                    self.do_tracing(engine_core_output, req_state,
+                                    iteration_stats)
         self.lora_states.update_iteration_stats(iteration_stats)
 
         return OutputProcessorOutput(
@@ -452,45 +449,61 @@ def process_outputs(
             reqs_to_abort=reqs_to_abort,
         )
 
-    def do_tracing(self,
-               engine_core_output: EngineCoreOutput,
-               req_state: RequestState,
-               iteration_stats: Optional[IterationStats]) -> None:
+    def do_tracing(self, engine_core_output: EngineCoreOutput,
+                   req_state: RequestState,
+                   iteration_stats: Optional[IterationStats]) -> None:
         assert req_state.stats is not None
         assert iteration_stats is not None
         assert self.tracer is not None
 
         arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
         trace_context = extract_trace_context(engine_core_output.trace_headers)
-        with self.tracer.start_as_current_span(
+        with (self.tracer.start_as_current_span(
                 "llm_request",
                 kind=SpanKind.SERVER,
                 context=trace_context,
-                start_time=arrival_time_nano_seconds) as span:
+                start_time=arrival_time_nano_seconds) as span):
             metrics = req_state.stats
-            e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
+            e2e_time = iteration_stats.iteration_timestamp - \
+                       metrics.arrival_time
             queued_time = metrics.scheduled_ts - metrics.queued_ts
             prefill_time = metrics.first_token_ts - metrics.scheduled_ts
             decode_time = metrics.last_token_ts - metrics.first_token_ts
             inference_time = metrics.last_token_ts - metrics.scheduled_ts
-            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, metrics.first_token_latency)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
+                metrics.first_token_latency)
             span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
-            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time)
-            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, len(req_state.prompt_token_ids))
-            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, metrics.num_generation_tokens)
-            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, prefill_time)
-            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, decode_time)
-            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, inference_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                               queued_time)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
+                               len(req_state.prompt_token_ids))
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+                               metrics.num_generation_tokens)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
+                prefill_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
+                decode_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
+                inference_time)
 
             # meta
-            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id)
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
+                               req_state.request_id)
             if req_state.parent_req and req_state.parent_req.sampling_params:
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.parent_req.sampling_params.top_p)
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
-                                   req_state.parent_req.sampling_params.max_tokens)
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
-                                   req_state.parent_req.sampling_params.temperature)
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.parent_req.sampling_params.n)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
+                                   req_state.parent_req.sampling_params.top_p)
+                span.set_attribute(
+                    SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
+                    req_state.parent_req.sampling_params.max_tokens)
+                span.set_attribute(
+                    SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
+                    req_state.parent_req.sampling_params.temperature)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
+                                   req_state.parent_req.sampling_params.n)
 
     def _update_stats_from_output(self, req_state: RequestState,
                                   engine_core_output: EngineCoreOutput,

From 66992964ce535e9710ac605db8c2ff98bb9deaa3 Mon Sep 17 00:00:00 2001
From: simon-mo <simon.mo@hey.com>
Date: Sun, 24 Aug 2025 15:43:28 -0700
Subject: [PATCH 10/27] remove v0 guard for tests

Signed-off-by: simon-mo <simon.mo@hey.com>
---
 tests/tracing/test_tracing.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 4dbae7c15de3..3d31df728f46 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -22,18 +22,6 @@
 from vllm import LLM, SamplingParams
 from vllm.tracing import SpanAttributes
 
-
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch: pytest.MonkeyPatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    with monkeypatch.context() as m:
-        m.setenv('VLLM_USE_V1', '0')
-        yield
-
-
 FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
 
 FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',

From 516b954db17d047bdd7190d314dd7ba51df8f727 Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Mon, 25 Aug 2025 19:34:12 +0800
Subject: [PATCH 11/27] change: test_tracing.py gpu_memory_utilization=0.3 to
 avoid oom

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 tests/tracing/test_tracing.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 3d31df728f46..407649aacd48 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -86,10 +86,9 @@ def test_traces(
             max_tokens=256,
         )
         model = "facebook/opt-125m"
-        llm = LLM(
-            model=model,
-            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-        )
+        llm = LLM(model=model,
+                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+                  gpu_memory_utilization=0.3)
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
 
@@ -159,11 +158,10 @@ def test_traces_with_detailed_steps(
             max_tokens=256,
         )
         model = "facebook/opt-125m"
-        llm = LLM(
-            model=model,
-            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-            collect_detailed_traces=["all"],
-        )
+        llm = LLM(model=model,
+                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+                  collect_detailed_traces=["all"],
+                  gpu_memory_utilization=0.3)
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
 

From 71012c0cdf3d4dccbb770bbc4f2eec14e6be7ec2 Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Mon, 25 Aug 2025 21:06:27 +0800
Subject: [PATCH 12/27] test: timeout to 10

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 tests/tracing/test_tracing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 407649aacd48..846d43c1dbdd 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -92,7 +92,7 @@ def test_traces(
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
 
-        timeout = 5
+        timeout = 10
         if not trace_service.evt.wait(timeout):
             raise TimeoutError(
                 f"The fake trace service didn't receive a trace within "
@@ -165,7 +165,7 @@ def test_traces_with_detailed_steps(
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
 
-        timeout = 5
+        timeout = 10
         if not trace_service.evt.wait(timeout):
             raise TimeoutError(
                 f"The fake trace service didn't receive a trace within "

From baa6b8536a8da0cdc1b4fcb857ea657ae2032dc5 Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Tue, 26 Aug 2025 20:28:06 +0800
Subject: [PATCH 13/27] change: set env VLLM_USE_V1 1

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 tests/tracing/test_tracing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 846d43c1dbdd..75ddf600ba07 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -79,7 +79,7 @@ def test_traces(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-
+        m.setenv("VLLM_USE_V1", "1")
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,
@@ -151,7 +151,7 @@ def test_traces_with_detailed_steps(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-
+        m.setenv("VLLM_USE_V1", "1")
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,

From 05b2e694db289458575ca728895d25990ee2084c Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Mon, 1 Sep 2025 15:19:12 +0800
Subject: [PATCH 14/27] test: set env VLLM_USE_V1 0

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 tests/tracing/test_tracing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 75ddf600ba07..3a1f1b55d2df 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -79,7 +79,7 @@ def test_traces(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_USE_V1", "0")
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,
@@ -151,7 +151,7 @@ def test_traces_with_detailed_steps(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_USE_V1", "0")
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,

From e1113e93dc41a32add0ea396c20d16c1269cf4e7 Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Mon, 1 Sep 2025 17:51:45 +0800
Subject: [PATCH 15/27] test: set env VLLM_USE_V1 1

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 tests/tracing/test_tracing.py      | 5 +++--
 vllm/v1/engine/async_llm.py        | 1 +
 vllm/v1/engine/output_processor.py | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 3a1f1b55d2df..be92c5359147 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -79,7 +79,7 @@ def test_traces(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "0")
+        m.setenv("VLLM_USE_V1", "1")
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,
@@ -91,6 +91,7 @@ def test_traces(
                   gpu_memory_utilization=0.3)
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
+        print(f"test_traces outputs is : {outputs}")
 
         timeout = 10
         if not trace_service.evt.wait(timeout):
@@ -151,7 +152,7 @@ def test_traces_with_detailed_steps(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "0")
+        m.setenv("VLLM_USE_V1", "1")
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 079faa530586..794da8b00887 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -125,6 +125,7 @@ def __init__(
             tracer = init_tracer(
                 "vllm.llm_engine",
                 self.observability_config.otlp_traces_endpoint)
+            logger.debug("Tracer initialized. tracer: %s", tracer)
             self.output_processor.tracer = tracer
 
         # EngineCore (starts the engine in background process).
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 5132237427b7..9b3e33cab00a 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -439,7 +439,9 @@ def process_outputs(
                 # Track per-request stats
                 self._update_stats_from_finished(req_state, finish_reason,
                                                  iteration_stats)
+                print(f"self.tracer: {self.tracer}")
                 if self.tracer:
+                    print(f"engine_core_output: {engine_core_output}")
                     self.do_tracing(engine_core_output, req_state,
                                     iteration_stats)
         self.lora_states.update_iteration_stats(iteration_stats)

From 81decbd04a3731c8d13c6855a03c886da4a3bf45 Mon Sep 17 00:00:00 2001
From: Ye Zhang <zhysishu@gmail.com>
Date: Tue, 2 Sep 2025 11:48:26 +0800
Subject: [PATCH 16/27] fix: tracing ut - tracer not initialized

Signed-off-by: Ye Zhang <zhysishu@gmail.com>
---
 vllm/v1/engine/llm_engine.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index a2328c37ba0c..1e28aee42497 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -19,6 +19,7 @@
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
+from vllm.tracing import init_tracer
 from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
@@ -65,6 +66,7 @@ def __init__(
                 "Set VLLM_USE_V1=0 and file and issue on Github.")
 
         self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
 
@@ -96,6 +98,11 @@ def __init__(
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
         self.output_processor = OutputProcessor(self.tokenizer,
                                                 log_stats=self.log_stats)
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(

From eedf2078fa34ac1f69f20615cba3cea3b9f45c74 Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Tue, 2 Sep 2025 12:03:38 +0800
Subject: [PATCH 17/27] test:

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 vllm/v1/engine/async_llm.py        | 1 -
 vllm/v1/engine/output_processor.py | 2 --
 2 files changed, 3 deletions(-)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 794da8b00887..079faa530586 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -125,7 +125,6 @@ def __init__(
             tracer = init_tracer(
                 "vllm.llm_engine",
                 self.observability_config.otlp_traces_endpoint)
-            logger.debug("Tracer initialized. tracer: %s", tracer)
             self.output_processor.tracer = tracer
 
         # EngineCore (starts the engine in background process).
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 9b3e33cab00a..5132237427b7 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -439,9 +439,7 @@ def process_outputs(
                 # Track per-request stats
                 self._update_stats_from_finished(req_state, finish_reason,
                                                  iteration_stats)
-                print(f"self.tracer: {self.tracer}")
                 if self.tracer:
-                    print(f"engine_core_output: {engine_core_output}")
                     self.do_tracing(engine_core_output, req_state,
                                     iteration_stats)
         self.lora_states.update_iteration_stats(iteration_stats)

From 73daf4d57b9bb2f91290bc0532e08bef2a59ad2d Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Tue, 2 Sep 2025 13:40:08 +0800
Subject: [PATCH 18/27] test:disable_log_stats=False

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 tests/tracing/test_tracing.py | 6 ++++--
 vllm/v1/engine/__init__.py    | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index be92c5359147..7737a0ec37f2 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -88,7 +88,8 @@ def test_traces(
         model = "facebook/opt-125m"
         llm = LLM(model=model,
                   otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-                  gpu_memory_utilization=0.3)
+                  gpu_memory_utilization=0.3,
+                  disable_log_stats=False)
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
         print(f"test_traces outputs is : {outputs}")
@@ -162,7 +163,8 @@ def test_traces_with_detailed_steps(
         llm = LLM(model=model,
                   otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
                   collect_detailed_traces=["all"],
-                  gpu_memory_utilization=0.3)
+                  gpu_memory_utilization=0.3,
+                  disable_log_stats=False)
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
 
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 01d7e98f93fe..dec4abec519b 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -3,7 +3,7 @@
 
 import enum
 import time
-from collections.abc import Mapping, Sequence
+from collections.abc import Mapping
 from typing import Any, Optional, Union
 
 import msgspec

From 0623cd7127faab0968b015f0fa8be3a3ef4a4ea5 Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Tue, 2 Sep 2025 13:55:29 +0800
Subject: [PATCH 19/27] test:format

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 vllm/v1/engine/llm_engine.py       | 2 +-
 vllm/v1/engine/output_processor.py | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index cf2ddea52498..fca5a783bc3b 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -18,8 +18,8 @@
 from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
-from vllm.tracing import init_tracer
 from vllm.tasks import SupportedTask
+from vllm.tracing import init_tracer
 from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index ec7b86afa2df..02c8c61cb909 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -506,9 +506,11 @@ def do_tracing(self, engine_core_output: EngineCoreOutput,
                 inference_time)
 
             # meta
-            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id)
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
+                               req_state.request_id)
             if req_state.top_p:
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
+                                   req_state.top_p)
             if req_state.max_tokens_param:
                 span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
                                    req_state.max_tokens_param)
@@ -516,7 +518,8 @@ def do_tracing(self, engine_core_output: EngineCoreOutput,
                 span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
                                    req_state.temperature)
             if req_state.n:
-                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.n)
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
+                                   req_state.n)
 
     def _update_stats_from_output(self, req_state: RequestState,
                                   engine_core_output: EngineCoreOutput,

From 57dbf9f449dbfa5ef76f9ed6ab683455cbaaa094 Mon Sep 17 00:00:00 2001
From: Mu Huai <tianbowen.tbw@antgroup.com>
Date: Tue, 2 Sep 2025 14:42:17 +0800
Subject: [PATCH 20/27] test:no model name

Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
---
 tests/tracing/test_tracing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 7737a0ec37f2..d76f4ea7f57d 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -113,7 +113,7 @@ def test_traces(
 
         attributes = decode_attributes(
             request.resource_spans[0].scope_spans[0].spans[0].attributes)
-        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
         assert attributes.get(
             SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
         assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
@@ -187,7 +187,7 @@ def test_traces_with_detailed_steps(
 
         attributes = decode_attributes(
             request.resource_spans[0].scope_spans[0].spans[0].attributes)
-        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
         assert attributes.get(
             SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
         assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE

From cdb9c4861fd71f22a5bb7b26735e62a476e24638 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=91=9C=E7=90=AE?= <ly186375@antfin.com>
Date: Tue, 9 Sep 2025 17:18:34 +0800
Subject: [PATCH 21/27] add tracing ut for v1

---
 tests/tracing/test_tracing.py    |  43 ++++++----
 tests/v1/tracing/test_tracing.py | 138 +++++++++++++++++++++++++++++++
 2 files changed, 165 insertions(+), 16 deletions(-)
 create mode 100644 tests/v1/tracing/test_tracing.py

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index d76f4ea7f57d..4dbae7c15de3 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -22,6 +22,18 @@
 from vllm import LLM, SamplingParams
 from vllm.tracing import SpanAttributes
 
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
+
+
 FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
 
 FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
@@ -79,22 +91,21 @@ def test_traces(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
+
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,
             max_tokens=256,
         )
         model = "facebook/opt-125m"
-        llm = LLM(model=model,
-                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-                  gpu_memory_utilization=0.3,
-                  disable_log_stats=False)
+        llm = LLM(
+            model=model,
+            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+        )
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
-        print(f"test_traces outputs is : {outputs}")
 
-        timeout = 10
+        timeout = 5
         if not trace_service.evt.wait(timeout):
             raise TimeoutError(
                 f"The fake trace service didn't receive a trace within "
@@ -113,7 +124,7 @@ def test_traces(
 
         attributes = decode_attributes(
             request.resource_spans[0].scope_spans[0].spans[0].attributes)
-        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
         assert attributes.get(
             SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
         assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
@@ -153,22 +164,22 @@ def test_traces_with_detailed_steps(
 ):
     with monkeypatch.context() as m:
         m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
+
         sampling_params = SamplingParams(
             temperature=0.01,
             top_p=0.1,
             max_tokens=256,
         )
         model = "facebook/opt-125m"
-        llm = LLM(model=model,
-                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
-                  collect_detailed_traces=["all"],
-                  gpu_memory_utilization=0.3,
-                  disable_log_stats=False)
+        llm = LLM(
+            model=model,
+            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+            collect_detailed_traces=["all"],
+        )
         prompts = ["This is a short prompt"]
         outputs = llm.generate(prompts, sampling_params=sampling_params)
 
-        timeout = 10
+        timeout = 5
         if not trace_service.evt.wait(timeout):
             raise TimeoutError(
                 f"The fake trace service didn't receive a trace within "
@@ -187,7 +198,7 @@ def test_traces_with_detailed_steps(
 
         attributes = decode_attributes(
             request.resource_spans[0].scope_spans[0].spans[0].attributes)
-        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
         assert attributes.get(
             SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
         assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py
new file mode 100644
index 000000000000..0ccfcb9f7d89
--- /dev/null
+++ b/tests/v1/tracing/test_tracing.py
@@ -0,0 +1,138 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+# type: ignore
+from __future__ import annotations
+
+import threading
+from collections.abc import Iterable
+from concurrent import futures
+from typing import Callable, Generator, Literal
+
+import grpc
+import pytest
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
+    ExportTraceServiceResponse)
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
+    TraceServiceServicer, add_TraceServiceServicer_to_server)
+from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
+from opentelemetry.sdk.environment_variables import (
+    OTEL_EXPORTER_OTLP_TRACES_INSECURE)
+
+from vllm import LLM, SamplingParams
+from vllm.tracing import SpanAttributes
+
+FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
+
+FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
+                    'array_value']
+
+
+def decode_value(value: AnyValue):
+    field_decoders: dict[FieldName, Callable] = {
+        "bool_value": (lambda v: v.bool_value),
+        "string_value": (lambda v: v.string_value),
+        "int_value": (lambda v: v.int_value),
+        "double_value": (lambda v: v.double_value),
+        "array_value":
+        (lambda v: [decode_value(item) for item in v.array_value.values]),
+    }
+    for field, decoder in field_decoders.items():
+        if value.HasField(field):
+            return decoder(value)
+    raise ValueError(f"Couldn't decode value: {value}")
+
+
+def decode_attributes(attributes: Iterable[KeyValue]):
+    return {kv.key: decode_value(kv.value) for kv in attributes}
+
+
+class FakeTraceService(TraceServiceServicer):
+
+    def __init__(self):
+        self.request = None
+        self.evt = threading.Event()
+
+    def Export(self, request, context):
+        self.request = request
+        self.evt.set()
+        return ExportTraceServiceResponse()
+
+
+@pytest.fixture
+def trace_service() -> Generator[FakeTraceService, None, None]:
+    """Fixture to set up a fake gRPC trace service"""
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
+    service = FakeTraceService()
+    add_TraceServiceServicer_to_server(service, server)
+    server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
+    server.start()
+
+    yield service
+
+    server.stop(None)
+
+
+def test_traces(
+    monkeypatch: pytest.MonkeyPatch,
+    trace_service: FakeTraceService,
+):
+    with monkeypatch.context() as m:
+        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
+        m.setenv("VLLM_USE_V1", "1")
+        sampling_params = SamplingParams(
+            temperature=0.01,
+            top_p=0.1,
+            max_tokens=256,
+        )
+        model = "facebook/opt-125m"
+        llm = LLM(model=model,
+                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+                  gpu_memory_utilization=0.3,
+                  disable_log_stats=False)
+        prompts = ["This is a short prompt"]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+        print(f"test_traces outputs is : {outputs}")
+
+        timeout = 10
+        if not trace_service.evt.wait(timeout):
+            raise TimeoutError(
+                f"The fake trace service didn't receive a trace within "
+                f"the {timeout} seconds timeout")
+
+        request = trace_service.request
+        assert len(request.resource_spans) == 1, (
+            f"Expected 1 resource span, "
+            f"but got {len(request.resource_spans)}")
+        assert len(request.resource_spans[0].scope_spans) == 1, (
+            f"Expected 1 scope span, "
+            f"but got {len(request.resource_spans[0].scope_spans)}")
+        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+            f"Expected 1 span, "
+            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+
+        attributes = decode_attributes(
+            request.resource_spans[0].scope_spans[0].spans[0].attributes)
+        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                              ) == sampling_params.temperature
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+                              ) == sampling_params.max_tokens
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+                outputs[0].prompt_token_ids)
+        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
+                              ) > 0
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
\ No newline at end of file

From daa13c8ea0f236754d94d5d7de5f6c8ef484f631 Mon Sep 17 00:00:00 2001
From: chrisyang <ly186375@antfin.com>
Date: Tue, 9 Sep 2025 17:42:11 +0800
Subject: [PATCH 22/27] reformat

---
 tests/v1/tracing/test_tracing.py | 33 +++++++++-----------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py
index 0ccfcb9f7d89..76107b963a09 100644
--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
@@ -42,11 +42,9 @@ def decode_value(value: AnyValue):
             return decoder(value)
     raise ValueError(f"Couldn't decode value: {value}")
 
-
 def decode_attributes(attributes: Iterable[KeyValue]):
     return {kv.key: decode_value(kv.value) for kv in attributes}
 
-
 class FakeTraceService(TraceServiceServicer):
 
     def __init__(self):
@@ -58,7 +56,6 @@ def Export(self, request, context):
         self.evt.set()
         return ExportTraceServiceResponse()
 
-
 @pytest.fixture
 def trace_service() -> Generator[FakeTraceService, None, None]:
     """Fixture to set up a fake gRPC trace service"""
@@ -72,7 +69,6 @@ def trace_service() -> Generator[FakeTraceService, None, None]:
 
     server.stop(None)
 
-
 def test_traces(
     monkeypatch: pytest.MonkeyPatch,
     trace_service: FakeTraceService,
@@ -114,25 +110,14 @@ def test_traces(
         attributes = decode_attributes(
             request.resource_spans[0].scope_spans[0].spans[0].attributes)
         # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
-                              ) == sampling_params.temperature
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
-                              ) == sampling_params.max_tokens
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
-        assert attributes.get(
-            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
-                outputs[0].prompt_token_ids)
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE) == sampling_params.temperature
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+        assert attributes.get( SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(outputs[0].prompt_token_ids)
         completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
-        assert attributes.get(
-            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
-
-        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
-                              ) > 0
-        assert attributes.get(
-            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
+        assert attributes.get(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
         assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
\ No newline at end of file

From bdb8847982856b6779f3bea958f9ab3ee581fea5 Mon Sep 17 00:00:00 2001
From: chrisyang <ly186375@antfin.com>
Date: Tue, 9 Sep 2025 17:42:11 +0800
Subject: [PATCH 23/27] reformat

---
 tests/v1/tracing/test_tracing.py | 33 +++++++++-----------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py
index 0ccfcb9f7d89..76107b963a09 100644
--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
@@ -42,11 +42,9 @@ def decode_value(value: AnyValue):
             return decoder(value)
     raise ValueError(f"Couldn't decode value: {value}")
 
-
 def decode_attributes(attributes: Iterable[KeyValue]):
     return {kv.key: decode_value(kv.value) for kv in attributes}
 
-
 class FakeTraceService(TraceServiceServicer):
 
     def __init__(self):
@@ -58,7 +56,6 @@ def Export(self, request, context):
         self.evt.set()
         return ExportTraceServiceResponse()
 
-
 @pytest.fixture
 def trace_service() -> Generator[FakeTraceService, None, None]:
     """Fixture to set up a fake gRPC trace service"""
@@ -72,7 +69,6 @@ def trace_service() -> Generator[FakeTraceService, None, None]:
 
     server.stop(None)
 
-
 def test_traces(
     monkeypatch: pytest.MonkeyPatch,
     trace_service: FakeTraceService,
@@ -114,25 +110,14 @@ def test_traces(
         attributes = decode_attributes(
             request.resource_spans[0].scope_spans[0].spans[0].attributes)
         # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
-                              ) == sampling_params.temperature
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
-                              ) == sampling_params.max_tokens
-        assert attributes.get(
-            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
-        assert attributes.get(
-            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
-                outputs[0].prompt_token_ids)
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE) == sampling_params.temperature
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+        assert attributes.get( SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(outputs[0].prompt_token_ids)
         completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
-        assert attributes.get(
-            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
-
-        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
-                              ) > 0
-        assert attributes.get(
-            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
+        assert attributes.get(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
         assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
\ No newline at end of file

From 8cf7c88e615ad4c64f58e5b5a2b4e3992537c38f Mon Sep 17 00:00:00 2001
From: chrisyang <ly186375@antfin.com>
Date: Tue, 9 Sep 2025 18:46:38 +0800
Subject: [PATCH 24/27] fix precommit error

---
 tests/v1/tracing/test_tracing.py | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py
index 76107b963a09..ceb46fd06e88 100644
--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
@@ -42,9 +42,11 @@ def decode_value(value: AnyValue):
             return decoder(value)
     raise ValueError(f"Couldn't decode value: {value}")
 
+
 def decode_attributes(attributes: Iterable[KeyValue]):
     return {kv.key: decode_value(kv.value) for kv in attributes}
 
+
 class FakeTraceService(TraceServiceServicer):
 
     def __init__(self):
@@ -56,6 +58,7 @@ def Export(self, request, context):
         self.evt.set()
         return ExportTraceServiceResponse()
 
+
 @pytest.fixture
 def trace_service() -> Generator[FakeTraceService, None, None]:
     """Fixture to set up a fake gRPC trace service"""
@@ -69,6 +72,7 @@ def trace_service() -> Generator[FakeTraceService, None, None]:
 
     server.stop(None)
 
+
 def test_traces(
     monkeypatch: pytest.MonkeyPatch,
     trace_service: FakeTraceService,
@@ -110,14 +114,23 @@ def test_traces(
         attributes = decode_attributes(
             request.resource_spans[0].scope_spans[0].spans[0].attributes)
         # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE) == sampling_params.temperature
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
-        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-        assert attributes.get( SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
-        assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(outputs[0].prompt_token_ids)
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                              ) == sampling_params.temperature
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+                              ) == sampling_params.max_tokens
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+                outputs[0].prompt_token_ids)
         completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
-        assert attributes.get(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
         assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
-        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
-        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
\ No newline at end of file
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0

From 27d6c6978c35c94a0e2734e17dce283b6316c3dd Mon Sep 17 00:00:00 2001
From: Aaron Pham <contact@aarnphm.xyz>
Date: Sat, 6 Sep 2025 23:10:40 -0400
Subject: [PATCH 25/27] [CI][Fix] deterministic seed for flaky CI runs on
 structured outputs (#24380)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
---
 tests/v1/entrypoints/llm/test_struct_output_generate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index cd82eb2ac419..c10b1abb2b3b 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -122,6 +122,7 @@ def test_structured_output(
         guided_decoding_backend=guided_decoding_backend,
         guided_decoding_disable_any_whitespace=(guided_decoding_backend
                                                 in {"xgrammar", "guidance"}),
+        seed=120,
         tokenizer_mode=tokenizer_mode,
         speculative_config=speculative_config)
 

From 1d100d03ec3b4204c02ca52fbf5810607b1fc9cf Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Sun, 7 Sep 2025 19:51:59 -0700
Subject: [PATCH 26/27] [CI/Build] Disable flaky test_structured_output tests
 (#24404)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 tests/v1/entrypoints/llm/test_struct_output_generate.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index c10b1abb2b3b..126d8ce8c8e0 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -46,12 +46,12 @@
     ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None),
     ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None),
     ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None),
-    ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None),
-    ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None),
+    #FIXME: This tests are flaky on CI thus disabled. Tracking in Issue #24402
+    # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None),
+    # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None),
+    #("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"),
     ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto",
      NGRAM_SPEC_CONFIG),
-    #FIXME: This test is flaky on CI thus disabled
-    #("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"),
     ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto",
      NGRAM_SPEC_CONFIG),
     ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG),

From bce28ccfe28d715570990506f634683fa8535a78 Mon Sep 17 00:00:00 2001
From: chrisyang <ly186375@antfin.com>
Date: Wed, 10 Sep 2025 14:21:00 +0800
Subject: [PATCH 27/27] fix trace test pipeline config

---
 .buildkite/test-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8f2f6083b030..a33de4cd73cf 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -205,7 +205,7 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/metrics
-  - tests/tracing
+  - tests/v1/tracing
   commands:
   - pytest -v -s metrics
   - "pip install \