Merged
52 commits
e0bb716
feat:trace v1
Jul 2, 2025
a7414f7
Merge pull request #1 from RichardoMrMu/feat-trace-v1-aftermerge
RichardoMrMu Jul 2, 2025
440ca59
fix: ttft calculation
hcyezhang Jul 2, 2025
a30adc7
Merge pull request #2 from RichardoMrMu/main-ttft-fix
hcyezhang Jul 2, 2025
8afb03e
fix: merge error by accident
hcyezhang Aug 4, 2025
e0af39b
Merge pull request #3 from hcyezhang/main
RichardoMrMu Aug 4, 2025
a5462a1
Merge branch 'main' into fix_conflict
RichardoMrMu Aug 5, 2025
b5c27ed
Update vllm/v1/engine/async_llm.py
RichardoMrMu Aug 7, 2025
7b1de1c
Update vllm/v1/engine/output_processor.py
RichardoMrMu Aug 7, 2025
cdf0d9f
fix: gen meta directly from enginecorequest.sampling_params
hcyezhang Aug 7, 2025
4661667
Merge pull request #4 from hcyezhang/main
hcyezhang Aug 7, 2025
8e3887c
Update vllm/v1/engine/processor.py
RichardoMrMu Aug 15, 2025
3d65643
Merge branch 'main' into fix_conflict
RichardoMrMu Aug 15, 2025
6bea3fa
fix:pre-commit
Aug 15, 2025
1a5af39
Merge pull request #5 from RichardoMrMu/fix_conflict_2
RichardoMrMu Aug 15, 2025
4e623e3
Merge branch 'main' into fix_conflict
RichardoMrMu Aug 20, 2025
dd8c2a0
fix:pre-commit
Aug 20, 2025
47bea22
Merge branch 'main' into fix_conflict
RichardoMrMu Aug 20, 2025
33d736e
Merge branch 'main' into fix_conflict
RichardoMrMu Aug 20, 2025
6699296
remove v0 guard for tests
simon-mo Aug 24, 2025
c182529
Merge branch 'main' into fix_conflict
simon-mo Aug 24, 2025
86e4321
Merge branch 'main' into fix_conflict
RichardoMrMu Aug 25, 2025
516b954
change: test_tracing.py gpu_memory_utilization=0.3 to avoid oom
Aug 25, 2025
71012c0
test: timeout to 10
Aug 25, 2025
baa6b85
change: set env VLLM_USE_V1 1
Aug 26, 2025
05b2e69
test: set env VLLM_USE_V1 0
Sep 1, 2025
5f51aa1
Merge branch 'main' into fix_conflict
RichardoMrMu Sep 1, 2025
e1113e9
test: set env VLLM_USE_V1 1
Sep 1, 2025
81decbd
fix: tracing ut - tracer not initialized
hcyezhang Sep 2, 2025
b0f85e6
Merge branch 'fix_conflict' into main
hcyezhang Sep 2, 2025
38434cd
Merge pull request #6 from hcyezhang/main
RichardoMrMu Sep 2, 2025
eedf207
test:
Sep 2, 2025
28c0de7
Merge remote-tracking branch 'origin/fix_conflict' into fix_conflict
Sep 2, 2025
c255374
Merge branch 'main' into fix_conflict
RichardoMrMu Sep 2, 2025
73daf4d
test:disable_log_stats=False
Sep 2, 2025
0623cd7
test:format
Sep 2, 2025
57dbf9f
test:no model name
Sep 2, 2025
cdb9c48
add tracing ut for v1
ChrisYangAI Sep 9, 2025
2afc5bd
Merge pull request #7 from RichardoMrMu/chris_traceut_fix
RichardoMrMu Sep 9, 2025
daa13c8
reformat
ChrisYangAI Sep 9, 2025
bdb8847
reformat
ChrisYangAI Sep 9, 2025
8cf7c88
fix precommit error
ChrisYangAI Sep 9, 2025
a2b5346
fix precommit error
ChrisYangAI Sep 9, 2025
57c0df6
Merge pull request #8 from RichardoMrMu/chris_traceut_fix
RichardoMrMu Sep 9, 2025
27d6c69
[CI][Fix] deterministic seed for flaky CI runs on structured outputs …
aarnphm Sep 7, 2025
1d100d0
[CI/Build] Disable flaky test_structured_output tests (#24404)
22quinn Sep 8, 2025
a5c7f83
Merge pull request #9 from RichardoMrMu/fix_guidedcodeing_ut_failure
RichardoMrMu Sep 10, 2025
6370955
Merge branch 'main' into fix_conflict
ChrisYangAI Sep 10, 2025
bce28cc
fix trace test pipeline config
ChrisYangAI Sep 10, 2025
204a6b7
Merge pull request #10 from RichardoMrMu/fix_trace_test_config
RichardoMrMu Sep 10, 2025
e03076c
Merge branch 'main' into fix_conflict
ChrisYangAI Sep 10, 2025
23e74d3
Merge branch 'main' into fix_conflict
ChrisYangAI Sep 10, 2025
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
@@ -205,7 +205,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/metrics
- tests/tracing
- tests/v1/tracing
commands:
- pytest -v -s metrics
- "pip install \
137 changes: 137 additions & 0 deletions tests/v1/tracing/test_tracing.py
@@ -0,0 +1,137 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# type: ignore
from __future__ import annotations

import threading
from collections.abc import Iterable
from concurrent import futures
from typing import Callable, Generator, Literal

import grpc
import pytest
from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
ExportTraceServiceResponse)
from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
TraceServiceServicer, add_TraceServiceServicer_to_server)
from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
from opentelemetry.sdk.environment_variables import (
OTEL_EXPORTER_OTLP_TRACES_INSECURE)

from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes

FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"

FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
'array_value']


def decode_value(value: AnyValue):
field_decoders: dict[FieldName, Callable] = {
"bool_value": (lambda v: v.bool_value),
"string_value": (lambda v: v.string_value),
"int_value": (lambda v: v.int_value),
"double_value": (lambda v: v.double_value),
"array_value":
(lambda v: [decode_value(item) for item in v.array_value.values]),
}
for field, decoder in field_decoders.items():
if value.HasField(field):
return decoder(value)
raise ValueError(f"Couldn't decode value: {value}")


def decode_attributes(attributes: Iterable[KeyValue]):
return {kv.key: decode_value(kv.value) for kv in attributes}


class FakeTraceService(TraceServiceServicer):

def __init__(self):
self.request = None
self.evt = threading.Event()

def Export(self, request, context):
self.request = request
self.evt.set()
return ExportTraceServiceResponse()


@pytest.fixture
def trace_service() -> Generator[FakeTraceService, None, None]:
"""Fixture to set up a fake gRPC trace service"""
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
service = FakeTraceService()
add_TraceServiceServicer_to_server(service, server)
server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
server.start()

yield service

server.stop(None)


def test_traces(
monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
m.setenv("VLLM_USE_V1", "1")
sampling_params = SamplingParams(
temperature=0.01,
top_p=0.1,
max_tokens=256,
)
model = "facebook/opt-125m"
llm = LLM(model=model,
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
gpu_memory_utilization=0.3,
disable_log_stats=False)
prompts = ["This is a short prompt"]
outputs = llm.generate(prompts, sampling_params=sampling_params)
print(f"test_traces outputs is : {outputs}")

timeout = 10
if not trace_service.evt.wait(timeout):
raise TimeoutError(
f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout")

request = trace_service.request
assert len(request.resource_spans) == 1, (
f"Expected 1 resource span, "
f"but got {len(request.resource_spans)}")
assert len(request.resource_spans[0].scope_spans) == 1, (
f"Expected 1 scope span, "
f"but got {len(request.resource_spans[0].scope_spans)}")
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
f"Expected 1 span, "
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")

attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes)
# assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
) == sampling_params.temperature
assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
) == sampling_params.max_tokens
assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
assert attributes.get(
SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get(
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens

assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
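
The new test can be run on its own with the same pytest flags the pipeline step uses (assuming a local GPU able to host facebook/opt-125m and the OpenTelemetry test dependencies installed):

pytest -v -s tests/v1/tracing/test_tracing.py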
6 changes: 0 additions & 6 deletions vllm/engine/arg_utils.py
@@ -1478,12 +1478,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
recommend_to_remove=False)
return False

# No OTLP observability so far.
if (self.otlp_traces_endpoint or self.collect_detailed_traces):
_raise_or_fallback(feature_name="--otlp-traces-endpoint",
recommend_to_remove=False)
return False

# V1 supports N-gram, Medusa, and Eagle speculative decoding.
if (self.speculative_config is not None
and self.speculative_config.get("method") == "draft_model"):
5 changes: 5 additions & 0 deletions vllm/tracing.py
@@ -119,6 +119,11 @@ class SpanAttributes:
# forward, block/sync across workers, cpu-gpu sync time and sampling time.
GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
"gen_ai.latency.time_in_model_execute")
GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = \
"gen_ai.latency.time_in_model_prefill"
GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = \
"gen_ai.latency.time_in_model_inference"


def contains_trace_headers(headers: Mapping[str, str]) -> bool:
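
The three new keys extend the existing gen_ai.latency.* attribute namespace. A minimal sketch of how such attributes can be attached to a span with the standard OpenTelemetry API; the metrics object, its field names, and the span name are illustrative assumptions rather than vLLM's actual implementation:

from opentelemetry import trace

from vllm.tracing import SpanAttributes

tracer = trace.get_tracer("vllm.llm_engine")

def record_latency(metrics):
    # `metrics` is a hypothetical stats object with per-phase timings in seconds.
    with tracer.start_as_current_span("llm_request") as span:
        span.set_attribute(
            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
            metrics.prefill_time)
        span.set_attribute(
            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
            metrics.decode_time)
        span.set_attribute(
            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
            metrics.inference_time)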
2 changes: 1 addition & 1 deletion vllm/v1/core/sched/scheduler.py
@@ -965,9 +965,9 @@ def update_from_output(
stop_reason=request.stop_reason,
events=request.take_events(),
kv_transfer_params=kv_transfer_params,
trace_headers=request.trace_headers,
num_cached_tokens=request.num_cached_tokens,
))

else:
# Invariant: EngineCore returns no partial prefill outputs.
assert not prompt_logprobs_tensors
6 changes: 5 additions & 1 deletion vllm/v1/engine/__init__.py
@@ -3,6 +3,7 @@

import enum
import time
from collections.abc import Mapping
from typing import Any, Optional, Union

import msgspec
@@ -66,6 +67,8 @@ class EngineCoreRequest(
current_wave: int = 0
priority: int = 0

trace_headers: Optional[Mapping[str, str]] = None


class EngineCoreEventType(enum.IntEnum):
"""The type of engine core request event."""
@@ -111,6 +114,7 @@ class EngineCoreOutput(
events: Optional[list[EngineCoreEvent]] = None
kv_transfer_params: Optional[dict[str, Any]] = None

trace_headers: Optional[Mapping[str, str]] = None
# The number of tokens with prefix cache hits.
num_cached_tokens: int = 0

@@ -144,7 +148,7 @@ class EngineCoreOutputs(
omit_defaults=True, # type: ignore[call-arg]
gc=False): # type: ignore[call-arg]

#NOTE(Nick): We could consider ways to make this more compact,
# NOTE(Nick): We could consider ways to make this more compact,
# e.g. columnwise layout

engine_index: int = 0
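
trace_headers lets a caller's trace context ride along with EngineCoreRequest and come back on EngineCoreOutput. A sketch, under the assumption that the headers follow standard OpenTelemetry propagation, of how they can be turned back into a parent context so the request span joins the upstream trace (the function and span names are illustrative):

from collections.abc import Mapping
from typing import Optional

from opentelemetry.propagate import extract

def start_request_span(tracer, trace_headers: Optional[Mapping[str, str]]):
    # Rebuild the parent context from the propagated headers, if any.
    parent_ctx = extract(trace_headers) if trace_headers else None
    return tracer.start_span("llm_request", context=parent_ctx)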
9 changes: 8 additions & 1 deletion vllm/v1/engine/async_llm.py
@@ -26,6 +26,7 @@
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tracing import init_tracer
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -97,6 +98,7 @@ def __init__(

self.model_config = vllm_config.model_config
self.vllm_config = vllm_config
self.observability_config = vllm_config.observability_config
self.log_requests = log_requests

self.log_stats = log_stats or (stat_loggers is not None)
@@ -124,6 +126,11 @@ def __init__(
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
self.output_processor = OutputProcessor(self.tokenizer,
log_stats=self.log_stats)
if self.observability_config.otlp_traces_endpoint is not None:
tracer = init_tracer(
"vllm.llm_engine",
self.observability_config.otlp_traces_endpoint)
self.output_processor.tracer = tracer

# EngineCore (starts the engine in background process).
self.engine_core = EngineCoreClient.make_async_mp_client(
@@ -603,7 +610,7 @@ async def get_tokenizer(
return self.tokenizer.get_lora_tokenizer(lora_request)

async def is_tracing_enabled(self) -> bool:
return False
return self.observability_config.otlp_traces_endpoint is not None

async def do_log_stats(
self,
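
With the guard on --otlp-traces-endpoint removed from arg_utils.py and is_tracing_enabled now reflecting the configured endpoint, tracing on the V1 engine is driven entirely by otlp_traces_endpoint. A usage sketch mirroring the test above (the endpoint value is illustrative; any reachable OTLP collector address works):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m",
          otlp_traces_endpoint="localhost:4317")
outputs = llm.generate(["This is a short prompt"],
                       sampling_params=SamplingParams(max_tokens=32))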
7 changes: 7 additions & 0 deletions vllm/v1/engine/llm_engine.py
@@ -19,6 +19,7 @@
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tracing import init_tracer
from vllm.transformers_utils.tokenizer_group import (
TokenizerGroup, init_tokenizer_from_configs)
from vllm.usage.usage_lib import UsageContext
@@ -65,6 +66,7 @@ def __init__(
"Set VLLM_USE_V1=0 and file and issue on Github.")

self.vllm_config = vllm_config
self.observability_config = vllm_config.observability_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config

@@ -99,6 +101,11 @@ def __init__(
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
self.output_processor = OutputProcessor(self.tokenizer,
log_stats=self.log_stats)
if self.observability_config.otlp_traces_endpoint is not None:
tracer = init_tracer(
"vllm.llm_engine",
self.observability_config.otlp_traces_endpoint)
self.output_processor.tracer = tracer

# EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
self.engine_core = EngineCoreClient.make_client(