Commit 347eeeb

[Misc] Remove experimental dep from tracing.py (#12007)
Signed-off-by: Adrian Cole <[email protected]>
1 parent 18fd4a8 commit 347eeeb

3 files changed: +66 -60 lines changed

tests/tracing/test_tracing.py

Lines changed: 30 additions & 30 deletions
@@ -100,32 +100,32 @@ def test_traces(trace_service):
 
     attributes = decode_attributes(
         request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
         outputs[0].prompt_token_ids)
     completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
     assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
     metrics = outputs[0].metrics
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
     ttft = metrics.first_token_time - metrics.arrival_time
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
     e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
     assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
     # Model forward and model execute should be none, since detailed traces is
     # not enabled.
     assert metrics.model_forward_time is None
@@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service):
 
     attributes = decode_attributes(
         request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
         outputs[0].prompt_token_ids)
     completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
     assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
     metrics = outputs[0].metrics
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
     ttft = metrics.first_token_time - metrics.arrival_time
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
     e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
     assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
     assert metrics.model_forward_time > 0
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
             metrics.model_forward_time / 1000)
     assert metrics.model_execute_time > 0
-    assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
                           ) == metrics.model_execute_time
     assert metrics.model_forward_time < 1000 * metrics.model_execute_time

vllm/engine/llm_engine.py

Lines changed: 15 additions & 17 deletions
@@ -1857,46 +1857,44 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None:
             metrics = seq_group.metrics
             ttft = metrics.first_token_time - metrics.arrival_time
             e2e_time = metrics.finished_time - metrics.arrival_time
-            # attribute names are based on
-            # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
-            seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
                                    self.model_config.model)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
                                    seq_group.request_id)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
                                    seq_group.sampling_params.temperature)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
                                    seq_group.sampling_params.top_p)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
                                    seq_group.sampling_params.max_tokens)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
                                    seq_group.sampling_params.n)
-            seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES,
                                    seq_group.num_seqs())
-            seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
                                    len(seq_group.prompt_token_ids))
             seq_span.set_attribute(
-                SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
+                SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
                 sum([
                     seq.get_output_len()
                     for seq in seq_group.get_finished_seqs()
                 ]))
-            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
                                    metrics.time_in_queue)
             seq_span.set_attribute(
-                SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
-            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time)
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
+            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
             if metrics.scheduler_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER,
                     metrics.scheduler_time)
             if metrics.model_forward_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD,
                     metrics.model_forward_time / 1000.0)
             if metrics.model_execute_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
                     metrics.model_execute_time)
 
     def _validate_model_inputs(self, inputs: ProcessorInputs,

vllm/tracing.py

Lines changed: 21 additions & 13 deletions
@@ -16,7 +16,6 @@
         OTEL_EXPORTER_OTLP_TRACES_PROTOCOL)
     from opentelemetry.sdk.trace import TracerProvider
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
-    from opentelemetry.semconv_ai import SpanAttributes as BaseSpanAttributes
     from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider
     from opentelemetry.trace.propagation.tracecontext import (
         TraceContextTextMapPropagator)
@@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
     return {h: headers[h] for h in TRACE_HEADERS if h in headers}
 
 
-class SpanAttributes(BaseSpanAttributes):
-    # The following span attribute names are added here because they are missing
-    # from the Semantic Conventions for LLM.
-    LLM_REQUEST_ID = "gen_ai.request.id"
-    LLM_REQUEST_N = "gen_ai.request.n"
-    LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
-    LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
-    LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
-    LLM_LATENCY_E2E = "gen_ai.latency.e2e"
-    LLM_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
+class SpanAttributes:
+    # Attribute names copied from here to avoid version conflicts:
+    # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md
+    GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
+    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
+    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
+    GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"
+    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
+    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
+    # Attribute names added until they are added to the semantic conventions:
+    GEN_AI_REQUEST_ID = "gen_ai.request.id"
+    GEN_AI_REQUEST_N = "gen_ai.request.n"
+    GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
+    GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
+    GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
+    GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
+    GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
     # Time taken in the forward pass for this across all workers
-    LLM_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward"
+    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = (
+        "gen_ai.latency.time_in_model_forward")
     # Time taken in the model execute function. This will include model
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
-    LLM_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute"
+    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
+        "gen_ai.latency.time_in_model_execute")
 
 
 def contains_trace_headers(headers: Mapping[str, str]) -> bool:

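For reference, a minimal usage sketch, not part of the commit itself: after this change SpanAttributes is a plain holder of string constants, so a span can be annotated using only the stable opentelemetry-api package, with no dependency on opentelemetry.semconv_ai. The span name and attribute values below are illustrative assumptions, not values taken from the diff.

    from opentelemetry import trace

    from vllm.tracing import SpanAttributes

    tracer = trace.get_tracer(__name__)

    # Assumed span name and example values, chosen only for illustration.
    with tracer.start_as_current_span("llm_request") as span:
        # The constants are plain strings, so no semconv package is required.
        span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, "facebook/opt-125m")
        span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, 7)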