[V1] feat:add engine v1 tracing #20372
Changes from 17 commits
```diff
@@ -11,6 +11,8 @@
 from vllm.outputs import (CompletionOutput, PoolingOutput,
                           PoolingRequestOutput, RequestOutput)
 from vllm.sampling_params import RequestOutputKind
+from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
+                          extract_trace_context)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
```
```diff
@@ -71,7 +73,6 @@ def get_nowait(
 @dataclass
 class OutputProcessorOutput:

     request_outputs: list[Union[RequestOutput, PoolingRequestOutput]]
     reqs_to_abort: list[str]
```
```diff
@@ -274,16 +275,13 @@ def _new_pooling_output(
 class OutputProcessor:
     """Process EngineCoreOutputs into RequestOutputs."""

-    def __init__(
-        self,
-        tokenizer: TokenizerGroup,
-        log_stats: bool,
-    ):
+    def __init__(self, tokenizer: TokenizerGroup, log_stats: bool):
         self.log_stats = log_stats
         self.tokenizer = tokenizer
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
         self.lora_states = LoRARequestStates()
+        self.tracer: Optional[Tracer] = None

     def get_num_unfinished_requests(self):
         return len(self.request_states)
```
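The diff initializes `self.tracer` to `None`; whatever constructs the `OutputProcessor` is expected to assign a real tracer when tracing is enabled. A minimal sketch of that wiring, assuming `vllm.tracing.init_tracer` keeps the (module name, OTLP endpoint) signature it has in the v0 engine; the endpoint value is only an example:

```python
# Hedged sketch, not code from this PR: attach a tracer to the
# OutputProcessor once the engine knows tracing is enabled.
from vllm.tracing import init_tracer


def maybe_enable_tracing(output_processor, otlp_traces_endpoint):
    if otlp_traces_endpoint is not None:
        output_processor.tracer = init_tracer(
            "vllm.llm_engine",     # instrumenting module name (example)
            otlp_traces_endpoint,  # e.g. "http://localhost:4317"
        )
```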
```diff
@@ -441,14 +439,72 @@ def process_outputs(
             # Track per-request stats
             self._update_stats_from_finished(req_state, finish_reason,
                                              iteration_stats)

+            if self.tracer:
+                self.do_tracing(engine_core_output, req_state,
+                                iteration_stats)
         self.lora_states.update_iteration_stats(iteration_stats)

         return OutputProcessorOutput(
             request_outputs=request_outputs,
             reqs_to_abort=reqs_to_abort,
         )

+    def do_tracing(self, engine_core_output: EngineCoreOutput,
+                   req_state: RequestState,
+                   iteration_stats: Optional[IterationStats]) -> None:
+        assert req_state.stats is not None
+        assert iteration_stats is not None
+        assert self.tracer is not None
+
+        arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
+        trace_context = extract_trace_context(engine_core_output.trace_headers)
+        with (self.tracer.start_as_current_span(
+                "llm_request",
+                kind=SpanKind.SERVER,
+                context=trace_context,
+                start_time=arrival_time_nano_seconds) as span):
+            metrics = req_state.stats
+            e2e_time = iteration_stats.iteration_timestamp - \
+                metrics.arrival_time
+            queued_time = metrics.scheduled_ts - metrics.queued_ts
+            prefill_time = metrics.first_token_ts - metrics.scheduled_ts
+            decode_time = metrics.last_token_ts - metrics.first_token_ts
+            inference_time = metrics.last_token_ts - metrics.scheduled_ts
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
+                metrics.first_token_latency)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                               queued_time)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
+                               len(req_state.prompt_token_ids))
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+                               metrics.num_generation_tokens)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
+                prefill_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
+                decode_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
+                inference_time)
+
+            # meta
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
+                               req_state.request_id)
+            if req_state.parent_req and req_state.parent_req.sampling_params:
```
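For readers unfamiliar with the per-request timestamps, the arithmetic above splits the request lifetime into queue, prefill, decode, and end-to-end intervals. A standalone sketch with made-up timestamps (plain Python, no vLLM types):

```python
# Made-up timestamps (seconds) illustrating how do_tracing derives the
# span's latency attributes from req_state.stats.
arrival_time = 100.000    # request received
queued_ts = 100.001       # added to the scheduler queue
scheduled_ts = 100.050    # first scheduled onto the engine
first_token_ts = 100.250  # first output token produced
last_token_ts = 101.000   # final output token produced
iteration_timestamp = 101.002  # iteration in which the request finished

queued_time = scheduled_ts - queued_ts           # 0.049 s waiting in queue
prefill_time = first_token_ts - scheduled_ts     # 0.200 s prefill
decode_time = last_token_ts - first_token_ts     # 0.750 s decode
inference_time = last_token_ts - scheduled_ts    # 0.950 s prefill + decode
e2e_time = iteration_timestamp - arrival_time    # 1.002 s end to end

# The span itself starts at the arrival time, expressed in nanoseconds.
arrival_time_nano_seconds = int(arrival_time * 1e9)
```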
Review comment on the line `if is_pooling or params.n == 1:`
Is it possible to add these attributes regardless of model type and params.n?
Yeah, good point. I added fields in RequestState to hold these sampling_params from EngineCoreRequest. Or please let me know if there's a better way to do it :)
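A rough sketch of what that reply describes, with hypothetical field and attribute names (the PR's actual RequestState fields may differ): keep the request's sampling parameters on the per-request state so do_tracing can emit them for every request, not only when a parent request carries sampling_params.

```python
# Hypothetical sketch, not the PR's exact code: copy selected sampling
# parameters from the EngineCoreRequest onto the per-request state so the
# tracing span can always record them.
from dataclasses import dataclass
from typing import Optional


@dataclass
class SamplingParamsForTracing:
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_tokens: Optional[int] = None
    n: Optional[int] = None


def set_sampling_attributes(span, params: SamplingParamsForTracing) -> None:
    # The real code would likely use vllm.tracing.SpanAttributes constants;
    # the string keys below are illustrative only.
    if params.temperature is not None:
        span.set_attribute("gen_ai.request.temperature", params.temperature)
    if params.top_p is not None:
        span.set_attribute("gen_ai.request.top_p", params.top_p)
    if params.max_tokens is not None:
        span.set_attribute("gen_ai.request.max_tokens", params.max_tokens)
    if params.n is not None:
        span.set_attribute("gen_ai.request.n", params.n)
```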