
Commit b5cbe8e

rajveerb and ywang96 authored
[Bugfix] Last token measurement fix (#11376)
Signed-off-by: rajveerb <[email protected]>
Co-authored-by: Roger Wang <[email protected]>
1 parent df04dff commit b5cbe8e

File tree: 2 files changed, +20 −12 lines


vllm/engine/llm_engine.py

Lines changed: 6 additions & 2 deletions

@@ -1124,6 +1124,8 @@ def _process_model_outputs(self,
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
@@ -1166,6 +1168,8 @@ def _process_model_outputs(self,
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
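
Both hunks sit in `_process_model_outputs` and make the same change: the decode-token timestamp is now stamped at the moment the group's outputs are handled, guarded so that prefill-only steps are skipped. A minimal, self-contained sketch of that producer-side pattern (the `_FakeSeqGroup` stub and `process_outputs_sketch` helper are hypothetical; only the loop body mirrors the diff):

```python
import time
from dataclasses import dataclass, field


@dataclass
class _FakeSeqGroup:
    """Hypothetical stand-in for vllm.sequence.SequenceGroup, exposing
    only the three methods the hunks above rely on."""
    prefill: bool
    events: list = field(default_factory=list)

    def is_prefill(self) -> bool:
        return self.prefill

    def maybe_set_first_token_time(self, now: float) -> None:
        self.events.append(("first_token_time", now))

    def set_last_token_time(self, now: float) -> None:
        self.events.append(("last_token_time", now))


def process_outputs_sketch(seq_groups) -> None:
    """Condensed view of the pattern added in _process_model_outputs."""
    now = time.time()
    for seq_group in seq_groups:
        seq_group.maybe_set_first_token_time(now)
        # New in this commit: stamp the decode-token time here, during
        # output processing, so later stats reads cannot shift it.
        if not seq_group.is_prefill():
            seq_group.set_last_token_time(now)


decode_group = _FakeSeqGroup(prefill=False)
prefill_group = _FakeSeqGroup(prefill=True)
process_outputs_sketch([decode_group, prefill_group])
assert any(name == "last_token_time" for name, _ in decode_group.events)
assert all(name != "last_token_time" for name, _ in prefill_group.events)
```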
@@ -1686,15 +1690,15 @@ def _get_stats(self,
                 # If the seq_group just finished the prefill state
                 # get TTFT.
                 if not seq_group.is_prefill():
-                    latency = seq_group.get_last_latency(now)
+                    latency = seq_group.get_last_token_latency()
                     time_to_first_tokens_iter.append(latency)
 
                     # One generation token per finished prefill.
                     num_generation_tokens_from_prefill_groups += (
                         seq_group.num_seqs())
                 else:
                     # TPOTs.
-                    latency = seq_group.get_last_latency(now)
+                    latency = seq_group.get_last_token_latency()
                     time_per_output_tokens_iter.append(latency)
                     if seq_group.state.current_step == 0:
                         # For async_output_proc, the do_log_stats()
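
In `_get_stats`, the first branch fires on the iteration where a group has just finished prefill, so the measured gap is its time-to-first-token (TTFT); the `else` branch covers ordinary decode iterations, where the gap counts as time-per-output-token (TPOT). Because `get_last_token_latency()` is now a side-effect-free read, how often stats are collected can no longer shift the measurement. A condensed sketch of that split, assuming a `group_was_prefill` flag analogous to the bookkeeping in the surrounding function (the stub class and helper are hypothetical; the two list names come from the diff):

```python
class _GroupStub:
    """Hypothetical stand-in exposing only what the branch above needs."""

    def __init__(self, prefill: bool, latency: float) -> None:
        self._prefill = prefill
        self._latency = latency

    def is_prefill(self) -> bool:
        return self._prefill

    def get_last_token_latency(self) -> float:
        return self._latency


def collect_latencies(groups_with_flags):
    """Abridged version of the TTFT/TPOT split in _get_stats."""
    time_to_first_tokens_iter: list[float] = []
    time_per_output_tokens_iter: list[float] = []
    for seq_group, group_was_prefill in groups_with_flags:
        if group_was_prefill:
            # Group just finished prefill: its last-token gap is the TTFT.
            if not seq_group.is_prefill():
                time_to_first_tokens_iter.append(
                    seq_group.get_last_token_latency())
        else:
            # Pure decode step: the gap counts toward TPOT.
            time_per_output_tokens_iter.append(
                seq_group.get_last_token_latency())
    return time_to_first_tokens_iter, time_per_output_tokens_iter


ttft, tpot = collect_latencies([(_GroupStub(False, 0.12), True),
                                (_GroupStub(False, 0.02), False)])
assert ttft == [0.12] and tpot == [0.02]
```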

vllm/sequence.py

Lines changed: 14 additions & 10 deletions

@@ -667,6 +667,7 @@ def __init__(
                                       first_scheduled_time=None,
                                       first_token_time=None,
                                       time_in_queue=None)
+        self.last_token_latency = 0.0
         self.lora_request = lora_request
         self.prompt_logprobs: Optional[PromptLogprobs] = None
         self.state = SequenceGroupState()
@@ -762,18 +763,21 @@ def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int,
         assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill
         self.init_multi_step(num_steps=num_lookahead_slots + 1)
 
-    def get_last_latency(self, now: float) -> float:
+    def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
-        # If still in prefill phase, raise Error.
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.get_last_latency() should not be called "
-                "if the seq_group is in prefill phase.")
-
-        # Otherwise return token latency.
-        latency = now - self.metrics.last_token_time
+        # If still in prefill phase, assertion fails.
+        assert not self.is_prefill(), (
+            "seq_group.set_last_token_time() should not be called "
+            "if the seq_group is in prefill phase.")
+        self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
-        return latency
+
+    def get_last_token_latency(self) -> float:
+        """Returns the latency of the last token."""
+        assert not self.is_prefill(), (
+            "seq_group.get_last_token_latency() should not be called "
+            "if the seq_group is in prefill phase.")
+        return self.last_token_latency
 
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""
