@@ -667,6 +667,7 @@ def __init__(
                                       first_scheduled_time=None,
                                       first_token_time=None,
                                       time_in_queue=None)
+        self.last_token_latency = 0.0
         self.lora_request = lora_request
         self.prompt_logprobs: Optional[PromptLogprobs] = None
         self.state = SequenceGroupState()
@@ -762,18 +763,21 @@ def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int,
         assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill
         self.init_multi_step(num_steps=num_lookahead_slots + 1)
 
-    def get_last_latency(self, now: float) -> float:
+    def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
-        # If still in prefill phase, raise Error.
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.get_last_latency() should not be called "
-                "if the seq_group is in prefill phase.")
-
-        # Otherwise return token latency.
-        latency = now - self.metrics.last_token_time
+        # If still in prefill phase, assertion fails.
+        assert not self.is_prefill(), (
+            "seq_group.set_last_token_time() should not be called "
+            "if the seq_group is in prefill phase.")
+        self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
-        return latency
+
+    def get_last_token_latency(self) -> float:
+        """Returns the latency of the last token."""
+        assert not self.is_prefill(), (
+            "seq_group.get_last_token_latency() should not be called "
+            "if the seq_group is in prefill phase.")
+        return self.last_token_latency
 
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""