@@ -171,7 +171,8 @@ def __init__(
171171
172172 # OPTIMIZATION: Cache the tensors rather than creating them every step.
173173 self .arange_np = np .arange (max (self .max_num_reqs + 1 ,
174- self .max_model_len ),
174+ self .max_model_len ,
175+ self .max_num_tokens ),
175176 dtype = np .int32 )
176177 # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
177178 # a faster version of creating a new tensor every time. Thus, we should
@@ -358,8 +359,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
358359
359360 # Get batched arange.
360361 # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
361- arange = np .concatenate (
362- [self .arange_np [:n ] for n in num_scheduled_tokens ])
362+ # Equivalent to but faster than:
363+ # np.concatenate([np.arange(n) for n in num_scheduled_tokens])
364+ # Step 1. [2, 5, 3] -> [2, 7, 10]
365+ cu_num_tokens = np .cumsum (num_scheduled_tokens )
366+ # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
367+ cumsums_offsets = np .repeat (cu_num_tokens - num_scheduled_tokens ,
368+ num_scheduled_tokens )
369+ # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
370+ arange = self .arange_np [:total_num_scheduled_tokens ] - cumsums_offsets
363371
364372 # Get positions.
365373 positions_np = self .positions_np [:total_num_scheduled_tokens ]
@@ -406,8 +414,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
406414
407415 # Prepare the attention metadata.
408416 self .query_start_loc_np [0 ] = 0
409- np .cumsum (num_scheduled_tokens ,
410- out = self .query_start_loc_np [1 :num_reqs + 1 ])
417+ self .query_start_loc_np [1 :num_reqs + 1 ] = cu_num_tokens
411418
412419 self .seq_lens_np [:num_reqs ] = (
413420 self .input_batch .num_computed_tokens_cpu [:num_reqs ] +
0 commit comments