Skip to content

Commit ea8b42e

Browse files
DongheJin and yq33victor
authored and committed
bugfix: fix coredump issue when both prefixcache and mtp are enabled. (#377)
* bugfix: fix coredump issue when both prefixcache and mtp are enabled.
* bugfix: fix coredump caused by incorrect token replacement.
1 parent d7ec230 commit ea8b42e

File tree

7 files changed

+27
-10
lines changed

7 files changed

+27
-10
lines changed

xllm/core/framework/model/model_args.h

File mode changed from 100755 to 100644.

xllm/core/framework/request/sequence_kv_state.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ void KVCacheState::add_shared_kv_blocks(std::vector<Block>&& blocks,
5858
if (blocks.empty()) {
5959
return;
6060
}
61-
6261
// The number of matched blocks may be fewer than the number of blocks held by
6362
// the sequence itself. In this case, try to replace the blocks computed by
6463
// the sequence with blocks from the prefix_cache and release the computed
@@ -86,6 +85,10 @@ void KVCacheState::add_shared_kv_blocks(std::vector<Block>&& blocks,
8685
CHECK_GT(block_size, 0);
8786
num_shared_tokens =
8887
((current_total_num_tokens - 1) / block_size) * block_size;
88+
if (num_owned_shared_blocks_ > 0) {
89+
num_owned_shared_blocks_--;
90+
blocks_.pop_back();
91+
}
8992
}
9093
CHECK_LT(num_shared_tokens, current_total_num_tokens);
9194
// update the kv cache position

xllm/core/runtime/llm_worker_impl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ std::optional<ForwardOutput> LLMWorkerImpl::step(const ForwardInput& input) {
174174
// should be in same prefill stage, so, to judge empty_kv_cache,
175175
// just use micro batch 0 here
176176
if (options_.enable_speculative_decode() && !is_spec_draft_) {
177-
if (input.input_params.q_seq_lens_vec[0] > 1) {
177+
if (check_is_prefill(input.input_params.q_seq_lens_vec)) {
178178
output.sample_output.embeddings = hidden_states;
179179
} else if (sampling_params.sample_idxes.defined()) {
180180
// auto sample_idxes =

xllm/core/runtime/speculative_worker_impl.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step(
171171
}
172172

173173
// TODO: support data parallel case
174-
if (input.input_params.q_seq_lens_vec[0] > 1) {
174+
if (check_is_prefill(input.input_params.q_seq_lens_vec)) {
175175
return step_prefill(input);
176176
} else {
177177
return step_decode(input);
@@ -180,7 +180,7 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step(
180180

181181
std::optional<ForwardOutput> SpeculativeWorkerImpl::step_empty(
182182
const ForwardInput& input) {
183-
if (input.input_params.q_seq_lens_vec[0] > 1) {
183+
if (check_is_prefill(input.input_params.q_seq_lens_vec)) {
184184
auto output = impl_->step(input);
185185
auto draft_output = draft_impl_->step(input);
186186
return output;
@@ -224,9 +224,10 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_prefill(
224224
auto offset = input.input_params.num_sequences;
225225
auto token_offset = prefill_input.token_ids.size(0);
226226
if (token_offset > 0) {
227-
prefill_input.input_params.mm_data = MMData(
228-
MMType::EMBEDDING,
229-
{{"embedding", embeddings.narrow(0, token_start_idx, token_offset)}});
227+
prefill_input.input_params.mm_data =
228+
MMData(MMType::EMBEDDING,
229+
{{"embedding",
230+
embeddings.narrow(0, token_start_idx, token_offset).clone()}});
230231
}
231232
if (next_tokens.defined()) {
232233
auto& token_ids = prefill_input.token_ids;
@@ -329,7 +330,11 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_decode(
329330
// final step
330331
prepare_validate_inputs(input, validate_input, true);
331332
} else {
332-
prepare_draft_inputs(draft_input, next_step_input, 1, device_);
333+
if (i == 0) {
334+
prepare_draft_inputs(input, next_step_input, 1, device_);
335+
} else {
336+
prepare_draft_inputs(draft_input, next_step_input, 1, device_);
337+
}
333338
}
334339
draft_outputs.push_back(std::move(future).get().value());
335340
// update input of next step
@@ -759,7 +764,7 @@ void SpeculativeWorkerImpl::update_sampling_params(
759764
void SpeculativeWorkerImpl::prepare_work_before_execute(
760765
const ForwardInput& input,
761766
ForwardInput& processed_input) {
762-
if (input.input_params.q_seq_lens_vec[0] > 1) {
767+
if (check_is_prefill(input.input_params.q_seq_lens_vec)) {
763768
WorkerImpl::prepare_work_before_execute(input, processed_input);
764769
} else {
765770
if (enable_schedule_overlap()) {

xllm/core/runtime/worker_impl.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,5 +1039,13 @@ AlignedTensorCreater::AlignedTensorCreater(
10391039
LOG(INFO) << "Page aligned: "
10401040
<< ((uintptr_t)base_ptr_ % page_size == 0 ? "YES" : "NO");
10411041
}
1042+
bool WorkerImpl::check_is_prefill(const std::vector<int>& q_seq_lens_vec) {
1043+
for (auto q_len : q_seq_lens_vec) {
1044+
if (q_len > 1) {
1045+
return true;
1046+
}
1047+
}
1048+
return false;
1049+
}
10421050

10431051
} // namespace xllm

xllm/core/runtime/worker_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,8 @@ class WorkerImpl {
165165

166166
torch::ScalarType dtype() const { return dtype_; }
167167

168+
bool check_is_prefill(const std::vector<int>& q_seq_lens_vec);
169+
168170
int32_t hidden_size() const {
169171
return context_.get_model_args().hidden_size();
170172
}

xllm/core/scheduler/continuous_scheduler.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ ContinuousScheduler::ContinuousScheduler(Engine* engine, const Options& options)
9393
} else {
9494
min_speculative_tokens_required_ = options_.num_speculative_tokens();
9595
}
96-
9796
}
9897

9998
ContinuousScheduler::~ContinuousScheduler() { running_requests_.clear(); }

0 commit comments

Comments
 (0)