Commit a9efdbb

qwen2vl: fix mrope position
1 parent e8827a6 commit a9efdbb

4 files changed: +14 −9 lines changed


examples/llava/qwen2vl-cli.cpp

Lines changed: 6 additions & 6 deletions
```diff
@@ -68,7 +68,7 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
 
         float * batch_embd = image_embed->embed+i*n_embd;
         auto batch = llama_batch_ext_ptr::init_from_embd(batch_embd, n_eval, n_embd, 0, 0);
-        llama_batch_ext_set_pos(batch.get(), batch_mrope_pos.data(), n_eval);
+        llama_batch_ext_set_pos(batch.get(), batch_mrope_pos.data(), n_eval * 4);
 
         if (llama_decode_ext(ctx_llama, batch.get())) {
             LOG_ERR("%s : failed to eval\n", __func__);
@@ -91,18 +91,18 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
         }
 
         // TODO: add mrope pos ids somewhere else
-        int n_tokens = n_eval;
-        pos.resize(n_tokens * 4);
+        pos.resize(n_eval * 4);
         std::fill(pos.begin(), pos.end(), 0);
-        for (int j = 0; j < n_tokens * 3; j ++) {
-            pos[j] = *st_pos_id + (j % n_tokens);
+        for (int j = 0; j < n_eval * 3; j ++) {
+            pos[j] = *st_pos_id + (j % n_eval);
         }
 
         llama_batch_ext_ptr batch(llama_batch_ext_init(n_eval, 1));
         for (int j = 0; j < n_eval; j++) {
            llama_token token = tokens[i + j];
-           batch.add_text(token, pos[j], 0, false);
+           batch.add_text(token, 0, 0, false); // position is set in the next step
        }
+       llama_batch_ext_set_pos(batch.get(), pos.data(), pos.size());
        llama_batch_ext_set_output_last(batch.get());
 
        if (llama_decode_ext(ctx_llama, batch.get())) {
```
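For context, here is a minimal standalone sketch (not part of the commit) of the M-RoPE position layout that the patched loop above produces for a run of plain text tokens: the buffer holds `n_eval * 4` values, the four position streams are stored back to back, the first three advance linearly, and the fourth stays zero. The concrete values of `n_eval` and `st_pos_id` are illustrative.

```cpp
// Illustration only: mirrors the pos-filling loop from eval_tokens above.
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t; // same underlying type llama.cpp uses for positions

int main() {
    const int n_eval    = 3;  // tokens in this micro-batch (example value)
    const int st_pos_id = 10; // starting position id (example value)

    // 4 position streams per token, stored stream after stream:
    // [t0..t2, h0..h2, w0..w2, e0..e2]; the 4th stream stays 0 for text tokens.
    std::vector<llama_pos> pos(n_eval * 4, 0);
    for (int j = 0; j < n_eval * 3; j++) {
        pos[j] = st_pos_id + (j % n_eval);
    }

    for (int d = 0; d < 4; d++) {
        for (int j = 0; j < n_eval; j++) {
            printf("%d ", pos[d * n_eval + j]);
        }
        printf("\n");
    }
    // prints:
    // 10 11 12
    // 10 11 12
    // 10 11 12
    // 0 0 0
    return 0;
}
```

This is also why the first hunk now passes `n_eval * 4` to `llama_batch_ext_set_pos`: with the old `n_eval` argument, only the first of the four streams was copied into the batch.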

src/llama-batch.cpp

Lines changed: 3 additions & 2 deletions
```diff
@@ -1,4 +1,5 @@
 #include "llama-batch.h"
+#include "llama-graph.h"
 
 #include <cstring>
 #include <algorithm>
@@ -356,7 +357,7 @@ static struct llama_batch_ext * llama_batch_ext_init_impl(int32_t n_tokens_alloc
         batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }
 
-    batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
+    batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc * MAX_POS_PER_TOKEN);
     batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
     batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
     for (int i = 0; i < n_tokens_alloc; ++i) {
@@ -390,7 +391,7 @@ struct llama_batch_ext * llama_batch_ext_init_from_embd(
 }
 
 int32_t llama_batch_ext_set_pos(struct llama_batch_ext * batch, llama_pos * pos, size_t n_pos) {
-    if ((size_t) batch->n_tokens != n_pos) {
+    if ((size_t) batch->n_tokens * MAX_POS_PER_TOKEN < n_pos) {
         return -1;
     }
     memcpy(batch->pos, pos, n_pos * sizeof(llama_pos));
```
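A hedged, self-contained restatement of the relaxed bounds check (same arithmetic as `llama_batch_ext_set_pos` above, minus the llama.cpp types; `check_n_pos` is a hypothetical helper introduced only for this sketch): since the pos buffer is now always allocated for `MAX_POS_PER_TOKEN` positions per token, callers may pass either one position per token or up to four.

```cpp
// Illustration only: reproduces the new size check from llama_batch_ext_set_pos.
#include <cstdio>

constexpr int MAX_POS_PER_TOKEN = 4; // same value as in src/llama-graph.h below

static int check_n_pos(int n_tokens, size_t n_pos) {
    // old check: n_tokens != n_pos       -> rejected multi-position callers
    // new check: reject only if n_pos exceeds what the batch has room for
    if ((size_t) n_tokens * MAX_POS_PER_TOKEN < n_pos) {
        return -1;
    }
    return 0;
}

int main() {
    printf("%d\n", check_n_pos(3, 3));  //  0: one position per token (regular RoPE)
    printf("%d\n", check_n_pos(3, 12)); //  0: four positions per token (M-RoPE)
    printf("%d\n", check_n_pos(3, 16)); // -1: more than n_tokens * MAX_POS_PER_TOKEN
    return 0;
}
```

The check only rejects overruns because the allocation in `llama_batch_ext_init_impl` above now always reserves `MAX_POS_PER_TOKEN` slots per token.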

src/llama-graph.cpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -603,7 +603,9 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
 }
 
 int64_t llm_graph_context::n_pos_per_token() const {
-    return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
+    constexpr int64_t n_pos_per_token_qwen2vl = 4;
+    static_assert(n_pos_per_token_qwen2vl <= MAX_POS_PER_TOKEN);
+    return arch == LLM_ARCH_QWEN2VL ? n_pos_per_token_qwen2vl : 1;
 }
 
 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
```

src/llama-graph.h

Lines changed: 2 additions & 0 deletions
```diff
@@ -10,6 +10,8 @@
 #include <set>
 #include <functional>
 
+#define MAX_POS_PER_TOKEN 4
+
 struct ggml_cgraph;
 struct ggml_context;
 struct ggml_tensor;
```
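Taken together, the new `MAX_POS_PER_TOKEN` constant and the `static_assert` in `n_pos_per_token()` keep allocation and consumption in sync: the graph never asks for more positions per token than any batch has room for. Below is a rough sketch of that invariant as standalone code; the free function `n_pos_per_token(bool)` is a hypothetical stand-in for the real `llm_graph_context` member, and only the names and the value 4 are borrowed from this diff.

```cpp
// Illustration only: the sizing invariant behind MAX_POS_PER_TOKEN.
#include <cassert>
#include <cstdint>

#define MAX_POS_PER_TOKEN 4 // as defined in src/llama-graph.h above

// Stand-in for llm_graph_context::n_pos_per_token(): Qwen2-VL uses 4
// position components (M-RoPE), every other architecture uses 1.
static int64_t n_pos_per_token(bool is_qwen2vl) {
    constexpr int64_t n_pos_per_token_qwen2vl = 4;
    static_assert(n_pos_per_token_qwen2vl <= MAX_POS_PER_TOKEN, "must fit the batch allocation");
    return is_qwen2vl ? n_pos_per_token_qwen2vl : 1;
}

int main() {
    const int64_t n_tokens = 8; // arbitrary batch size

    // text-only architectures: 1 position per token, fits the allocation
    assert(n_tokens * n_pos_per_token(false) <= n_tokens * MAX_POS_PER_TOKEN);
    // Qwen2-VL: 4 positions per token, still within n_tokens * MAX_POS_PER_TOKEN
    assert(n_tokens * n_pos_per_token(true)  <= n_tokens * MAX_POS_PER_TOKEN);
    return 0;
}
```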
