@@ -55,7 +55,18 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
55
55
if (ubatch->pos && pos) {
56
56
const int64_t n_tokens = ubatch->n_tokens ;
57
57
58
- ggml_backend_tensor_set (pos, ubatch->pos , 0 , n_tokens*n_pos_per_token*ggml_element_size (pos));
58
+ if (ubatch->token && n_pos_per_embd > 1 ) {
59
+ // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
60
+ // the other dimensions are all 0, they are unused for text tokens
61
+ std::vector<llama_pos> pos_data (n_tokens*n_pos_per_embd, 0 );
62
+ // copy the first dimension
63
+ for (int i = 0 ; i < n_tokens; ++i) {
64
+ pos_data[i] = ubatch->pos [i];
65
+ }
66
+ ggml_backend_tensor_set (pos, pos_data.data (), 0 , pos_data.size ()*ggml_element_size (pos));
67
+ } else {
68
+ ggml_backend_tensor_set (pos, ubatch->pos , 0 , n_tokens*n_pos_per_embd*ggml_element_size (pos));
69
+ }
59
70
}
60
71
}
61
72
@@ -71,7 +82,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
71
82
) * f_attn_temp_scale + 1.0 ;
72
83
}
73
84
74
- ggml_backend_tensor_set (attn_scale, attn_scale_data.data (), 0 , n_tokens*n_pos_per_token* ggml_element_size (attn_scale));
85
+ ggml_backend_tensor_set (attn_scale, attn_scale_data.data (), 0 , n_tokens*ggml_element_size (attn_scale));
75
86
}
76
87
}
77
88
@@ -592,7 +603,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
592
603
res (std::make_unique<llm_graph_result>()) {
593
604
}
594
605
595
// Returns how many position values are stored per input token/embedding:
// 4 when arch is LLM_ARCH_QWEN2VL, 1 for every other architecture.
// Callers multiply n_tokens by this value when sizing the position input tensor.
// This hunk only renames the method (n_pos_per_token -> n_pos_per_embd); the body is unchanged.
- int64_t llm_graph_context::n_pos_per_token () const {
606
+ int64_t llm_graph_context::n_pos_per_embd () const {
596
607
return arch == LLM_ARCH_QWEN2VL ? 4 : 1 ;
597
608
}
598
609
@@ -1018,11 +1029,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
1018
1029
}
1019
1030
1020
1031
ggml_tensor * llm_graph_context::build_inp_pos () const {
1021
- auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token ());
1032
+ auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd ());
1022
1033
1023
1034
auto & cur = inp->pos ;
1024
1035
1025
- cur = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token ());
1036
+ cur = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd ());
1026
1037
ggml_set_input (cur);
1027
1038
1028
1039
res->add_input (std::move (inp));
@@ -1031,11 +1042,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
1031
1042
}
1032
1043
1033
1044
ggml_tensor * llm_graph_context::build_inp_attn_scale () const {
1034
- auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token (), hparams.n_attn_temp_floor_scale , hparams.f_attn_temp_scale );
1045
+ auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale , hparams.f_attn_temp_scale );
1035
1046
1036
1047
auto & cur = inp->attn_scale ;
1037
1048
1038
- cur = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 1 , 1 , n_tokens*n_pos_per_token ());
1049
+ // this needs to be 1x1xN for broadcasting
1050
+ cur = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 1 , 1 , n_tokens);
1039
1051
ggml_set_input (cur);
1040
1052
1041
1053
res->add_input (std::move (inp));
0 commit comments