
Commit 9cd16a3

rm n_pos_per_embd from llm_graph_input_attn_temp
1 parent: bd310ff

2 files changed: 5 additions, 7 deletions


src/llama-graph.cpp

Lines changed: 3 additions & 3 deletions
@@ -82,7 +82,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
             ) * f_attn_temp_scale + 1.0;
         }
 
-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_embd*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
 
@@ -1042,12 +1042,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_embd(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
 
     auto & cur = inp->attn_scale;
 
     // this need to be 1x1xN for broadcasting
-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_embd());
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
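For context: the diff only changes how much data is uploaded, not the per-token value itself. Below is a minimal, standalone sketch of the computation that set_input performs, assuming the llama4-style temperature-tuning formula used in llama-graph.cpp (only its tail is visible in the context lines above). The helper name make_attn_scale and the use of std::vector for the positions are illustrative, not part of the commit; the point is that one scale is produced per token, so the buffer and the 1x1xn_tokens input tensor now line up exactly, with no n_pos_per_embd factor.

#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring llm_graph_input_attn_temp::set_input:
// one attention-temperature scale per token position.
static std::vector<float> make_attn_scale(const std::vector<int64_t> & pos,
                                          uint32_t n_attn_temp_floor_scale,
                                          float    f_attn_temp_scale) {
    std::vector<float> attn_scale(pos.size(), 0.0f);
    for (size_t i = 0; i < pos.size(); ++i) {
        // assumed llama4 formula: the scale grows with the log of the floored position bucket
        attn_scale[i] = std::log(
            std::floor((pos[i] + 1.0f) / n_attn_temp_floor_scale) + 1.0
        ) * f_attn_temp_scale + 1.0;
    }
    // exactly n_tokens floats: this is what ggml_backend_tensor_set uploads after the change,
    // i.e. n_tokens*ggml_element_size(attn_scale) bytes for an F32 tensor
    return attn_scale;
}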

src/llama-graph.h

Lines changed: 2 additions & 4 deletions
@@ -103,16 +103,14 @@ class llm_graph_input_pos : public llm_graph_input_i {
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_embd, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_embd(n_pos_per_embd), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
 
-    const int64_t n_pos_per_embd = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float f_attn_temp_scale;
 };
