@@ -659,6 +659,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
+        ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
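For readers skimming the diff: ggml_set_name attaches a short human-readable label to a tensor (stored in the tensor itself), and ggml_get_name, added alongside it, reads the label back; the name is purely for debugging and graph dumps and has no effect on computation. A minimal round-trip sketch, assuming only ggml.h from this tree (the 16 MB arena size is an arbitrary choice for the demo):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // small scratch arena for the demo; the size is arbitrary
        struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_name(t, "embd");                  // attach the label
        printf("name: %s\n", ggml_get_name(t));    // prints "embd"

        ggml_free(ctx);
        return 0;
    }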
@@ -798,6 +799,8 @@ static bool kv_cache_init(
 
     cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
     cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    ggml_set_name(cache.k, "cache_k");
+    ggml_set_name(cache.v, "cache_v");
 
     return true;
 }
@@ -1084,6 +1087,7 @@ static bool llama_eval_internal(
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1110,6 +1114,8 @@ static bool llama_eval_internal(
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");
 
             // store key and value to memory
             {
@@ -1130,28 +1136,34 @@ static bool llama_eval_internal(
                 ggml_permute(ctx0,
                         Qcur,
                         0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
                         ggml_reshape_3d(ctx0,
                             ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
+            ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
             struct ggml_tensor * V =
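Note the small refactor in this hunk: the scaling constant is hoisted out of the ggml_scale call into its own KQ_scale tensor so that it, too, can carry a name in graph dumps; the math is unchanged. As a sanity check on the formula, with LLaMA-7B's n_embd = 4096 and n_head = 32 the constant is 1/sqrt(4096/32) = 1/sqrt(128) ≈ 0.0884.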
@@ -1160,9 +1172,11 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            ggml_set_name(V, "V");
 
 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
             // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1173,11 +1187,13 @@ static bool llama_eval_internal(
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
 
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
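The payoff of naming every intermediate: once the graph is built, a tensor can be fetched by name instead of by chasing pointers through the layer loop. A hedged sketch of such a lookup, assuming only the n_nodes/nodes fields that struct ggml_cgraph exposes here (find_node is a hypothetical helper; later ggml revisions ship an equivalent ggml_graph_get_tensor):

    #include <cstring>
    #include "ggml.h"

    // scan a built graph for the node carrying a given name; NULL if absent
    static struct ggml_tensor * find_node(struct ggml_cgraph * gf, const char * name) {
        for (int i = 0; i < gf->n_nodes; i++) {
            if (strcmp(ggml_get_name(gf->nodes[i]), name) == 0) {
                return gf->nodes[i];
            }
        }
        return NULL;
    }

Called after ggml_graph_compute(ctx0, &gf) inside llama_eval_internal, find_node(&gf, "KQ_scaled") would hand back the scaled attention scores for inspection; since every layer reuses the same labels, a forward scan returns layer 0's node.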