@@ -2333,9 +2333,11 @@ static struct ggml_cgraph * llm_build_llama(
2333
2333
inpL = cur;
2334
2334
}
2335
2335
2336
+ cur = inpL;
2337
+
2336
2338
// norm
2337
2339
{
2338
- cur = ggml_rms_norm (ctx0, inpL , norm_rms_eps);
2340
+ cur = ggml_rms_norm (ctx0, cur , norm_rms_eps);
2339
2341
offload_func_nr (cur);
2340
2342
ggml_set_name (cur, " rms_norm_2" );
2341
2343
@@ -2436,7 +2438,6 @@ static struct ggml_cgraph * llm_build_falcon(
2436
2438
ggml_set_name (KQ_scale, " 1/sqrt(n_embd_head)" );
2437
2439
2438
2440
for (int il = 0 ; il < n_layer; ++il) {
2439
- struct ggml_tensor * cur;
2440
2441
struct ggml_tensor * attn_norm;
2441
2442
2442
2443
// self-attention
@@ -2561,6 +2562,12 @@ static struct ggml_cgraph * llm_build_falcon(
2561
2562
struct ggml_tensor * inpFF = attn_norm;
2562
2563
2563
2564
cur = ggml_mul_mat (ctx0, model.layers [il].w3 , inpFF);
2565
+
2566
+ // TODO: this is temporarily needed to introduce an artificial dependency between FF and ATTN
2567
+ // adding this, because there seems to be a bug in the Metal concurrency optimization
2568
+ // without this line, the results are non-deterministic and wrong
2569
+ cur->src [2 ] = attn_out;
2570
+
2564
2571
cur = ggml_gelu (ctx0, cur);
2565
2572
cur = ggml_mul_mat (ctx0, model.layers [il].w2 , cur);
2566
2573
}
@@ -2572,9 +2579,11 @@ static struct ggml_cgraph * llm_build_falcon(
2572
2579
inpL = cur;
2573
2580
}
2574
2581
2582
+ cur = inpL;
2583
+
2575
2584
// norm
2576
2585
{
2577
- cur = ggml_norm (ctx0, inpL , norm_eps);
2586
+ cur = ggml_norm (ctx0, cur , norm_eps);
2578
2587
2579
2588
cur = ggml_add (ctx0,
2580
2589
ggml_mul (ctx0, cur, model.output_norm ),
0 commit comments