@@ -2991,7 +2991,7 @@ static void llm_load_tensors(
29912991 } break ;
29922992 case LLM_ARCH_STABLELM:
29932993 {
2994- model.tok_embd = ml.create_tensor (ctx, tn (LLM_TENSOR_TOKEN_EMBD, " weight" ), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2994+ model.tok_embeddings = ml.create_tensor (ctx, tn (LLM_TENSOR_TOKEN_EMBD, " weight" ), {n_embd, n_vocab}, GGML_BACKEND_CPU);
29952995
29962996 // output
29972997 {
@@ -3002,12 +3002,12 @@ static void llm_load_tensors(
30023002 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
30033003 // on Windows however this is detrimental unless everything is on the GPU
30043004#ifndef _WIN32
3005- backend_norm = llama_backend_offload ;
3005+ backend_norm = GGML_BACKEND_GPU ;
30063006#else
3007- backend_norm = n_gpu_layers <= (int ) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload ;
3007+ backend_norm = n_gpu_layers <= (int ) n_layer + 2 ? GGML_BACKEND_CPU : GGML_BACKEND_GPU ;
30083008#endif // _WIN32
30093009
3010- backend_output = llama_backend_offload_split ;
3010+ backend_output = GGML_BACKEND_GPU_SPLIT ;
30113011 } else {
30123012 backend_norm = GGML_BACKEND_CPU;
30133013 backend_output = GGML_BACKEND_CPU;
@@ -3035,8 +3035,8 @@ static void llm_load_tensors(
30353035 /*
30363036 llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
30373037 */
3038- const ggml_backend_type backend = int (i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload ; // NOLINT
3039- const ggml_backend_type backend_split = int (i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split ; // NOLINT
3038+ const ggml_backend_type backend = int (i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU ; // NOLINT
3039+ const ggml_backend_type backend_split = int (i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU_SPLIT ; // NOLINT
30403040
30413041 auto & layer = model.layers [i];
30423042
@@ -3051,15 +3051,15 @@ static void llm_load_tensors(
30513051 layer.ffn_norm = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_NORM, " weight" , i), {n_embd}, backend);
30523052 layer.ffn_norm_b = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_NORM, " bias" , i), {n_embd}, backend);
30533053
3054- layer.ffn_gate = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_GATE, " weight" , i), {n_embd, n_ff}, backend_split);
3055- layer.ffn_down = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_DOWN, " weight" , i), { n_ff, n_embd}, backend_split);
3056- layer.ffn_up = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_UP, " weight" , i), {n_embd, n_ff}, backend_split);
3054+ layer.w1 = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_GATE, " weight" , i), {n_embd, n_ff}, backend_split);
3055+ layer.w2 = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_DOWN, " weight" , i), { n_ff, n_embd}, backend_split);
3056+ layer.w3 = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_UP, " weight" , i), {n_embd, n_ff}, backend_split);
30573057
30583058 if (backend == GGML_BACKEND_GPU) {
30593059 vram_weights +=
30603060 ggml_nbytes (layer.attn_norm ) + ggml_nbytes (layer.wq ) + ggml_nbytes (layer.wk ) +
30613061 ggml_nbytes (layer.wv ) + ggml_nbytes (layer.wo ) + ggml_nbytes (layer.ffn_norm ) +
3062- ggml_nbytes (layer.ffn_gate ) + ggml_nbytes (layer.ffn_down ) + ggml_nbytes (layer.ffn_up );
3062+ ggml_nbytes (layer.w1 ) + ggml_nbytes (layer.w2 ) + ggml_nbytes (layer.w3 );
30633063 }
30643064 }
30653065 } break ;
@@ -5943,7 +5943,7 @@ struct ggml_cgraph * build_stablelm() {
59435943 struct ggml_tensor * cur;
59445944 struct ggml_tensor * inpL;
59455945
5946- inpL = llm_build_inp_embd (ctx0, hparams, batch, model.tok_embd , cb);
5946+ inpL = llm_build_inp_embd (ctx0, hparams, batch, model.tok_embeddings , cb);
59475947 cb (inpL, " inp_embd" , -1 );
59485948
59495949 // inp_pos - contains the positions
@@ -6076,9 +6076,9 @@ struct ggml_cgraph * build_stablelm() {
60766076 cb (cur, " ffn_norm" , il);
60776077
60786078 cur = llm_build_ffn (ctx0, cur,
6079- model.layers [il].ffn_up , NULL ,
6080- model.layers [il].ffn_gate , NULL ,
6081- model.layers [il].ffn_down , NULL ,
6079+ model.layers [il].w3 , NULL ,
6080+ model.layers [il].w1 , NULL ,
6081+ model.layers [il].w2 , NULL ,
60826082 LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
60836083 cb (cur, " ffn_out" , il);
60846084 }