@@ -1837,6 +1837,12 @@ struct llama_model_loader {
             throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
         }

+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            if (ne.size() == 1) {
+                throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
+            }
+        }
+
         {
             bool is_ok = true;
             for (size_t i = 0; i < ne.size(); ++i) {
@@ -2817,8 +2823,8 @@ static void llm_load_tensors(
2817
2823
layer.ffn_down = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_DOWN, " weight" , i), {n_ff, n_embd}, backend_split);
2818
2824
layer.ffn_down_b = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_DOWN, " bias" , i), {n_embd}, backend);
2819
2825
2820
- layer.ffn_up = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_UP, " weight" , i), {n_embd, n_ff}, backend_split);
2821
- layer.ffn_up_b = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_UP, " bias" , i), {n_ff}, backend);
2826
+ layer.ffn_up = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_UP, " weight" , i), {n_embd, n_ff}, backend_split);
2827
+ layer.ffn_up_b = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_UP, " bias" , i), {n_ff}, backend);
2822
2828
2823
2829
if (backend == GGML_BACKEND_GPU) {
2824
2830
vram_weights +=
@@ -2877,13 +2883,13 @@ static void llm_load_tensors(
                     layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
                     layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);
                     layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, backend);
                     layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
-                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, backend);
                     layer.ffn_down   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-                    layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, backend_split);
+                    layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, backend);
                     layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-                    layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, backend_split);
+                    layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, backend);
                     layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                     layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
                     layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
@@ -2949,19 +2955,19 @@ static void llm_load_tensors(
                     layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);

                     layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, backend);

                     layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
-                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, backend);

                     layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                     layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);

                     layer.ffn_down   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-                    layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, backend_split);
+                    layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, backend);

-                    layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-                    layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, backend_split);
+                    layer.ffn_up     = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.ffn_up_b   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, backend);

                     if (backend == GGML_BACKEND_GPU) {
                         vram_weights +=