@@ -707,6 +707,9 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
@@ -716,6 +719,9 @@ struct llama_model_loader {
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -731,6 +737,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -752,20 +759,48 @@ struct llama_model_loader {
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                lt.data = (uint8_t *) malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-            done_size += lt.size;
-            if (use_mmap && lmlock) {
-                lmlock->grow_to(done_size);
+
+            switch (lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+
+            done_size += lt.size;
         }
     }
 
@@ -1141,7 +1176,7 @@ static void llama_model_load_internal(
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1196,58 +1231,14 @@ static void llama_model_load_internal(
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
 #if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
-
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            ggml_backend backend = lt.ggml_tensor->backend;
-            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
     }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#else
-    (void) n_batch;
-    (void) tensor_split;
 #endif
 
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }