@@ -673,13 +673,21 @@ struct llama_model_loader {

     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
         ggml_set_name(tensor, lt.name.c_str());
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
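The two added blocks bracket tensor creation with ggml_set_no_alloc: while a non-CPU tensor is being created, the context is told not to reserve data for it, and afterwards no_alloc is restored to use_mmap, which is what the loader context normally runs with. A standalone sketch of the toggle's effect, assuming the ggml C API of this era (ggml_init, ggml_set_no_alloc, ggml_new_tensor_1d, ggml_nbytes); this is illustrative, not code from the PR:

    // sketch.c - illustrative only; the context size and tensor size are made up
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16u*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        ggml_set_no_alloc(ctx, true);   // pretend this is an off-loaded tensor
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
        // metadata exists, but no data buffer was reserved inside the context;
        // the loader later points t->data at the mmap'ed file or uploads to the device
        printf("nbytes=%zu data=%p\n", ggml_nbytes(t), t->data);

        ggml_set_no_alloc(ctx, false);  // back to normal for CPU tensors
        ggml_free(ctx);
        return 0;
    }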
@@ -696,6 +704,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -705,32 +714,52 @@ struct llama_model_loader {

         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
         }

         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                lt.data = (uint8_t *) malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-            done_size += lt.size;
-            if (use_mmap && lmlock) {
-                lmlock->grow_to(done_size);
+            switch (lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#ifdef GGML_USE_CUBLAS
+                case GGML_BACKEND_CUDA:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+#ifdef GGML_USE_CLBLAST
+                case GGML_BACKEND_CL:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+            done_size += lt.size;
         }
     }
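load_all_data now walks every tensor in a single pass instead of skipping non-CPU ones: CPU tensors keep their buffer (mmap'ed or read from disk) and the mlock region grows only by lock_size, the bytes that actually stay in host memory; CUDA and OpenCL tensors are handed to the transform function straight from lt.data, and the temporary buffer used on the non-mmap path is freed once the data lives on the device. A stripped-down sketch of that dispatch shape, using hypothetical stand-in types (tensor_t, upload_to_gpu), not the PR's own code:

    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    enum backend_t { BACKEND_CPU, BACKEND_GPU };

    struct tensor_t {               // stand-in for ggml_tensor
        backend_t backend;
        size_t    nbytes;
        void *    data = nullptr;
    };

    // placeholder for ggml_cuda_transform_tensor / ggml_cl_transform_tensor
    static void upload_to_gpu(const void * /*src*/, tensor_t & /*dst*/) { /* device copy */ }

    static void load_all(std::vector<tensor_t> & tensors, uint8_t * mapped, bool use_mmap) {
        size_t off = 0;
        for (tensor_t & t : tensors) {
            uint8_t * src = use_mmap ? mapped + off
                                     : (uint8_t *) malloc(t.nbytes);  // temp buffer path
            // (non-mmap path: read t.nbytes from the file into src here)
            switch (t.backend) {
                case BACKEND_CPU:
                    t.data = src;              // CPU tensor keeps pointing at these bytes
                    break;
                case BACKEND_GPU:
                    upload_to_gpu(src, t);     // bytes now live on the device
                    if (!use_mmap) {
                        free(src);             // temp buffer no longer needed
                    }
                    break;
            }
            off += t.nbytes;
        }
    }

    int main() { return 0; }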
@@ -1069,8 +1098,8 @@ static void llama_model_load_internal(

             if (backend == LLAMA_BACKEND_OFFLOAD) {
                 vram_total +=
-                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
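The corrected sum counts ffn_norm instead of counting attention_norm twice; since both norm weights have the same shape and type (n_embd elements), the total is numerically unchanged, but the estimate now lists the nine tensors that are actually off-loaded per layer. A back-of-the-envelope check of that per-layer figure, assuming LLaMA-7B shapes (n_embd = 4096, n_ff = 11008) and q4_0 weights at roughly 4.5 bits each; the numbers are illustrative, not taken from the PR:

    #include <cstdio>

    int main() {
        const double n_embd = 4096, n_ff = 11008;
        const double bpw    = 4.5 / 8.0;                      // q4_0: ~4.5 bits per weight
        const double norms  = 2 * n_embd * 4;                 // attention_norm + ffn_norm, f32
        const double attn   = 4 * n_embd * n_embd * bpw;      // wq, wk, wv, wo
        const double ffn    = 3 * n_embd * n_ff * bpw;        // w1, w2, w3
        std::printf("per layer: ~%.1f MiB\n", (norms + attn + ffn) / (1024.0 * 1024.0));
        return 0;
    }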
@@ -1117,50 +1146,6 @@ static void llama_model_load_internal(

     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

-#if defined(GGML_USE_CUBLAS)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CL) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#endif
-
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }
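With the per-backend loading loops removed, progress reporting happens entirely inside load_all_data, so a caller's callback sees one monotonically increasing fraction over all tensors followed by the final 1.0f above. A caller-side sketch, under the assumption that the llama.h of this era uses the void (*)(float, void *) callback signature and the progress_callback / progress_callback_user_data fields of llama_context_params; the loop at the bottom only simulates the loader's calls:

    #include <cstdio>

    // prints a percentage only when it changes, to keep the output readable
    static void on_progress(float progress, void * user_data) {
        int * last_pct = static_cast<int *>(user_data);
        const int pct = static_cast<int>(progress * 100.0f);
        if (pct != *last_pct) {
            std::fprintf(stderr, "\rloading model: %3d%%", pct);
            *last_pct = pct;
        }
    }

    int main() {
        int last_pct = -1;
        // in real use these two would be set on llama_context_params before loading:
        //     params.progress_callback           = on_progress;
        //     params.progress_callback_user_data = &last_pct;
        for (int i = 0; i <= 100; ++i) {
            on_progress(i / 100.0f, &last_pct);   // simulate the loader's calls
        }
        std::fprintf(stderr, "\n");
        return 0;
    }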