@@ -247,6 +247,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
247
247
}
248
248
}
249
249
250
+ // get extra buffer types of the CPU
251
+ // TODO: a more general solution for non-CPU extra buft should be implemented in the future
252
+ // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
253
+ std::vector<ggml_backend_buffer_type_t > buft_extra;
254
+ {
255
+ auto * cpu_dev = ggml_backend_dev_by_type (GGML_BACKEND_DEVICE_TYPE_CPU);
256
+ auto * cpu_reg = ggml_backend_dev_backend_reg (cpu_dev);
257
+
258
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t )
259
+ ggml_backend_reg_get_proc_address (cpu_reg, " ggml_backend_dev_get_extra_bufts" );
260
+
261
+ if (ggml_backend_dev_get_extra_bufts_fn) {
262
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn (cpu_dev);
263
+ while (extra_bufts && *extra_bufts) {
264
+ buft_extra.emplace_back (*extra_bufts);
265
+ ++extra_bufts;
266
+ }
267
+ }
268
+ }
269
+
250
270
// add tensors
251
271
for (auto & it : ab_map) {
252
272
const std::string & name = it.first ;
@@ -263,7 +283,23 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
263
283
throw std::runtime_error (" LoRA tensor '" + name + " ' does not exist in base model (hint: maybe wrong base model?)" );
264
284
}
265
285
266
- ggml_context * dev_ctx = ctx_for_buft (ggml_backend_buffer_get_type (model_tensor->buffer ));
286
+ auto * buft = ggml_backend_buffer_get_type (model_tensor->buffer );
287
+
288
+ // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
289
+ for (auto & ex : buft_extra) {
290
+ if (ex == buft) {
291
+ LLAMA_LOG_WARN (" %s: lora for '%s' cannot use buft '%s', fallback to CPU\n " , __func__, model_tensor->name , ggml_backend_buft_name (buft));
292
+
293
+ auto * cpu_dev = ggml_backend_dev_by_type (GGML_BACKEND_DEVICE_TYPE_CPU);
294
+ buft = ggml_backend_dev_buffer_type (cpu_dev);
295
+
296
+ break ;
297
+ }
298
+ }
299
+
300
+ LLAMA_LOG_DEBUG (" %s: lora for '%s' -> '%s'\n " , __func__, model_tensor->name , ggml_backend_buft_name (buft));
301
+
302
+ ggml_context * dev_ctx = ctx_for_buft (buft);
267
303
// validate tensor shape
268
304
if (is_token_embd) {
269
305
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
0 commit comments