
Commit f28bc4c

llama : make loras compatible with repacking (#12593)
* llama : make loras compatible with repacking
  ggml-ci
* cont : simplify
  ggml-ci
* cont : add TODO
  [no ci]
1 parent f17a3bb commit f28bc4c

File tree

1 file changed: +37, -1 lines


src/llama-adapter.cpp

Lines changed: 37 additions & 1 deletion
@@ -247,6 +247,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }
     }
 
+    // get extra buffer types of the CPU
+    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
+    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+    std::vector<ggml_backend_buffer_type_t> buft_extra;
+    {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_extra.emplace_back(*extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    }
+
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
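
The hunk above collects the CPU backend's "extra" buffer types, i.e. the buffer types used for weight repacking, by asking the backend registry for the "ggml_backend_dev_get_extra_bufts" proc address rather than going through a dedicated public API. A minimal standalone sketch of the same query pattern, assuming only ggml-backend.h (the main() wrapper and the printing are illustrative and not part of the patch):

#include "ggml-backend.h"

#include <cstdio>
#include <vector>

int main() {
    // locate the CPU device and its backend registry
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

    // the CPU backend exposes its extra buffer types through a named proc address
    auto get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

    std::vector<ggml_backend_buffer_type_t> buft_extra;
    if (get_extra_bufts_fn) {
        // the function returns a NULL-terminated array of buffer types
        for (ggml_backend_buffer_type_t * bufts = get_extra_bufts_fn(cpu_dev); bufts && *bufts; ++bufts) {
            buft_extra.push_back(*bufts);
        }
    }

    for (ggml_backend_buffer_type_t buft : buft_extra) {
        printf("extra buft: %s\n", ggml_backend_buft_name(buft));
    }

    return 0;
}

On builds without repacking support the lookup may return NULL or an empty list, which is why the patch guards both cases.
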
@@ -263,7 +283,23 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
-        ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
+
+        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        for (auto & ex : buft_extra) {
+            if (ex == buft) {
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+
+                break;
+            }
+        }
+
+        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+        ggml_context * dev_ctx = ctx_for_buft(buft);
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()