Skip to content

Commit 4452ab1

Browse files
authored
Merge branch 'ggerganov:master' into master
2 parents 2011dda + 42cadc7 commit 4452ab1

File tree

20 files changed

+729
-571
lines changed

20 files changed

+729
-571
lines changed

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ BUILD_TARGETS = \
3434
llama-save-load-state \
3535
llama-server \
3636
llama-simple \
37+
llama-simple-chat \
3738
llama-speculative \
3839
llama-tokenize \
3940
llama-vdot \
@@ -1287,6 +1288,11 @@ llama-simple: examples/simple/simple.cpp \
12871288
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
12881289
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
12891290

1291+
llama-simple-chat: examples/simple-chat/simple-chat.cpp \
1292+
$(OBJ_ALL)
1293+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1294+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1295+
12901296
llama-tokenize: examples/tokenize/tokenize.cpp \
12911297
$(OBJ_ALL)
12921298
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
1717

1818
## Hot topics
1919

20-
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
20+
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
21+
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
2122
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
2223

2324
----

ci/run.sh

Lines changed: 91 additions & 91 deletions
Large diffs are not rendered by default.

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ struct common_sampler_params {
155155

156156
struct common_params {
157157
int32_t n_predict = -1; // new tokens to predict
158-
int32_t n_ctx = 0; // context size
158+
int32_t n_ctx = 4096; // context size
159159
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
160160
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
161161
int32_t n_keep = 0; // number of tokens to keep from initial prompt

convert_hf_to_gguf.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ class Model:
7272
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
7373
use_temp_file: bool = False, eager: bool = False,
7474
metadata_override: Path | None = None, model_name: str | None = None,
75-
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
75+
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
76+
small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
7677
if type(self) is Model:
7778
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
7879

@@ -87,7 +88,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
8788
self.is_safetensors = len(self.part_names) > 0
8889
if not self.is_safetensors:
8990
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
90-
self.hparams = Model.load_hparams(self.dir_model)
91+
self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
9192
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
9293
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
9394
self.tensor_names = None
@@ -1541,6 +1542,17 @@ def set_vocab(self):
15411542
special_vocab._set_special_token("eot", 32010)
15421543
special_vocab.add_to_gguf(self.gguf_writer)
15431544

1545+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
1546+
if tokenizer_config_file.is_file():
1547+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
1548+
tokenizer_config_json = json.load(f)
1549+
if "add_prefix_space" in tokenizer_config_json:
1550+
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
1551+
1552+
# Apply to granite small models only
1553+
if self.hparams.get("vocab_size", 32000) == 49152:
1554+
self.gguf_writer.add_add_bos_token(False)
1555+
15441556
def set_gguf_parameters(self):
15451557
super().set_gguf_parameters()
15461558
hparams = self.hparams
@@ -1557,17 +1569,6 @@ def set_gguf_parameters(self):
15571569
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
15581570
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
15591571

1560-
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
1561-
if tokenizer_config_file.is_file():
1562-
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
1563-
tokenizer_config_json = json.load(f)
1564-
if "add_prefix_space" in tokenizer_config_json:
1565-
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
1566-
1567-
# Apply to granite small models only
1568-
if self.hparams.get("vocab_size", 32000) == 49152:
1569-
self.gguf_writer.add_add_bos_token(False)
1570-
15711572
@staticmethod
15721573
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
15731574
if n_head_kv is not None and n_head != n_head_kv:

convert_lora_to_gguf.py

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from math import prod
1313
from pathlib import Path
1414
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
15+
from transformers import AutoConfig
1516

1617
import torch
1718

@@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace:
256257
help="only print out what will be done, without writing any new files",
257258
)
258259
parser.add_argument(
259-
"--base", type=Path, required=True,
260-
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
260+
"--base", type=Path,
261+
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
261262
)
262263
parser.add_argument(
263264
"lora_path", type=Path,
@@ -267,6 +268,12 @@ def parse_args() -> argparse.Namespace:
267268
return parser.parse_args()
268269

269270

271+
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
272+
# normally, adapter does not come with base model config, we need to load it from AutoConfig
273+
config = AutoConfig.from_pretrained(hf_model_id)
274+
return config.to_dict()
275+
276+
270277
if __name__ == '__main__':
271278
args = parse_args()
272279
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@@ -281,7 +288,7 @@ def parse_args() -> argparse.Namespace:
281288

282289
ftype = ftype_map[args.outtype]
283290

284-
dir_base_model: Path = args.base
291+
dir_base_model: Path | None = args.base
285292
dir_lora: Path = args.lora_path
286293
lora_config = dir_lora / "adapter_config.json"
287294
input_model = dir_lora / "adapter_model.safetensors"
@@ -301,9 +308,29 @@ def parse_args() -> argparse.Namespace:
301308
input_model = os.path.join(dir_lora, "adapter_model.bin")
302309
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
303310

311+
# load LoRA config
312+
with open(lora_config, "r") as f:
313+
lparams: dict[str, Any] = json.load(f)
314+
304315
# load base model
305-
logger.info(f"Loading base model: {dir_base_model.name}")
306-
hparams = Model.load_hparams(dir_base_model)
316+
if dir_base_model is None:
317+
if "base_model_name_or_path" in lparams:
318+
model_id = lparams["base_model_name_or_path"]
319+
logger.info(f"Loading base model from Hugging Face: {model_id}")
320+
try:
321+
hparams = load_hparams_from_hf(model_id)
322+
except OSError as e:
323+
logger.error(f"Failed to load base model config: {e}")
324+
logger.error("Please try downloading the base model and add its path to --base")
325+
sys.exit(1)
326+
else:
327+
logger.error("'base_model_name_or_path' is not found in adapter_config.json")
328+
logger.error("Base model config is required. Please download the base model and add its path to --base")
329+
sys.exit(1)
330+
else:
331+
logger.info(f"Loading base model: {dir_base_model.name}")
332+
hparams = Model.load_hparams(dir_base_model)
333+
307334
with torch.inference_mode():
308335
try:
309336
model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -323,13 +350,15 @@ def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
323350
self.dir_model_card = dir_lora_model
324351
self.lora_alpha = float(lora_alpha)
325352

353+
def set_vocab(self):
354+
pass
355+
326356
def set_type(self):
327357
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
328358
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
329359

330360
def set_gguf_parameters(self):
331361
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
332-
super().set_gguf_parameters()
333362

334363
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
335364
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
@@ -350,7 +379,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
350379
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
351380
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
352381
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
353-
logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
382+
logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
354383
sys.exit(1)
355384

356385
if base_name in tensor_map:
@@ -384,9 +413,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
384413
yield (dest_name + ".lora_a", lora_a)
385414
yield (dest_name + ".lora_b", lora_b)
386415

387-
with open(lora_config, "r") as f:
388-
lparams: dict[str, Any] = json.load(f)
389-
390416
alpha: float = lparams["lora_alpha"]
391417

392418
model_instance = LoraModel(
@@ -399,6 +425,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
399425
dry_run=args.dry_run,
400426
dir_lora_model=dir_lora,
401427
lora_alpha=alpha,
428+
hparams=hparams,
402429
)
403430

404431
logger.info("Exporting model...")

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ else()
4949
endif()
5050
add_subdirectory(save-load-state)
5151
add_subdirectory(simple)
52+
add_subdirectory(simple-chat)
5253
add_subdirectory(speculative)
5354
add_subdirectory(tokenize)
5455
endif()

examples/server/server.cpp

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ struct server_slot {
247247
if (is_processing()) {
248248
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
249249

250+
t_last_used = ggml_time_us();
250251
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
251252
state = SLOT_STATE_IDLE;
252253
callback_on_release(id);
@@ -725,12 +726,12 @@ struct server_context {
725726
return nullptr;
726727
}
727728

728-
server_slot * get_available_slot(const std::string & prompt) {
729+
server_slot * get_available_slot(const server_task & task) {
729730
server_slot * ret = nullptr;
730731

731732
// find the slot that has at least n% prompt similarity
732-
if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
733-
int max_lcp_len = 0;
733+
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
734+
int lcs_len = 0;
734735
float similarity = 0;
735736

736737
for (server_slot & slot : slots) {
@@ -740,25 +741,26 @@ struct server_context {
740741
}
741742

742743
// skip the slot if it does not contains cached tokens
743-
if (slot.prompt_tokens.empty()) {
744+
if (slot.cache_tokens.empty()) {
744745
continue;
745746
}
746747

747-
// length of the Longest Common Prefix between the current slot's prompt and the input prompt
748-
int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
748+
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
749+
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
749750

750-
// fraction of the common substring length compared to the current slot's prompt length
751-
similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
751+
// fraction of the common subsequence length compared to the current slot's prompt length
752+
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
752753

753754
// select the current slot if the criteria match
754-
if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
755-
max_lcp_len = lcp_len;
755+
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
756+
lcs_len = cur_lcs_len;
757+
similarity = cur_similarity;
756758
ret = &slot;
757759
}
758760
}
759761

760762
if (ret != nullptr) {
761-
SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
763+
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
762764
}
763765
}
764766

@@ -1514,18 +1516,7 @@ struct server_context {
15141516
{
15151517
const int id_slot = json_value(task.data, "id_slot", -1);
15161518

1517-
server_slot * slot;
1518-
1519-
if (id_slot != -1) {
1520-
slot = get_slot_by_id(id_slot);
1521-
} else {
1522-
std::string prompt;
1523-
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
1524-
prompt = json_value(task.data, "prompt", std::string());
1525-
}
1526-
1527-
slot = get_available_slot(prompt);
1528-
}
1519+
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
15291520

15301521
if (slot == nullptr) {
15311522
// if no slot is available, we defer this task for processing later
@@ -2714,8 +2705,8 @@ int main(int argc, char ** argv) {
27142705
};
27152706

27162707
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
2717-
if (ctx_server.params.embedding || ctx_server.params.reranking) {
2718-
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
2708+
if (ctx_server.params.embedding) {
2709+
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
27192710
return;
27202711
}
27212712

@@ -2820,8 +2811,8 @@ int main(int argc, char ** argv) {
28202811

28212812
// TODO: maybe merge this function with "handle_completions_generic"
28222813
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
2823-
if (ctx_server.params.embedding || ctx_server.params.reranking) {
2824-
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
2814+
if (ctx_server.params.embedding) {
2815+
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
28252816
return;
28262817
}
28272818

@@ -2946,11 +2937,6 @@ int main(int argc, char ** argv) {
29462937
};
29472938

29482939
const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
2949-
// TODO: somehow clean up this checks in the future
2950-
if (!ctx_server.params.embedding || ctx_server.params.reranking) {
2951-
res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
2952-
return;
2953-
}
29542940
const json body = json::parse(req.body);
29552941
bool is_openai = false;
29562942

@@ -3002,10 +2988,11 @@ int main(int argc, char ** argv) {
30022988
};
30032989

30042990
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
3005-
if (!ctx_server.params.reranking) {
3006-
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
2991+
if (!ctx_server.params.reranking || ctx_server.params.embedding) {
2992+
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
30072993
return;
30082994
}
2995+
30092996
const json body = json::parse(req.body);
30102997

30112998
// TODO: implement

0 commit comments

Comments
 (0)