
Commit 88b5769

ggerganov and klosax authored
gguf : deduplicate (#2629)
* gguf : better type names
* dedup : CPU + Metal is working
* ggml : fix warnings about unused results
* llama.cpp : fix line feed and compiler warning
* llama : fix strncpy warning + note token_to_str does not write null
* llama : restore the original load/save session implementation

  Will migrate this to GGUF in the future

* convert-llama-h5-to-gguf.py : support alt ctx param name
* ggml : assert when using ggml_mul with non-F32 src1
* examples : dedup simple

---------

Co-authored-by: klosax <[email protected]>
1 parent 758ff1b commit 88b5769

21 files changed: +1776 lines, -7544 lines

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -529,7 +529,6 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama-util.h
             )
 
 target_include_directories(llama PUBLIC .)

Makefile

Lines changed: 3 additions & 9 deletions
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gguf-llama-simple gptneox-main
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gptneox-main
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -329,10 +329,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 
 OBJS += ggml-alloc.o
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h
@@ -388,10 +385,7 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
-gguf: examples/gguf/gguf.cpp build-info.h ggml.o gguf-llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-gguf-llama-simple: examples/gguf/gguf-llama-simple.cpp build-info.h ggml.o gguf-llama.o common.o $(OBJS)
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 gptneox-main: gptneox-main.cpp ggml.o $(OBJS)

convert-llama-7b-pth-to-gguf.py

Lines changed: 3 additions & 4 deletions
@@ -132,7 +132,7 @@ def count_model_parts(dir_model: str) -> int:
     toktype = 1 # defualt to normal token type
     if tokenizer.is_unknown(i): toktype = 2
     if tokenizer.is_control(i): toktype = 3
-
+
     # TODO: How to determinate if a token is user defined?
     # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
     # if tokenizer.is_user_defined(i): toktype = 4
@@ -223,7 +223,7 @@ def count_model_parts(dir_model: str) -> int:
         sys.exit()
 
     n_dims = len(data.shape)
-    data_dtype = data.dtype
+    data_dtype = data.dtype
 
     # if f32 desired, convert any float16 to float32
     if ftype == 0 and data.dtype == np.float16:
@@ -261,7 +261,6 @@ def count_model_parts(dir_model: str) -> int:
     for name in model_part.keys():
         data = model_part[name]
 
-
         old_dtype = data.dtype
 
         # we don't need these
@@ -284,7 +283,7 @@ def count_model_parts(dir_model: str) -> int:
         sys.exit()
 
     n_dims = len(data.shape)
-    data_dtype = data.dtype
+    data_dtype = data.dtype
 
     # if f32 desired, convert any float16 to float32
     if ftype == 0 and data.dtype == np.float16:

convert-llama-h5-to-gguf.py

Lines changed: 11 additions & 2 deletions
@@ -95,12 +95,21 @@ def count_model_parts(dir_model: str) -> int:
 else:
     hf_repo=""
 
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+    sys.exit()
+
+
 gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
 gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
 gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth")
-gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
+gguf_writer.add_context_length(llm_arch, ctx_length)
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
 gguf_writer.add_block_count(llm_arch, block_count)
 gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"])
@@ -318,7 +327,7 @@ def count_model_parts(dir_model: str) -> int:
     if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
         data = data.astype(np.float16)
 
-    print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+    print(name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
 
     gguf_writer.write_tensor_to_file(data)
 
examples/common.cpp

Lines changed: 0 additions & 16 deletions
@@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "-gqa" || arg == "--gqa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
         } else if (arg == "--rope-freq-base") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -546,8 +534,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
     fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -638,8 +624,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
     lparams.n_ctx = params.n_ctx;
     lparams.n_batch = params.n_batch;
-    lparams.n_gqa = params.n_gqa;
-    lparams.rms_norm_eps = params.rms_norm_eps;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu = params.main_gpu;
     lparams.tensor_split = params.tensor_split;

examples/common.h

Lines changed: 0 additions & 2 deletions
@@ -23,14 +23,12 @@ struct gpt_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
-    float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
     float rope_freq_base = 10000.0f; // RoPE base frequency
     float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
 

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 75 additions & 69 deletions
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+
 #include <unordered_map>
 #include <vector>
 #include <cassert>
@@ -502,7 +503,7 @@ bool is_ggml_file(const char *filename) {
         return false;
     }
     uint32_t magic = file.read_u32();
-    return magic == LLAMA_FILE_MAGIC;
+    return magic == GGUF_MAGIC;
 }
 
 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
@@ -590,75 +591,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     if (file.fp == NULL) {
         return;
     }
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC); // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-
-    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-
-    // stuff AK weights into GG weights one by one.
-    // w->token_embedding_table -> model->tok_embeddings
-    // float* -> struct ggml_tensor
-    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-
-    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    //print_row(model->norm, 0);
-
-    // for rms-att-weight
-    int row_length = model->hparams.n_embd;
-    const auto & hparams = model->hparams;
-    //int n_ff = model->hparams.n_embd;
-    int n_ff = get_n_ff(&hparams);
 
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-        auto & layer = model->layers[i];
-        // 1d
-        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
-
-        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-        stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
-
-        stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
-        stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output); // ?
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    (void) w;
+    // // write_magic
+    // file.write_u32(LLAMA_FILE_MAGIC); // magic
+    // file.write_u32(LLAMA_FILE_VERSION); // version
+    // // write_hparams
+    // file.write_u32(model->hparams.n_vocab);
+    // file.write_u32(model->hparams.n_embd);
+    // file.write_u32(model->hparams.n_mult);
+    // file.write_u32(model->hparams.n_head);
+    // file.write_u32(model->hparams.n_layer);
+    // file.write_u32(model->hparams.n_rot);
+    // file.write_u32(LLAMA_FTYPE_ALL_F32);
+    //
+    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    // uint32_t n_vocab = model->hparams.n_vocab;
+    // for (uint32_t i = 0; i < n_vocab; i++) {
+    //     const auto & token_score = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_score.tok.size());
+    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
+    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    // }
+    //
+    // // stuff AK weights into GG weights one by one.
+    // // w->token_embedding_table -> model->tok_embeddings
+    // // float* -> struct ggml_tensor
+    // stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    // stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+    //
+    // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    // //print_row(model->norm, 0);
+    //
+    // // for rms-att-weight
+    // int row_length = model->hparams.n_embd;
+    // const auto & hparams = model->hparams;
+    // //int n_ff = model->hparams.n_embd;
+    // int n_ff = get_n_ff(&hparams);
+    //
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+    //     auto & layer = model->layers[i];
+    //     // 1d
+    //     stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
+    //
+    //     // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+    //     stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
+    //
+    //     stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
+    //     stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
+    // }
+    // // write tensors
+    // write_tensor(&file, model->tok_embeddings);
+    // write_tensor(&file, model->norm);
+    // write_tensor(&file, model->output); // ?
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+    //     auto & layer = model->layers[i];
+    //
+    //     write_tensor(&file, layer.attention_norm);
+    //     write_tensor(&file, layer.wq);
+    //     write_tensor(&file, layer.wk);
+    //     write_tensor(&file, layer.wv);
+    //     write_tensor(&file, layer.wo);
+    //     write_tensor(&file, layer.ffn_norm);
+    //     write_tensor(&file, layer.w1);
+    //     write_tensor(&file, layer.w2);
+    //     write_tensor(&file, layer.w3);
+    // }
 }
 
 struct train_params get_default_train_params() {
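Note: is_ggml_file() now accepts an input file only if its first four bytes equal GGUF_MAGIC rather than the old LLAMA_FILE_MAGIC. The following is a rough, hedged equivalent in Python, assuming GGUF_MAGIC is the ASCII bytes "GGUF" read as a little-endian uint32 (0x46554747); the function name and sample path are illustrative, not part of the tool.

import struct

GGUF_MAGIC = 0x46554747  # assumption: ASCII "GGUF" interpreted as a little-endian uint32

def is_gguf_file(path: str) -> bool:
    # Read the first four bytes and compare them against the GGUF magic.
    with open(path, "rb") as f:
        head = f.read(4)
    if len(head) < 4:
        return False
    (magic,) = struct.unpack("<I", head)
    return magic == GGUF_MAGIC

print(is_gguf_file("models/vocab.gguf"))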
