From 10420f48d7388d74eced45ae85ae549af813c051 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:08:39 +0200
Subject: [PATCH 1/6] readme : update instructions with correct path

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 01990dc..4a1ad13 100644
--- a/README.md
+++ b/README.md
@@ -16,8 +16,8 @@ To fetch models from `huggingface` and convert them to `gguf` format run the fo
 ```sh
 cd models
 python download-repo.py BAAI/bge-base-en-v1.5 # or any other model
-python convert-to-ggml.py BAAI/bge-base-en-v1.5 f16
-python convert-to-ggml.py BAAI/bge-base-en-v1.5 f32
+python convert-to-ggml.py bge-base-en-v1.5 f16
+python convert-to-ggml.py bge-base-en-v1.5 f32
 ```
 
 ### Build

From b89da19561ce5763031f67abee8d0f2850095849 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:09:56 +0200
Subject: [PATCH 2/6] convert : on Mac, this option requires "accelerate" package

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`

---
 models/convert-to-ggml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/models/convert-to-ggml.py b/models/convert-to-ggml.py
index 6efeb24..a1ac626 100644
--- a/models/convert-to-ggml.py
+++ b/models/convert-to-ggml.py
@@ -47,7 +47,7 @@
 
 # load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
-model = AutoModel.from_pretrained(model_dir, low_cpu_mem_usage=True)
+model = AutoModel.from_pretrained(model_dir)
 
 # print model
 hparam_keys = [

From 1fdce3ddb44497b299b967a0bd2e51015cdbe757 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:10:24 +0200
Subject: [PATCH 3/6] bert : --cpu option was ignored for Metal

---
 bert.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/bert.cpp b/bert.cpp
index adfdcef..2c455e8 100644
--- a/bert.cpp
+++ b/bert.cpp
@@ -451,9 +451,13 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
 #endif
 
 #ifdef GGML_USE_METAL
-    new_bert->backend = ggml_backend_metal_init();
-    if (!new_bert->backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    if (!use_cpu) {
+        new_bert->backend = ggml_backend_metal_init();
+        if (!new_bert->backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        } else {
+            fprintf(stderr, "%s: using Metal backend\n", __func__);
+        }
     }
 #endif
 

From b506fd0527af5697bdc39dc777f746ad399d1ed5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:10:42 +0200
Subject: [PATCH 4/6] minor : whitespaces

---
 bert.cpp          | 2 +-
 examples/main.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bert.cpp b/bert.cpp
index 2c455e8..20b42aa 100644
--- a/bert.cpp
+++ b/bert.cpp
@@ -83,7 +83,7 @@ static std::string get_ftype(int ftype) {
 static void tensor_stats(ggml_tensor * t) {
     int32_t src0 = t->src[0] ? t->src[0]->backend : -1;
     int32_t src1 = t->src[1] ? t->src[1]->backend : -1;
-    fprintf(stderr, 
+    fprintf(stderr,
         "type = %s, dims = %d, shape = (%ld, %ld, %ld, %ld), backend = %d, src0 = %d, src1 = %d\n",
         ggml_type_name(t->type), ggml_n_dims(t), t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->backend, src0, src1
     );

diff --git a/examples/main.cpp b/examples/main.cpp
index 883cb48..1d46689 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -111,7 +111,7 @@ int main(int argc, char ** argv) {
 
     int64_t t_end_us = ggml_time_us();
     int64_t t_eval_us = t_end_us - t_mid_us;
-    
+
     printf("[ ");
     for (int i = 0; i < n_embd; i++) {
         const char * sep = (i == n_embd - 1) ? "" : ",";

From 62cac76bca82f0f10c0000023d295727d1b989f5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:11:33 +0200
Subject: [PATCH 5/6] bert : avoid extra transposing of V

---
Note: a standalone shape check for this layout change is sketched after
patch 6/6.

 bert.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bert.cpp b/bert.cpp
index 20b42aa..aca4cd4 100644
--- a/bert.cpp
+++ b/bert.cpp
@@ -790,7 +790,7 @@ ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
         struct ggml_tensor * V = cur;
         V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, V), model.layers[il].v_b); // [E, L, B]
         V = ggml_reshape_4d(ctx0, V, d_head, n_head, cur_max_len, n_batch_size); // [D, H, L, B]
-        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 0, 2, 1, 3)); // [D, L, H, B]
+        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // [L, D, H, B]
 
         // scaled attention
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // -> [L, L, H, B]
@@ -799,7 +799,6 @@ ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
         KQ = ggml_soft_max(ctx0, KQ);
 
         // get weighted values
-        V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); // -> [L, D, H, B]
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); // -> [D, L, H, B]
         KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); // -> [D, H, L, B]
 

From 8fbd461b0607ff44de54a1c6e9b426d42294c6be Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:23:28 +0200
Subject: [PATCH 6/6] readme : update execute instructions for Metal

---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 4a1ad13..4453ef1 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 # bert.cpp
 
-This is a [ggml](https://github.com/ggerganov/ggml) implementation of the BERT embedding architecture. It supports inference on both CPU and CUDA in floating point and a wide variety of quantization schemes. Includes Python bindings for batched inference.
+This is a [ggml](https://github.com/ggerganov/ggml) implementation of the BERT embedding architecture. It supports inference on CPU, CUDA and Metal in floating point and a wide variety of quantization schemes. Includes Python bindings for batched inference.
 
 This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp) as well as [embeddings.cpp](https://github.com/xyzhang626/embeddings.cpp). Thanks to both of you!
 
 ### Install
-Fetch this respository then download submodules and install packages with
+Fetch this repository then download submodules and install packages with
 ```sh
 git submodule update --init --recursive
 pip install -r requirements.txt
 ```
@@ -33,7 +33,7 @@ If you're compiling for GPU, you should run
 cmake -DGGML_CUBLAS=ON -B build .
 make -C build -j
 ```
-On some distros, you also need to specifiy the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
+On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
 
 And for Apple Metal, you should run
 ```sh
@@ -41,11 +41,15 @@ cmake -DGGML_METAL=ON -B build .
 make -C build -j
 ```
 
-### Excecute
+### Execute
 
 All executables are placed in `build/bin`. To run inference on a given text, run
 ```sh
+# CPU / CUDA
 build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
+
+# Metal
+make -C build -j && GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
 ```
 
 To force CPU usage, add the flag `-c`.
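
Note on PATCH 5/6: in ggml, `ggml_mul_mat(ctx, a, b)` contracts over `ne[0]` of both operands, so before the `ggml_mul_mat(ctx0, V, KQ)` call the sequence dimension of `V` must sit in `ne[0]`. The old code reached that `[L, D, H, B]` layout in two materialized steps (a `ggml_cont` of a permute, then a `ggml_cont` of a transpose); the single `1, 2, 0, 3` permute lands there directly, saving one full copy of `V` per layer. The sketch below is a minimal standalone shape check of that equivalence, not code from this repo: the dimension values are made up, and it assumes a ggml checkout where `ggml_init`, `ggml_new_tensor_4d`, `ggml_permute`, `ggml_transpose` and `ggml_cont` have their usual signatures.

```cpp
// shape_check.cpp: compare the old two-copy path with the new one-copy path.
// Build against ggml, e.g.: g++ shape_check.cpp -Iggml/include libggml.a
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = { /*mem_size*/ 16u * 1024 * 1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // hypothetical sizes: head dim D, heads H, tokens L, batch B
    const int D = 64, H = 12, L = 8, B = 2;
    struct ggml_tensor * V = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, L, B); // [D, H, L, B]

    // old path: permute to [D, L, H, B], then transpose to [L, D, H, B] (two ggml_cont copies)
    struct ggml_tensor * v_old = ggml_cont(ctx, ggml_permute(ctx, V, 0, 2, 1, 3));
    v_old = ggml_cont(ctx, ggml_transpose(ctx, v_old));

    // new path: permute straight to [L, D, H, B] (one ggml_cont copy)
    struct ggml_tensor * v_new = ggml_cont(ctx, ggml_permute(ctx, V, 1, 2, 0, 3));

    // both should print ne = (8, 64, 12, 2), i.e. [L, D, H, B]
    printf("old: (%d, %d, %d, %d)\n", (int) v_old->ne[0], (int) v_old->ne[1], (int) v_old->ne[2], (int) v_old->ne[3]);
    printf("new: (%d, %d, %d, %d)\n", (int) v_new->ne[0], (int) v_new->ne[1], (int) v_new->ne[2], (int) v_new->ne[3]);

    ggml_free(ctx);
    return 0;
}
```

With `KQ` of shape `[L, L, H, B]`, `ggml_mul_mat(ctx0, V, KQ)` then yields the `[D, L, H, B]` result noted in the patch comments; the two paths produce identical values and differ only in the number of intermediate copies.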