iamlemec · iamlemec · Feb 3, 2024 · Feb 3, 2024 · Feb 3, 2024 · Feb 3, 2024
diff --git a/README.md b/README.md
@@ -1,12 +1,12 @@
 # bert.cpp
 
-This is a [ggml](https://github.com/ggerganov/ggml) implementation of the BERT embedding architecture. It supports inference on both CPU and CUDA in floating point and a wide variety of quantization schemes. Includes Python bindings for batched inference.
+This is a [ggml](https://github.com/ggerganov/ggml) implementation of the BERT embedding architecture. It supports inference on CPU, CUDA and Metal in floating point and a wide variety of quantization schemes. Includes Python bindings for batched inference.
 
 This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp) as well as [embeddings.cpp](https://github.com/xyzhang626/embeddings.cpp). Thanks to both of you!
 
 ### Install
 
-Fetch this respository then download submodules and install packages with
+Fetch this repository then download submodules and install packages with
 ```sh
 git submodule update --init --recursive
 pip install -r requirements.txt
@@ -16,8 +16,8 @@ To fetch models from `huggingface`  and convert them to `gguf` format run the fo
 ```sh
 cd models
 python download-repo.py BAAI/bge-base-en-v1.5 # or any other model
-python convert-to-ggml.py BAAI/bge-base-en-v1.5 f16
-python convert-to-ggml.py BAAI/bge-base-en-v1.5 f32
+python convert-to-ggml.py bge-base-en-v1.5 f16
+python convert-to-ggml.py bge-base-en-v1.5 f32
 ```
 
 ### Build
@@ -33,19 +33,23 @@ If you're compiling for GPU, you should run
 cmake -DGGML_CUBLAS=ON -B build .
 make -C build -j
 ```
-On some distros, you also need to specifiy the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
+On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
 
 And for Apple Metal, you should run
 ```sh
 cmake -DGGML_METAL=ON -B build .
 make -C build -j
 ```
 
-### Excecute
+### Execute
 
 All executables are placed in `build/bin`. To run inference on a given text, run
 ```sh
+# CPU / CUDA
 build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
+
+# Metal
+make -C build -j && GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
 ```
 To force CPU usage, add the flag `-c`.
 

diff --git a/bert.cpp b/bert.cpp
@@ -83,7 +83,7 @@ static std::string get_ftype(int ftype) {
 static void tensor_stats(ggml_tensor * t) {
     int32_t src0 = t->src[0] ? t->src[0]->backend : -1;
     int32_t src1 = t->src[1] ? t->src[1]->backend : -1;
-    fprintf(stderr, 
+    fprintf(stderr,
         "type = %s, dims = %d, shape = (%ld, %ld, %ld, %ld), backend = %d, src0 = %d, src1 = %d\n",
         ggml_type_name(t->type), ggml_n_dims(t), t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->backend, src0, src1
     );
@@ -451,9 +451,13 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
 #endif
 
 #ifdef GGML_USE_METAL
-    new_bert->backend = ggml_backend_metal_init();
-    if (!new_bert->backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    if (!use_cpu) {
+        new_bert->backend = ggml_backend_metal_init();
+        if (!new_bert->backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        } else {
+            fprintf(stderr, "%s: using Metal backend\n", __func__);
+        }
     }
 #endif
 
@@ -786,7 +790,7 @@ ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
             struct ggml_tensor * V = cur;
             V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, V), model.layers[il].v_b); // [E, L, B]
             V = ggml_reshape_4d(ctx0, V, d_head, n_head, cur_max_len, n_batch_size); // [D, H, L, B]
-            V = ggml_cont(ctx0, ggml_permute(ctx0, V, 0, 2, 1, 3)); // [D, L, H, B]
+            V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // [L, D, H, B]
 
             // scaled attention
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // -> [L, L, H, B]
@@ -795,7 +799,6 @@ ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
             KQ = ggml_soft_max(ctx0, KQ);
 
             // get weighted values
-            V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); // -> [L, D, H, B]
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); // -> [D, L, H, B]
             KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); // -> [D, H, L, B]
 

diff --git a/examples/main.cpp b/examples/main.cpp
@@ -111,7 +111,7 @@ int main(int argc, char ** argv) {
 
     int64_t t_end_us = ggml_time_us();
     int64_t t_eval_us = t_end_us - t_mid_us;
-    
+
     printf("[ ");
     for (int i = 0; i < n_embd; i++) {
         const char * sep = (i == n_embd - 1) ? "" : ",";

diff --git a/models/convert-to-ggml.py b/models/convert-to-ggml.py
@@ -47,7 +47,7 @@
 
 # load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
-model = AutoModel.from_pretrained(model_dir, low_cpu_mem_usage=True)
+model = AutoModel.from_pretrained(model_dir)
 
 # print model
 hparam_keys = [