From 10420f48d7388d74eced45ae85ae549af813c051 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:08:39 +0200
Subject: [PATCH 1/6] readme : update instructions with correct path

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 01990dc..4a1ad13 100644
--- a/README.md
+++ b/README.md
@@ -16,8 +16,8 @@ To fetch models from `huggingface` and convert them to `gguf` format run the fo
 ```sh
 cd models
 python download-repo.py BAAI/bge-base-en-v1.5 # or any other model
-python convert-to-ggml.py BAAI/bge-base-en-v1.5 f16
-python convert-to-ggml.py BAAI/bge-base-en-v1.5 f32
+python convert-to-ggml.py bge-base-en-v1.5 f16
+python convert-to-ggml.py bge-base-en-v1.5 f32
 ```
 
 ### Build

From b89da19561ce5763031f67abee8d0f2850095849 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:09:56 +0200
Subject: [PATCH 2/6] convert : on Mac, this option requires "accelerate" package

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`

---
 models/convert-to-ggml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/models/convert-to-ggml.py b/models/convert-to-ggml.py
index 6efeb24..a1ac626 100644
--- a/models/convert-to-ggml.py
+++ b/models/convert-to-ggml.py
@@ -47,7 +47,7 @@
 
 # load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
-model = AutoModel.from_pretrained(model_dir, low_cpu_mem_usage=True)
+model = AutoModel.from_pretrained(model_dir)
 
 # print model
 hparam_keys = [

From 1fdce3ddb44497b299b967a0bd2e51015cdbe757 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:10:24 +0200
Subject: [PATCH 3/6] bert : --cpu option was ignored for Metal

---
 bert.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/bert.cpp b/bert.cpp
index adfdcef..2c455e8 100644
--- a/bert.cpp
+++ b/bert.cpp
@@ -451,9 +451,13 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
 #endif
 
 #ifdef GGML_USE_METAL
-    new_bert->backend = ggml_backend_metal_init();
-    if (!new_bert->backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    if (!use_cpu) {
+        new_bert->backend = ggml_backend_metal_init();
+        if (!new_bert->backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        } else {
+            fprintf(stderr, "%s: using Metal backend\n", __func__);
+        }
     }
 #endif
 

From b506fd0527af5697bdc39dc777f746ad399d1ed5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:10:42 +0200
Subject: [PATCH 4/6] minor : whitespaces

---
 bert.cpp          | 2 +-
 examples/main.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bert.cpp b/bert.cpp
index 2c455e8..20b42aa 100644
--- a/bert.cpp
+++ b/bert.cpp
@@ -83,7 +83,7 @@ static std::string get_ftype(int ftype) {
 static void tensor_stats(ggml_tensor * t) {
     int32_t src0 = t->src[0] ? t->src[0]->backend : -1;
     int32_t src1 = t->src[1] ? t->src[1]->backend : -1;
-    fprintf(stderr, 
+    fprintf(stderr,
         "type = %s, dims = %d, shape = (%ld, %ld, %ld, %ld), backend = %d, src0 = %d, src1 = %d\n",
         ggml_type_name(t->type), ggml_n_dims(t), t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->backend, src0, src1
     );

diff --git a/examples/main.cpp b/examples/main.cpp
index 883cb48..1d46689 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -111,7 +111,7 @@ int main(int argc, char ** argv) {
 
     int64_t t_end_us = ggml_time_us();
     int64_t t_eval_us = t_end_us - t_mid_us;
-    
+
     printf("[ ");
     for (int i = 0; i < n_embd; i++) {
         const char * sep = (i == n_embd - 1) ? "" : ",";

From 62cac76bca82f0f10c0000023d295727d1b989f5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:11:33 +0200
Subject: [PATCH 5/6] bert : avoid extra transposing of V

---
Note: a standalone shape check for this layout change is sketched after
patch 6/6.

 bert.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bert.cpp b/bert.cpp
index 20b42aa..aca4cd4 100644
--- a/bert.cpp
+++ b/bert.cpp
@@ -790,7 +790,7 @@ ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
         struct ggml_tensor * V = cur;
         V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, V), model.layers[il].v_b); // [E, L, B]
         V = ggml_reshape_4d(ctx0, V, d_head, n_head, cur_max_len, n_batch_size); // [D, H, L, B]
-        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 0, 2, 1, 3)); // [D, L, H, B]
+        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // [L, D, H, B]
 
         // scaled attention
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // -> [L, L, H, B]
@@ -799,7 +799,6 @@ ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
         KQ = ggml_soft_max(ctx0, KQ);
 
         // get weighted values
-        V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); // -> [L, D, H, B]
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); // -> [D, L, H, B]
         KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); // -> [D, H, L, B]
 

From 8fbd461b0607ff44de54a1c6e9b426d42294c6be Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 3 Feb 2024 11:23:28 +0200
Subject: [PATCH 6/6] readme : update execute instructions for Metal

---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 4a1ad13..4453ef1 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 # bert.cpp
 
-This is a [ggml](https://github.com/ggerganov/ggml) implementation of the BERT embedding architecture. It supports inference on both CPU and CUDA in floating point and a wide variety of quantization schemes. Includes Python bindings for batched inference.
+This is a [ggml](https://github.com/ggerganov/ggml) implementation of the BERT embedding architecture. It supports inference on CPU, CUDA and Metal in floating point and a wide variety of quantization schemes. Includes Python bindings for batched inference.
 
 This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp) as well as [embeddings.cpp](https://github.com/xyzhang626/embeddings.cpp). Thanks to both of you!
 
 ### Install
-Fetch this respository then download submodules and install packages with
+Fetch this repository then download submodules and install packages with
 ```sh
 git submodule update --init --recursive
 pip install -r requirements.txt
 ```
@@ -33,7 +33,7 @@ If you're compiling for GPU, you should run
 cmake -DGGML_CUBLAS=ON -B build .
 make -C build -j
 ```
-On some distros, you also need to specifiy the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
+On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
 
 And for Apple Metal, you should run
 ```sh
@@ -41,11 +41,15 @@ cmake -DGGML_METAL=ON -B build .
 make -C build -j
 ```
 
-### Excecute
+### Execute
 
 All executables are placed in `build/bin`. To run inference on a given text, run
 ```sh
+# CPU / CUDA
 build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
+
+# Metal
+make -C build -j && GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
 ```
 
 To force CPU usage, add the flag `-c`.
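
Note on PATCH 5/6: in ggml, `ggml_mul_mat(ctx, a, b)` contracts over `ne[0]` of both operands, so before the `ggml_mul_mat(ctx0, V, KQ)` call the sequence dimension of `V` must sit in `ne[0]`. The old code reached that `[L, D, H, B]` layout in two materialized steps (a `ggml_cont` of a permute, then a `ggml_cont` of a transpose); the single `1, 2, 0, 3` permute lands there directly, saving one full copy of `V` per layer. The sketch below is a minimal standalone shape check of that equivalence, not code from this repo: the dimension values are made up, and it assumes a ggml checkout where `ggml_init`, `ggml_new_tensor_4d`, `ggml_permute`, `ggml_transpose` and `ggml_cont` have their usual signatures.

```cpp
// shape_check.cpp: compare the old two-copy path with the new one-copy path.
// Build against ggml, e.g.: g++ shape_check.cpp -Iggml/include libggml.a
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = { /*mem_size*/ 16u * 1024 * 1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // hypothetical sizes: head dim D, heads H, tokens L, batch B
    const int D = 64, H = 12, L = 8, B = 2;
    struct ggml_tensor * V = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, L, B); // [D, H, L, B]

    // old path: permute to [D, L, H, B], then transpose to [L, D, H, B] (two ggml_cont copies)
    struct ggml_tensor * v_old = ggml_cont(ctx, ggml_permute(ctx, V, 0, 2, 1, 3));
    v_old = ggml_cont(ctx, ggml_transpose(ctx, v_old));

    // new path: permute straight to [L, D, H, B] (one ggml_cont copy)
    struct ggml_tensor * v_new = ggml_cont(ctx, ggml_permute(ctx, V, 1, 2, 0, 3));

    // both should print ne = (8, 64, 12, 2), i.e. [L, D, H, B]
    printf("old: (%d, %d, %d, %d)\n", (int) v_old->ne[0], (int) v_old->ne[1], (int) v_old->ne[2], (int) v_old->ne[3]);
    printf("new: (%d, %d, %d, %d)\n", (int) v_new->ne[0], (int) v_new->ne[1], (int) v_new->ne[2], (int) v_new->ne[3]);

    ggml_free(ctx);
    return 0;
}
```

With `KQ` of shape `[L, L, H, B]`, `ggml_mul_mat(ctx0, V, KQ)` then yields the `[D, L, H, B]` result noted in the patch comments; the two paths produce identical values and differ only in the number of intermediate copies.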