Commit bad2726

Merge pull request #3 from ggerganov/gg/mac-improvements
bert : various improvements

2 parents 7f084fa + 8fbd461

4 files changed: +21 −14 lines

README.md

Lines changed: 10 additions & 6 deletions
@@ -1,12 +1,12 @@
 # bert.cpp
 
-This is a [ggml](https://github.com/ggerganov/ggml) implementation of the BERT embedding architecture. It supports inference on both CPU and CUDA in floating point and a wide variety of quantization schemes. Includes Python bindings for batched inference.
+This is a [ggml](https://github.com/ggerganov/ggml) implementation of the BERT embedding architecture. It supports inference on CPU, CUDA and Metal in floating point and a wide variety of quantization schemes. Includes Python bindings for batched inference.
 
 This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp) as well as [embeddings.cpp](https://github.com/xyzhang626/embeddings.cpp). Thanks to both of you!
 
 ### Install
 
-Fetch this respository then download submodules and install packages with
+Fetch this repository then download submodules and install packages with
 ```sh
 git submodule update --init --recursive
 pip install -r requirements.txt
@@ -16,8 +16,8 @@ To fetch models from `huggingface` and convert them to `gguf` format run the fo
 ```sh
 cd models
 python download-repo.py BAAI/bge-base-en-v1.5 # or any other model
-python convert-to-ggml.py BAAI/bge-base-en-v1.5 f16
-python convert-to-ggml.py BAAI/bge-base-en-v1.5 f32
+python convert-to-ggml.py bge-base-en-v1.5 f16
+python convert-to-ggml.py bge-base-en-v1.5 f32
 ```
 
 ### Build
@@ -33,19 +33,23 @@ If you're compiling for GPU, you should run
 cmake -DGGML_CUBLAS=ON -B build .
 make -C build -j
 ```
-On some distros, you also need to specifiy the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
+On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
 
 And for Apple Metal, you should run
 ```sh
 cmake -DGGML_METAL=ON -B build .
 make -C build -j
 ```
 
-### Excecute
+### Execute
 
 All executables are placed in `build/bin`. To run inference on a given text, run
 ```sh
+# CPU / CUDA
 build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
+
+# Metal
+make -C build -j && GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
 ```
 To force CPU usage, add the flag `-c`.

bert.cpp

Lines changed: 9 additions & 6 deletions
@@ -83,7 +83,7 @@ static std::string get_ftype(int ftype) {
 static void tensor_stats(ggml_tensor * t) {
     int32_t src0 = t->src[0] ? t->src[0]->backend : -1;
     int32_t src1 = t->src[1] ? t->src[1]->backend : -1;
-    fprintf(stderr, 
+    fprintf(stderr,
         "type = %s, dims = %d, shape = (%ld, %ld, %ld, %ld), backend = %d, src0 = %d, src1 = %d\n",
         ggml_type_name(t->type), ggml_n_dims(t), t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->backend, src0, src1
     );
@@ -451,9 +451,13 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
 #endif
 
 #ifdef GGML_USE_METAL
-    new_bert->backend = ggml_backend_metal_init();
-    if (!new_bert->backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    if (!use_cpu) {
+        new_bert->backend = ggml_backend_metal_init();
+        if (!new_bert->backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        } else {
+            fprintf(stderr, "%s: using Metal backend\n", __func__);
+        }
     }
 #endif
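
The hunk above gates Metal initialization behind the `use_cpu` flag and logs which backend was chosen, so forcing CPU is honored on macOS builds too. Below is a minimal sketch of the resulting selection logic, assuming ggml's backend API (`ggml_backend_metal_init`, `ggml_backend_cpu_init`); `pick_backend` is a hypothetical helper for illustration, not code from this repo:

```c
#include <stdbool.h>
#include <stdio.h>
#include "ggml-backend.h"
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

// Prefer Metal unless the caller forced CPU; fall back to CPU otherwise.
static ggml_backend_t pick_backend(bool use_cpu) {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_METAL
    if (!use_cpu) {
        backend = ggml_backend_metal_init();
        if (!backend) {
            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
        } else {
            fprintf(stderr, "%s: using Metal backend\n", __func__);
        }
    }
#endif
    if (!backend) {
        backend = ggml_backend_cpu_init(); // CPU build, forced CPU, or Metal init failure
    }
    return backend;
}
```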

@@ -786,7 +790,7 @@ ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
         struct ggml_tensor * V = cur;
         V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, V), model.layers[il].v_b); // [E, L, B]
         V = ggml_reshape_4d(ctx0, V, d_head, n_head, cur_max_len, n_batch_size); // [D, H, L, B]
-        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 0, 2, 1, 3)); // [D, L, H, B]
+        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // [L, D, H, B]
 
         // scaled attention
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // -> [L, L, H, B]
@@ -795,7 +799,6 @@ ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
         KQ = ggml_soft_max(ctx0, KQ);
 
         // get weighted values
-        V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); // -> [L, D, H, B]
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); // -> [D, L, H, B]
         KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); // -> [D, H, L, B]
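
These two hunks fold V's transpose into the initial permute: the old graph permuted V to [D, L, H, B] and later needed a separate `ggml_transpose` plus a second `ggml_cont` to reach the [L, D, H, B] layout required by `ggml_mul_mat` against KQ, while the new graph permutes straight there, saving one full tensor copy per layer. A standalone shape check of the equivalence, assuming the ggml C API; an illustrative sketch, not code from this repo:

```c
#include <assert.h>
#include "ggml.h"

int main(void) {
    // no_alloc: we only build views to compare shapes, no tensor data needed
    struct ggml_init_params params = { 16 * 1024 * 1024, NULL, true };
    struct ggml_context * ctx = ggml_init(params);

    // example sizes: head dim D, heads H, sequence length L, batch B
    const int D = 64, H = 12, L = 32, B = 2;
    struct ggml_tensor * V = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, D, H, L, B); // [D, H, L, B]

    // old path: permute to [D, L, H, B], then transpose to [L, D, H, B]
    struct ggml_tensor * v_old = ggml_transpose(ctx, ggml_permute(ctx, V, 0, 2, 1, 3));
    // new path: a single permute straight to [L, D, H, B]
    struct ggml_tensor * v_new = ggml_permute(ctx, V, 1, 2, 0, 3);

    // both paths agree on every dimension, so the extra ggml_cont can be dropped
    for (int i = 0; i < 4; i++) {
        assert(v_old->ne[i] == v_new->ne[i]);
    }

    ggml_free(ctx);
    return 0;
}
```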

examples/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ int main(int argc, char ** argv) {
 
     int64_t t_end_us = ggml_time_us();
     int64_t t_eval_us = t_end_us - t_mid_us;
-    
+
     printf("[ ");
     for (int i = 0; i < n_embd; i++) {
        const char * sep = (i == n_embd - 1) ? "" : ",";

models/convert-to-ggml.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@
 
 # load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
-model = AutoModel.from_pretrained(model_dir, low_cpu_mem_usage=True)
+model = AutoModel.from_pretrained(model_dir)
 
 # print model
 hparam_keys = [
