
Commit 530bec9

feat(llama.cpp): do not specify backends to autoload and add llama.cpp variants (#2232)
* feat(initializer): do not specify backends to autoload
  We can simply try to autoload the backends extracted into the asset dir. This allows building variants of the same backend (e.g. with different instruction sets), so a single binary covers all the variants.
  Signed-off-by: mudler <[email protected]>
* refactor(prepare): refactor out the llama.cpp prepare steps
  Make them idempotent so that we can re-build.
  Signed-off-by: mudler <[email protected]>
* [TEST] feat(build): build a noavx version alongside the standard one
  Signed-off-by: mudler <[email protected]>
* build: make the build parallel
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* build: do not override CMAKE_ARGS
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* build: add a fallback variant
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* Fixups
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* fix(huggingface-langchain): fail if no token is set
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* fix(huggingface-langchain): rename
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* fix: do not autoload local-store
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* fix: give priority between the listed backends
  Signed-off-by: Ettore Di Giacinto <[email protected]>

Signed-off-by: mudler <[email protected]>
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent fa10302 commit 530bec9

File tree: 7 files changed, +161 additions, -54 deletions


Makefile

Lines changed: 31 additions & 10 deletions
@@ -152,9 +152,11 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
+ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-noavx
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
@@ -293,6 +295,7 @@ clean: ## Remove build related file
 	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/cpp/llama clean
+	rm -rf backend/cpp/llama-* || true
 	$(MAKE) dropreplace
 	$(MAKE) protogen-clean
 	rmdir pkg/grpc/proto || true
@@ -311,7 +314,7 @@ build: prepare backend-assets grpcs ## Build the project
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

 build-minimal:
-	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS=backend-assets/grpc/llama-cpp GO_TAGS=none $(MAKE) build
+	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build

 build-api:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
@@ -616,8 +619,8 @@ backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/go
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/

-backend-assets/grpc/langchain-huggingface: backend-assets/grpc
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
+backend-assets/grpc/huggingface: backend-assets/grpc
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/

 backend/cpp/llama/llama.cpp:
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
@@ -629,7 +632,7 @@ ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
 	-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
 	-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
 	-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
-backend/cpp/llama/grpc-server:
+build-llama-cpp-grpc-server:
 # Conditionally build grpc for the llama backend to use if needed
 ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
 	$(MAKE) -C backend/cpp/grpc build
@@ -638,19 +641,37 @@ ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
 	PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
 	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) \
-	$(MAKE) -C backend/cpp/llama grpc-server
+	$(MAKE) -C backend/cpp/${VARIANT} grpc-server
 else
 	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
 endif

-backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
-	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+backend-assets/grpc/llama-cpp: backend-assets/grpc
+	$(info ${GREEN}I llama-cpp build info:standard${RESET})
+	cp -rf backend/cpp/llama backend/cpp/llama-default
+	$(MAKE) -C backend/cpp/llama-default purge
+	$(MAKE) VARIANT="llama-default" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-default/grpc-server backend-assets/grpc/llama-cpp
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
+	cp backend/cpp/llama-default/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif

+backend-assets/grpc/llama-cpp-noavx: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-noavx
+	$(MAKE) -C backend/cpp/llama-noavx purge
+	$(info ${GREEN}I llama-cpp build info:noavx${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF" $(MAKE) VARIANT="llama-noavx" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-noavx/grpc-server backend-assets/grpc/llama-cpp-noavx
+
+backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-fallback
+	$(MAKE) -C backend/cpp/llama-fallback purge
+	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
+
 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
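
The three new targets produce the same llama.cpp gRPC server built into separate copies of backend/cpp/llama with progressively fewer instruction-set requirements: the standard build, a noavx build (AVX2/AVX512 disabled), and a fallback build (F16C, FMA, AVX2 and AVX512 disabled). The commit itself does no runtime CPU detection; the loader simply tries the variants in priority order. Purely as an illustration of which instruction sets separate the variants, here is a hedged Go sketch using golang.org/x/sys/cpu, which is not a dependency added by this change:

// Illustration only: the commit does not detect CPU features at runtime; the
// model loader just tries llama-cpp, then llama-cpp-fallback, in order. This
// sketch only shows which instruction sets distinguish the build variants.
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	switch {
	case cpu.X86.HasAVX2 && cpu.X86.HasFMA:
		// Standard llama-cpp build: AVX2/AVX512 code paths enabled.
		fmt.Println("standard llama-cpp variant should run")
	case cpu.X86.HasAVX:
		// llama-cpp-noavx: built with -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF.
		fmt.Println("llama-cpp-noavx variant is the better fit")
	default:
		// llama-cpp-fallback: F16C, FMA, AVX2 and AVX512 all disabled.
		fmt.Println("llama-cpp-fallback variant is the safe choice")
	}
}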

backend/cpp/llama/Makefile

Lines changed: 9 additions & 17 deletions
@@ -43,31 +43,23 @@ llama.cpp:

 llama.cpp/examples/grpc-server: llama.cpp
 	mkdir -p llama.cpp/examples/grpc-server
-	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
-	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
-	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
-	cp -rfv $(abspath ./)/utils.hpp llama.cpp/examples/grpc-server/
-	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
-	## XXX: In some versions of CMake clip wasn't being built before llama.
-	## This is an hack for now, but it should be fixed in the future.
-	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-	cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
-	echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
-	cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+	bash prepare.sh

 rebuild:
-	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
-	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
-	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	bash prepare.sh
 	rm -rf grpc-server
 	$(MAKE) grpc-server

-clean:
-	rm -rf llama.cpp
+purge:
+	rm -rf llama.cpp/build
+	rm -rf llama.cpp/examples/grpc-server
 	rm -rf grpc-server

+clean: purge
+	rm -rf llama.cpp
+
 grpc-server: llama.cpp llama.cpp/examples/grpc-server
+	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	bash -c "source $(ONEAPI_VARS); \
 	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release"

backend/cpp/llama/prepare.sh

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
1+
#!/bin/bash
2+
3+
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
4+
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
5+
cp -rfv json.hpp llama.cpp/examples/grpc-server/
6+
cp -rfv utils.hpp llama.cpp/examples/grpc-server/
7+
8+
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
9+
echo "grpc-server already added"
10+
else
11+
echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
12+
fi
13+
14+
## XXX: In some versions of CMake clip wasn't being built before llama.
15+
## This is an hack for now, but it should be fixed in the future.
16+
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
17+
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
18+
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
19+
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
20+
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp

backend/go/llm/langchain/langchain.go

Lines changed: 8 additions & 2 deletions
@@ -4,6 +4,7 @@ package main
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
+	"os"

 	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
@@ -18,9 +19,14 @@ type LLM struct {
 }

 func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	llm.langchain, _ = langchain.NewHuggingFace(opts.Model)
+	var err error
+	hfToken := os.Getenv("HUGGINGFACEHUB_API_TOKEN")
+	if hfToken == "" {
+		return fmt.Errorf("no huggingface token provided")
+	}
+	llm.langchain, err = langchain.NewHuggingFace(opts.Model, hfToken)
 	llm.model = opts.Model
-	return nil
+	return err
 }

 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
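
With this change the backend fails fast instead of discarding the constructor error: Load now requires HUGGINGFACEHUB_API_TOKEN and returns whatever NewHuggingFace reports. A minimal usage sketch of the new contract follows; the import path is assumed from the repo layout and the model name is purely illustrative:

// Sketch of the new token requirement: without HUGGINGFACEHUB_API_TOKEN the
// backend refuses to load. The error text mirrors the one added in this commit.
package main

import (
	"fmt"
	"os"

	"github.com/go-skynet/LocalAI/pkg/langchain"
)

func main() {
	token := os.Getenv("HUGGINGFACEHUB_API_TOKEN")
	if token == "" {
		fmt.Println("no huggingface token provided")
		return
	}
	// NewHuggingFace now takes the token explicitly and validates it.
	llm, err := langchain.NewHuggingFace("gpt2", token)
	if err != nil {
		fmt.Println("backend not usable:", err)
		return
	}
	fmt.Printf("huggingface client ready: %T\n", llm)
}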

core/http/app_test.go

Lines changed: 2 additions & 2 deletions
@@ -787,11 +787,11 @@ var _ = Describe("API test", func() {
 	})

 	It("returns errors", func() {
-		backends := len(model.AutoLoadBackends) + 1 // +1 for huggingface
 		_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: testPrompt})
 		Expect(err).To(HaveOccurred())
-		Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("error, status code: 500, message: could not load model - all backends returned error: %d errors occurred:", backends)))
+		Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error:"))
 	})
+
 	It("transcribes audio", func() {
 		if runtime.GOOS != "linux" {
 			Skip("test supported only on linux")

pkg/langchain/huggingface.go

Lines changed: 8 additions & 2 deletions
@@ -2,26 +2,32 @@ package langchain

 import (
 	"context"
+	"fmt"

 	"github.com/tmc/langchaingo/llms"
 	"github.com/tmc/langchaingo/llms/huggingface"
 )

 type HuggingFace struct {
 	modelPath string
+	token     string
 }

-func NewHuggingFace(repoId string) (*HuggingFace, error) {
+func NewHuggingFace(repoId, token string) (*HuggingFace, error) {
+	if token == "" {
+		return nil, fmt.Errorf("no huggingface token provided")
+	}
 	return &HuggingFace{
 		modelPath: repoId,
+		token:     token,
 	}, nil
 }

 func (s *HuggingFace) PredictHuggingFace(text string, opts ...PredictOption) (*Predict, error) {
 	po := NewPredictOptions(opts...)

 	// Init client
-	llm, err := huggingface.New()
+	llm, err := huggingface.New(huggingface.WithToken(s.token))
 	if err != nil {
 		return nil, err
 	}

pkg/model/initializers.go

Lines changed: 83 additions & 21 deletions
@@ -2,27 +2,32 @@ package model

 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
+	"slices"
 	"strings"
 	"time"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-	"github.com/hashicorp/go-multierror"
 	"github.com/phayes/freeport"
 	"github.com/rs/zerolog/log"
 )

 var Aliases map[string]string = map[string]string{
-	"go-llama":       LLamaCPP,
-	"llama":          LLamaCPP,
-	"embedded-store": LocalStoreBackend,
+	"go-llama":              LLamaCPP,
+	"llama":                 LLamaCPP,
+	"embedded-store":        LocalStoreBackend,
+	"langchain-huggingface": LCHuggingFaceBackend,
 }

 const (
-	LlamaGGML = "llama-ggml"
-	LLamaCPP  = "llama-cpp"
+	LlamaGGML = "llama-ggml"
+	LLamaCPP  = "llama-cpp"
+
+	LLamaCPPFallback = "llama-cpp-fallback"
+
 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend   = "gpt4all-mpt"
 	Gpt4AllJBackend     = "gpt4all-j"
@@ -34,21 +39,73 @@ const (
 	StableDiffusionBackend = "stablediffusion"
 	TinyDreamBackend       = "tinydream"
 	PiperBackend           = "piper"
-	LCHuggingFaceBackend   = "langchain-huggingface"
+	LCHuggingFaceBackend   = "huggingface"

 	LocalStoreBackend = "local-store"
 )

-var AutoLoadBackends []string = []string{
-	LLamaCPP,
-	LlamaGGML,
-	Gpt4All,
-	BertEmbeddingsBackend,
-	RwkvBackend,
-	WhisperBackend,
-	StableDiffusionBackend,
-	TinyDreamBackend,
-	PiperBackend,
+func backendPath(assetDir, backend string) string {
+	return filepath.Join(assetDir, "backend-assets", "grpc", backend)
+}
+
+func backendsInAssetDir(assetDir string) ([]string, error) {
+	excludeBackends := []string{"local-store"}
+	entry, err := os.ReadDir(backendPath(assetDir, ""))
+	if err != nil {
+		return nil, err
+	}
+	var backends []string
+ENTRY:
+	for _, e := range entry {
+		for _, exclude := range excludeBackends {
+			if e.Name() == exclude {
+				continue ENTRY
+			}
+		}
+		if !e.IsDir() {
+			backends = append(backends, e.Name())
+		}
+	}
+
+	// order backends from the asset directory.
+	// as we scan for backends, we want to keep some order which backends are tried of.
+	// for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last.
+
+	// sets a priority list
+	// First has more priority
+	priorityList := []string{
+		// First llama.cpp and llama-ggml
+		LLamaCPP, LLamaCPPFallback, LlamaGGML, Gpt4All,
+	}
+	toTheEnd := []string{
+		// last has to be huggingface
+		LCHuggingFaceBackend,
+		// then bert embeddings
+		BertEmbeddingsBackend,
+	}
+	slices.Reverse(priorityList)
+	slices.Reverse(toTheEnd)
+
+	// order certain backends first
+	for _, b := range priorityList {
+		for i, be := range backends {
+			if be == b {
+				backends = append([]string{be}, append(backends[:i], backends[i+1:]...)...)
+				break
+			}
+		}
+	}
+	// make sure that some others are pushed at the end
+	for _, b := range toTheEnd {
+		for i, be := range backends {
+			if be == b {
+				backends = append(append(backends[:i], backends[i+1:]...), be)
+				break
+			}
+		}
+	}
+
+	return backends, nil
 }

 // starts the grpcModelProcess for the backend, and returns a grpc client
@@ -99,7 +156,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 			client = ModelAddress(uri)
 		}
 	} else {
-		grpcProcess := filepath.Join(o.assetDir, "backend-assets", "grpc", backend)
+		grpcProcess := backendPath(o.assetDir, backend)
 		// Check if the file exists
 		if _, err := os.Stat(grpcProcess); os.IsNotExist(err) {
 			return "", fmt.Errorf("grpc process not found: %s. some backends(stablediffusion, tts) require LocalAI compiled with GO_TAGS", grpcProcess)
@@ -243,7 +300,12 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {

 	// autoload also external backends
 	allBackendsToAutoLoad := []string{}
-	allBackendsToAutoLoad = append(allBackendsToAutoLoad, AutoLoadBackends...)
+	autoLoadBackends, err := backendsInAssetDir(o.assetDir)
+	if err != nil {
+		return nil, err
+	}
+	log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends)
+	allBackendsToAutoLoad = append(allBackendsToAutoLoad, autoLoadBackends...)
 	for _, b := range o.externalBackends {
 		allBackendsToAutoLoad = append(allBackendsToAutoLoad, b)
 	}
@@ -271,10 +333,10 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
 			log.Info().Msgf("[%s] Loads OK", b)
 			return model, nil
 		} else if modelerr != nil {
-			err = multierror.Append(err, modelerr)
+			err = errors.Join(err, modelerr)
 			log.Info().Msgf("[%s] Fails: %s", b, modelerr.Error())
 		} else if model == nil {
-			err = multierror.Append(err, fmt.Errorf("backend returned no usable model"))
+			err = errors.Join(err, fmt.Errorf("backend returned no usable model"))
 			log.Info().Msgf("[%s] Fails: %s", b, "backend returned no usable model")
 		}
 	}
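
The net effect of backendsInAssetDir is that whatever binaries were extracted into backend-assets/grpc get auto-loaded, with the llama-cpp variants tried first and huggingface tried last (bert-embeddings just before it). A simplified, self-contained sketch of that ordering follows; it uses a stable sort instead of the in-place slice moves above, so it is an illustration of the behaviour rather than the commit's exact code:

// Simplified sketch of the ordering applied in backendsInAssetDir: backends
// discovered on disk are re-ranked so llama-cpp (and its fallback) come first
// and bert-embeddings/huggingface come last. Backend names mirror the
// constants in pkg/model/initializers.go.
package main

import (
	"fmt"
	"slices"
)

func orderBackends(found []string) []string {
	first := []string{"llama-cpp", "llama-cpp-fallback", "llama-ggml", "gpt4all"}
	last := []string{"bert-embeddings", "huggingface"} // huggingface is tried last of all

	rank := func(b string) int {
		if i := slices.Index(first, b); i >= 0 {
			return i // highest priority, in the order listed
		}
		if i := slices.Index(last, b); i >= 0 {
			return len(first) + 1 + i // pushed to the end
		}
		return len(first) // everything else keeps a middle rank
	}
	slices.SortStableFunc(found, func(a, b string) int { return rank(a) - rank(b) })
	return found
}

func main() {
	// Example listing of backend-assets/grpc (alphabetical, as os.ReadDir returns it).
	found := []string{"bert-embeddings", "gpt4all", "huggingface", "llama-cpp", "llama-cpp-fallback", "piper", "whisper"}
	fmt.Println(orderBackends(found))
	// Output: [llama-cpp llama-cpp-fallback gpt4all piper whisper bert-embeddings huggingface]
}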
