Commit c89271b

feat(llama.cpp): add distributed llama.cpp inferencing (#2324)
* feat(llama.cpp): support distributed llama.cpp
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* feat: let tweak how chat messages are merged together
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* refactor
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* Makefile: register to ALL_GRPC_BACKENDS
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* refactoring, allow disable auto-detection of backends
  Signed-off-by: Ettore Di Giacinto <[email protected]>
* minor fixups
  Signed-off-by: mudler <[email protected]>
* feat: add cmd to start rpc-server from llama.cpp
  Signed-off-by: mudler <[email protected]>
* ci: add ccache
  Signed-off-by: mudler <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
Signed-off-by: mudler <[email protected]>
1 parent 2990966 commit c89271b

11 files changed (+220, -80 lines)


.env (+5)

@@ -71,6 +71,11 @@
 ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
 # LLAMACPP_PARALLEL=1
 
+### Define a list of GRPC Servers for llama-cpp workers to distribute the load
+# https://github.com/ggerganov/llama.cpp/pull/6829
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
+# LLAMACPP_GRPC_SERVERS=""
+
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true
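Note: going by the linked llama.cpp RPC pull request and README, the value is expected to be a comma-separated list of host:port pairs pointing at running rpc-server instances, for example LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052". The addresses here are purely illustrative; 50052 is the port used in the llama.cpp RPC example.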

.github/workflows/release.yaml (+2, -2)

@@ -29,7 +29,7 @@ jobs:
       - name: Dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential ffmpeg protobuf-compiler
+          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
       - name: Install CUDA Dependencies
         run: |
           curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -86,7 +86,7 @@ jobs:
           cache: false
       - name: Dependencies
         run: |
-          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
           go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
           go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
       - name: Build stablediffusion

Dockerfile (+1)

@@ -19,6 +19,7 @@ ARG GO_TAGS="stablediffusion tinydream tts"
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
         build-essential \
+        ccache \
         ca-certificates \
         cmake \
         curl \

Makefile (+15, -2)

@@ -5,7 +5,7 @@ BINARY_NAME=local-ai
 
 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=dc685be46622a8fabfd57cfa804237c8f15679b8
+CPPLLAMA_VERSION?=4f0263633b40e94e8b69fd6e7e4395cfedfd5c12
 
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -158,6 +158,8 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
+ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -314,7 +316,7 @@ build: prepare backend-assets grpcs ## Build the project
     CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
 
 build-minimal:
-    BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build
+    BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build
 
 build-api:
     BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
@@ -691,6 +693,17 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
     CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
     cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 
+backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
+    cp -rf backend/cpp/llama backend/cpp/llama-grpc
+    $(MAKE) -C backend/cpp/llama-grpc purge
+    $(info ${GREEN}I llama-cpp build info:grpc${RESET})
+    CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
+    cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc
+
+backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
+    mkdir -p backend-assets/util/
+    cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
+
 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
     CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
     $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
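In short, two new artifacts are produced here: backend-assets/grpc/llama-cpp-grpc, a llama.cpp backend variant compiled with -DLLAMA_RPC=ON that can offload work to remote RPC servers, and backend-assets/util/llama-cpp-rpc-server, the upstream llama.cpp rpc-server binary bundled so that the new CLI command (see core/cli/llamacppworker.go below) can exec it on worker nodes.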

backend/cpp/llama/grpc-server.cpp (+6)

@@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request,
     } else {
       params.n_parallel = 1;
     }
+
+    const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
+    if (llama_grpc_servers != NULL) {
+      params.rpc_servers = std::string(llama_grpc_servers);
+    }
+
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
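This is where the LLAMACPP_GRPC_SERVERS value from the environment reaches llama.cpp: params.rpc_servers appears to be the same field that upstream's --rpc flag populates (per the RPC support added in ggerganov/llama.cpp#6829), so the gRPC backend simply forwards the comma-separated server list to llama.cpp's RPC backend.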

core/cli/cli.go (+5, -4)

@@ -13,8 +13,9 @@ type Context struct {
 var CLI struct {
     Context `embed:""`
 
-    Run        RunCMD        `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
-    Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
-    TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
-    Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
+    Run            RunCMD            `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+    Models         ModelsCMD         `cmd:"" help:"Manage LocalAI models and definitions"`
+    TTS            TTSCMD            `cmd:"" help:"Convert text to speech"`
+    Transcript     TranscriptCMD     `cmd:"" help:"Convert audio to text"`
+    LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
 }

core/cli/llamacppworker.go (+37, new file)

@@ -0,0 +1,37 @@
+package cli
+
+import (
+    "os"
+    "syscall"
+
+    "github.com/go-skynet/LocalAI/pkg/assets"
+    "github.com/rs/zerolog/log"
+)
+
+type LLAMACPPWorkerCMD struct {
+    Args              []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"`
+    BackendAssetsPath string   `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
+}
+
+func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error {
+    // Extract files from the embedded FS
+    err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
+    log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
+    if err != nil {
+        log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
+    }
+
+    return syscall.Exec(
+        assets.ResolvePath(
+            r.BackendAssetsPath,
+            "util",
+            "llama-cpp-rpc-server",
+        ),
+        append([]string{
+            assets.ResolvePath(
+                r.BackendAssetsPath,
+                "util",
+                "llama-cpp-rpc-server",
+            )}, r.Args...),
+        os.Environ())
+}
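With this command registered, a worker node can be started with something like "local-ai llamacpp-worker <host> <port>"; the exact subcommand spelling is derived by kong from the LLAMACPPWorker field name, so treat it as an assumption rather than documented syntax. The process extracts the embedded backend assets and then re-execs the bundled llama-cpp-rpc-server binary with the given arguments, and the main LocalAI instance reaches it through the addresses listed in LLAMACPP_GRPC_SERVERS.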

core/config/backend_config.go (+27, -6)

@@ -93,6 +93,8 @@ type Diffusers struct {
     ControlNet string `yaml:"control_net"`
 }
 
+// LLMConfig is a struct that holds the configuration that are
+// generic for most of the LLM backends.
 type LLMConfig struct {
     SystemPrompt string `yaml:"system_prompt"`
     TensorSplit  string `yaml:"tensor_split"`
@@ -144,20 +146,39 @@ type LLMConfig struct {
     YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
 }
 
+// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
     ModelBaseName    string `yaml:"model_base_name"`
     Device           string `yaml:"device"`
     Triton           bool   `yaml:"triton"`
     UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
 }
 
+// TemplateConfig is a struct that holds the configuration of the templating system
 type TemplateConfig struct {
-    Chat                 string `yaml:"chat"`
-    ChatMessage          string `yaml:"chat_message"`
-    Completion           string `yaml:"completion"`
-    Edit                 string `yaml:"edit"`
-    Functions            string `yaml:"function"`
-    UseTokenizerTemplate bool   `yaml:"use_tokenizer_template"`
+    // Chat is the template used in the chat completion endpoint
+    Chat string `yaml:"chat"`
+
+    // ChatMessage is the template used for chat messages
+    ChatMessage string `yaml:"chat_message"`
+
+    // Completion is the template used for completion requests
+    Completion string `yaml:"completion"`
+
+    // Edit is the template used for edit completion requests
+    Edit string `yaml:"edit"`
+
+    // Functions is the template used when tools are present in the client requests
+    Functions string `yaml:"function"`
+
+    // UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
+    // Note: this is mostly consumed for backends such as vllm and transformers
+    // that can use the tokenizers specified in the JSON config files of the models
+    UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
+
+    // JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
+    // It defaults to \n
+    JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
 }
 
 func (c *BackendConfig) SetFunctionCallString(s string) {
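For model authors, the practical effect is a new optional key in the model definition, presumably nested under the template section of the YAML alongside chat, completion, and the other template fields, for example join_chat_messages_by_character: ", ". When the key is omitted, the chat endpoint keeps joining rendered messages with a newline, as shown in the chat.go change below.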

core/http/endpoints/openai/chat.go (+6, -1)

@@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
         mess = append(mess, content)
     }
 
-    predInput = strings.Join(mess, "\n")
+    joinCharacter := "\n"
+    if config.TemplateConfig.JoinChatMessagesByCharacter != nil {
+        joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter
+    }
+
+    predInput = strings.Join(mess, joinCharacter)
     log.Debug().Msgf("Prompt (before templating): %s", predInput)
 
     templateFile := ""
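As a standalone illustration of the new joining behavior (a minimal sketch, not LocalAI code: the joinMessages helper and the sample messages are made up for the example), the pointer type lets an explicitly configured empty string be distinguished from "not set":

package main

import (
    "fmt"
    "strings"
)

// joinMessages mirrors the logic added to ChatEndpoint: a nil pointer means
// "not configured", which falls back to the historical "\n" separator, while
// a non-nil pointer (even one pointing at "") is honored as-is.
func joinMessages(mess []string, joinChatMessagesByCharacter *string) string {
    joinCharacter := "\n"
    if joinChatMessagesByCharacter != nil {
        joinCharacter = *joinChatMessagesByCharacter
    }
    return strings.Join(mess, joinCharacter)
}

func main() {
    mess := []string{"<|user|>Hello", "<|assistant|>Hi!"} // sample rendered messages, made up
    empty := ""
    fmt.Printf("%q\n", joinMessages(mess, nil))    // "<|user|>Hello\n<|assistant|>Hi!"
    fmt.Printf("%q\n", joinMessages(mess, &empty)) // "<|user|>Hello<|assistant|>Hi!"
}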

pkg/assets/extract.go (+5, -1)

@@ -8,6 +8,10 @@ import (
     "path/filepath"
 )
 
+func ResolvePath(dir string, paths ...string) string {
+    return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...)
+}
+
 func ExtractFiles(content embed.FS, extractDir string) error {
     // Create the target directory if it doesn't exist
     err := os.MkdirAll(extractDir, 0750)
@@ -39,7 +43,7 @@ func ExtractFiles(content embed.FS, extractDir string) error {
     }
 
     // Create the file in the target directory
-    err = os.WriteFile(targetFile, fileData, 0600)
+    err = os.WriteFile(targetFile, fileData, 0700)
     if err != nil {
         return fmt.Errorf("failed to write file: %v", err)
     }
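The permission change from 0600 to 0700 is presumably needed because the extracted assets now include the llama-cpp-rpc-server binary, which must be executable for syscall.Exec in the worker command to run it. For reference, a small self-contained sketch of what the new helper computes (the join logic is copied from the diff; the example path is just the CLI default):

package main

import (
    "fmt"
    "path/filepath"
)

// resolvePath mirrors assets.ResolvePath from the diff: it joins the
// extraction directory, the fixed "backend-assets" prefix, and any further
// path elements.
func resolvePath(dir string, paths ...string) string {
    return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...)
}

func main() {
    fmt.Println(resolvePath("/tmp/localai/backend_data", "util", "llama-cpp-rpc-server"))
    // Output: /tmp/localai/backend_data/backend-assets/util/llama-cpp-rpc-server
}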
