Commit 5dd2fbe

Merge 'origin/master' into hipblas
2 parents: 67e229b + 20568fe

File tree: 12 files changed, +1281 −259 lines


CMakeLists.txt

Lines changed: 16 additions & 2 deletions
@@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)

@@ -239,6 +240,9 @@ if (LLAMA_CUBLAS)
     add_compile_definitions(GGML_USE_CUBLAS)
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    if (LLAMA_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_DMMV_F16)
+    endif()
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

     if (LLAMA_STATIC)

@@ -497,6 +501,7 @@ add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
+    target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 endif()

 add_library(llama

@@ -522,9 +527,18 @@ endif()

 if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native")
     set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
+
+    set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native")
+    set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+
+    if (BUILD_SHARED_LIBS)
+        set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native")
+        set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+    endif()
+
+    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native")
 endif()
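
The new `LLAMA_CUDA_DMMV_F16` option is passed at configure time; a minimal sketch, assuming a cuBLAS build (the build directory name is illustrative, the flags come from the options above):

```sh
# Sketch: configure a cuBLAS build with the new half-precision dmmv kernels enabled.
mkdir build && cd build
cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_DMMV_F16=ON   # defines GGML_CUDA_DMMV_F16, per the hunk above
cmake --build . --config Release
```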

Makefile

Lines changed: 4 additions & 1 deletion
@@ -169,6 +169,9 @@ ifdef LLAMA_CUDA_DMMV_Y
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
+ifdef LLAMA_CUDA_DMMV_F16
+	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
+endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else

@@ -270,7 +273,7 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC) $(CFLAGS) -c $< -o $@

-llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h
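
The same flag is exposed to the Makefile build; a minimal sketch, assuming `LLAMA_CUDA_DMMV_F16=1` is combined with the existing `LLAMA_CUBLAS=1` switch for the CUDA code path (not shown in this diff):

```sh
# Sketch: rebuild with the new flag; it simply appends -DGGML_CUDA_DMMV_F16 to NVCCFLAGS.
make clean
make LLAMA_CUBLAS=1 LLAMA_CUDA_DMMV_F16=1
```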

README.md

Lines changed: 14 additions & 2 deletions
@@ -337,7 +337,14 @@ Building the program with BLAS support may lead to some performance improvements
 cmake --build . --config Release
 ```

-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+
+| Option                  | Legal values           | Default | Description |
+|-------------------------|------------------------|---------|-------------|
+| LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+| LLAMA_CUDA_DMMV_Y       | Positive integer       | 1       | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_DMMV_F16     | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
+| LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

 - #### CLBlast

@@ -617,7 +624,12 @@ And after 4.45 hours, you will have the final perplexity.

 #### Building the Project using Android NDK
 You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
-First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
+
+First, install the essential packages for termux:
+```
+pkg install clang wget git cmake
+```
+Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
 $ mkdir build-android
 $ cd build-android
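
As a brief illustration of the `CUDA_VISIBLE_DEVICES` note above (a sketch only: the binary name `./main`, the model path, and the prompt are placeholders, not part of this commit):

```sh
# Sketch: restrict inference to GPU 0; adjust the binary and model path to your setup.
CUDA_VISIBLE_DEVICES=0 ./main -m ./models/7B/ggml-model-q4_0.bin -p "Hello"
```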

examples/metal/metal.cpp

Lines changed: 4 additions & 2 deletions
@@ -40,8 +40,10 @@ int main(int argc, char ** argv) {
     // this allocates all Metal resources and memory buffers
     auto * ctx_metal = ggml_metal_init();

-    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
-    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
+    const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
+    const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
+    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
+    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);

     // main
     {

examples/server/README.md

Lines changed: 11 additions & 2 deletions
@@ -21,6 +21,7 @@ Command line options:
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
 - `--port`: Set the port to listen. Default: `8080`.
+- `--embedding`: Enable embedding extraction. Default: disabled.

 ## Build

@@ -119,14 +120,14 @@ node .

 `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).

-`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity).
+`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated, but the prompt is evaluated into the cache. (default: 128, -1 = infinity).

 `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
 By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

 `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

-`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate.
+`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, so only the remaining part will be evaluated. A space is inserted at the front, as main.cpp does.

 `stop`: Specify a JSON array of stopping strings.
 These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).

@@ -163,6 +164,14 @@ node .

 `content`: Set the text to tokenize.

+Note that the special `BOS` token is not added in front of the text, and a space character is not inserted automatically as it is for `/completion`.
+
+- **POST** `/embedding`: Generate the embedding of a given text, just as [the embedding example](../embedding) does.
+
+*Options:*
+
+`content`: Set the text to process.
+
 ## More examples

 ### Interactive mode
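
A hedged usage sketch of the new `--embedding` flag and `/embedding` route documented above (the server binary name and model path are placeholders; the host, port, and JSON fields come from this README and the server code):

```sh
# Sketch: start the server with embedding extraction enabled (model path is illustrative).
./server -m ./models/7B/ggml-model-q4_0.bin --embedding &

# Request an embedding; the response is expected to be of the form {"embedding": [ ... ]}.
curl --request POST http://127.0.0.1:8080/embedding \
     --header "Content-Type: application/json" \
     --data '{"content": "Hello, world."}'
```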

examples/server/server.cpp

Lines changed: 43 additions & 1 deletion
@@ -254,6 +254,11 @@ struct llama_server_context {
             n_past += n_eval;
         }

+        if (params.n_predict == 0) {
+            has_next_token = false;
+            return llama_token_eos();
+        }
+
         // out of user input, sample next token
         const float temp = params.temp;
         const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;

@@ -419,6 +424,19 @@ struct llama_server_context {

         return token_text;
     }
+
+    std::vector<float> getEmbedding() {
+        static const int n_embd = llama_n_embd(ctx);
+        if (!params.embedding) {
+            LOG_WARNING("embedding disabled", {
+                { "params.embedding", params.embedding },
+            });
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        const float * data = llama_get_embeddings(ctx);
+        std::vector<float> embedding(data, data + n_embd);
+        return embedding;
+    }
 };

 static void server_print_usage(const char * argv0, const gpt_params & params,

@@ -457,6 +475,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params,
     fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
     fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
     fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
     fprintf(stderr, "\n");
 }

@@ -603,6 +622,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             params.use_mlock = true;
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
+        } else if (arg == "--embedding") {
+            params.embedding = true;
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             server_print_usage(argv[0], default_params, default_sparams);

@@ -646,6 +667,12 @@ static json format_generation_settings(llama_server_context & llama) {
     };
 }

+static json format_embedding_response(llama_server_context & llama) {
+    return json {
+        { "embedding", llama.getEmbedding() },
+    };
+}
+
 static json format_final_response(llama_server_context & llama, const std::string & content) {
     return json {
         { "content", content },

@@ -881,12 +908,27 @@ int main(int argc, char ** argv) {

     svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
         const json body = json::parse(req.body);
-        const std::string content = body["content"].get<std::string>();
+        const std::string content = body.value("content", "");
         const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
         const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json");
     });

+    svr.Post("/embedding", [&llama](const Request & req, Response & res) {
+        const json body = json::parse(req.body);
+
+        llama.rewind();
+        llama_reset_timings(llama.ctx);
+        llama.params.prompt = body.value("content", "");
+        llama.params.n_predict = 0;
+        llama.loadPrompt();
+        llama.beginCompletion();
+        llama.doCompletion();
+
+        const json data = format_embedding_response(llama);
+        return res.set_content(data.dump(), "application/json");
+    });
+
     svr.set_logger(log_server_request);

     svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
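
The `params.n_predict == 0` early return added above backs the documented `n_predict: 0` behaviour (the prompt is evaluated into the cache, no tokens are generated), and the `/embedding` handler reuses it. A hedged sketch of triggering that path through the existing `/completion` route (endpoint and field names per the server README; host and port are the documented defaults):

```sh
# Sketch: evaluate a prompt into the server's cache without generating any tokens.
curl --request POST http://127.0.0.1:8080/completion \
     --header "Content-Type: application/json" \
     --data '{"prompt": "Once upon a time", "n_predict": 0}'
```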
