
Commit cb6d996

Merge branch 'master' into compilade/bitnet-ternary
2 parents: 35cc556 + 11b84eb


77 files changed: +4690 additions, -2221 deletions

.devops/llama-cli-cann.Dockerfile

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+
+FROM cosdt/cann:$ASCEND_VERSION AS build
+
+WORKDIR /app
+
+COPY . .
+
+RUN yum install -y gcc g++ cmake make
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+# find libascend_hal.so, because the driver hasn't been mounted
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+
+RUN echo "Building with static libs" && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
+    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
+    cmake --build build --config Release --target llama-cli
+
+# TODO: use image with NNRT
+FROM cosdt/cann:$ASCEND_VERSION AS runtime
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+ENTRYPOINT [ "/llama-cli" ]

.github/workflows/bench.yml renamed to .github/workflows/bench.yml.disabled

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+# TODO: there have been some issues with the workflow, so disabling for now
+# https://github.com/ggerganov/llama.cpp/issues/7893
+#
 # Benchmark
 name: Benchmark

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -129,3 +129,6 @@ poetry.toml
 
 # Scripts
 !/scripts/install-oneapi.bat
+
+# Test models for lora adapters
+/lora-tests

CMakePresets.json

Lines changed: 4 additions & 1 deletion
@@ -28,6 +28,7 @@
     { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
     { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
 
     {
         "name": "arm64-windows-msvc", "hidden": true,
@@ -60,6 +61,8 @@
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
 
     { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
-    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
+    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
   ]
 }

Makefile

Lines changed: 4 additions & 0 deletions
@@ -763,6 +763,10 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
     MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
 endif
 
+ifdef GGML_VULKAN_PERF
+    MK_CPPFLAGS += -DGGML_VULKAN_PERF
+endif
+
 ifdef GGML_VULKAN_VALIDATE
     MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
 endif
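
The new block only forwards the build flag to the preprocessor: invoking `make GGML_VULKAN_PERF=1` adds -DGGML_VULKAN_PERF to MK_CPPFLAGS, which the Vulkan backend can use to gate its performance instrumentation. As a hedged illustration of the kind of compile-time gate such a define typically enables (a generic sketch, not the actual ggml-vulkan code):

// Generic sketch of a compile-time perf gate, not the actual ggml-vulkan code.
// Building with `make GGML_VULKAN_PERF=1` would define GGML_VULKAN_PERF here
// via MK_CPPFLAGS.
#include <chrono>
#include <cstdio>

static void run_op() {
#ifdef GGML_VULKAN_PERF
    const auto t_start = std::chrono::steady_clock::now();
#endif

    // ... record, submit and wait on the GPU work here ...

#ifdef GGML_VULKAN_PERF
    const auto t_end = std::chrono::steady_clock::now();
    const auto us    = std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_start).count();
    std::fprintf(stderr, "run_op: %lld us\n", (long long) us);
#endif
}

int main() {
    run_op();
    return 0;
}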

README.md

Lines changed: 3 additions & 0 deletions
@@ -105,6 +105,8 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
+- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
+- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
@@ -424,6 +426,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |
+| [CANN](./docs/build.md#cann) | Ascend NPU |
 
 ## Tools
 
common/common.cpp

Lines changed: 91 additions & 15 deletions
@@ -77,6 +77,41 @@
 
 using json = nlohmann::ordered_json;
 
+//
+// Environment variable utils
+//
+
+template<typename T>
+static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::string(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stoi(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<std::is_floating_point<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stof(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<std::is_same<T, bool>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    if (value) {
+        std::string val(value);
+        target = val == "1" || val == "true";
+    }
+}
+
 //
 // CPU utils
 //
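
The new get_env helpers dispatch on the type of the target field via enable_if, so one call form covers string, integer, floating-point, and boolean members, and an unset variable always leaves the existing value in place. A minimal standalone sketch of how the dispatch resolves; the variable names below are illustrative stand-ins, not part of llama.cpp:

#include <cstdlib>
#include <iostream>
#include <string>
#include <type_traits>

// Two of the overloads from the diff above, reduced for brevity.
template<typename T>
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
get_env(std::string name, T & target) {
    char * value = std::getenv(name.c_str());
    target = value ? std::string(value) : target;   // unset -> keep current value
}

template<typename T>
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
get_env(std::string name, T & target) {
    char * value = std::getenv(name.c_str());
    target = value ? std::stoi(value) : target;
}

int main() {
    // hypothetical stand-ins for gpt_params fields
    std::string model = "default.gguf";
    int         n_ctx = 512;

    get_env("LLAMA_ARG_MODEL",    model);   // picks the std::string overload
    get_env("LLAMA_ARG_CTX_SIZE", n_ctx);   // picks the integral overload

    std::cout << model << " " << n_ctx << "\n";
    return 0;
}

Note that the integer and float overloads go through std::stoi/std::stof, which throw on malformed input, so a bad LLAMA_ARG_* value surfaces as an exception rather than a silent fallback; the bool overload treats only "1" and "true" as true.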
@@ -110,8 +145,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -194,12 +255,6 @@ int32_t cpu_get_num_math() {
 // CLI argument parsing
 //
 
-void gpt_params_handle_hf_token(gpt_params & params) {
-    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
-        params.hf_token = std::getenv("HF_TOKEN");
-    }
-}
-
 void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
@@ -247,7 +302,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 
     gpt_params_handle_model_default(params);
 
-    gpt_params_handle_hf_token(params);
+    if (params.hf_token.empty()) {
+        get_env("HF_TOKEN", params.hf_token);
+    }
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -267,6 +324,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
+void gpt_params_parse_from_env(gpt_params & params) {
+    // we only care about server-related params for now
+    get_env("LLAMA_ARG_MODEL", params.model);
+    get_env("LLAMA_ARG_THREADS", params.n_threads);
+    get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
+    get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
+    get_env("LLAMA_ARG_BATCH", params.n_batch);
+    get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
+    get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
+    get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
+    get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
+    get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
+    get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
+    get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
+    get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
+    get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
+    get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
+}
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     const auto params_org = params; // the example can modify the default params
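
Since get_env only assigns when a variable is actually set, gpt_params_parse_from_env acts as a layer of overrides on top of whatever is already in params. A hedged sketch of how a caller might combine it with the regular CLI parser; the main() below is illustrative and not copied from the server code in this commit:

// Illustrative only: assumes this file is built alongside common/common.h
// in the llama.cpp tree, as the examples are.
#include "common.h"

int main(int argc, char ** argv) {
    gpt_params params;

    // regular CLI parsing first
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // then pick up any LLAMA_ARG_* variables that are set, e.g.
    //   LLAMA_ARG_MODEL=/models/foo.gguf LLAMA_ARG_CTX_SIZE=8192 ./app
    // unset variables leave the corresponding fields untouched
    gpt_params_parse_from_env(params);

    // ... continue with model initialization as usual ...
    return 0;
}

Whether CLI flags or environment variables win simply depends on which is applied last, so the ordering above is a choice, not a requirement.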

@@ -1727,7 +1803,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif
 
     return os.str();
 }
@@ -2702,12 +2784,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }
 
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //

common/common.h

Lines changed: 1 addition & 5 deletions
@@ -267,7 +267,7 @@ struct gpt_params {
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };
 
-void gpt_params_handle_hf_token(gpt_params & params);
+void gpt_params_parse_from_env(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);
 
 bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@@ -380,10 +380,6 @@ std::string llama_detokenize(
     const std::vector<llama_token> & tokens,
     bool special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
