Commit e7c8a2e

Merge branch 'master' into xsn/mtmd_ultravox
2 parents: 7033aa1 + 2aa777d


56 files changed (+2333, -1408 lines)

.devops/musa.Dockerfile

Lines changed: 4 additions & 11 deletions
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
+ARG MUSA_VERSION=rc4.0.1
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

@@ -21,21 +21,14 @@ RUN apt-get update && \
     libcurl4-openssl-dev \
     libgomp1

-COPY requirements.txt requirements.txt
-COPY requirements requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
 WORKDIR /app

 COPY . .

-# Use the default MUSA archs if not specified
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -351,7 +351,7 @@ jobs:

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04

    steps:
      - name: Clone

README.md

Lines changed: 2 additions & 2 deletions
@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

@@ -237,7 +237,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |

ci/README.md

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
 ```

 Inside the container, execute the following commands:

common/arg.cpp

Lines changed: 9 additions & 8 deletions
@@ -1445,6 +1445,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1670,7 +1678,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -2057,13 +2065,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",

common/common.cpp

Lines changed: 4 additions & 75 deletions
@@ -1102,6 +1102,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
     }

+    mparams.progress_callback = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
     return mparams;
 }

@@ -1133,6 +1136,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;

     if (params.reranking) {
         cparams.embeddings = true;
@@ -1325,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }

-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //

common/common.h

Lines changed: 6 additions & 11 deletions
@@ -323,13 +323,13 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
@@ -428,6 +428,11 @@

     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };

 // call once at the start of a program if it uses libcommon
@@ -616,16 +621,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);

-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
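
The new fields reuse the existing `llama_progress_callback` type from llama.h (`bool (*)(float progress, void * user_data)`). A minimal sketch of how a program built on libcommon might report load progress and allow cancellation; the callback name and print format are illustrative, not part of this commit:

```cpp
#include <cstdio>

#include "common.h"

// Sketch: print model-load progress; returning false would abort the load.
static bool print_load_progress(float progress, void * /*user_data*/) {
    std::fprintf(stderr, "\rloading model: %3.0f%%", progress * 100.0f);
    return true;
}

// ...then, before calling common_init_from_params(params):
//     params.load_progress_callback           = print_load_progress;
//     params.load_progress_callback_user_data = nullptr;
```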

docs/backend/CANN.md

Lines changed: 74 additions & 52 deletions
@@ -56,60 +56,82 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

 ## Model Supports

-| Model Name | FP16 | Q8_0 | Q4_0 |
+| Model Name | FP16 | Q4_0 | Q8_0 |
 |:----------------------------|:-----:|:----:|:----:|
-| AquilaChat2-7B | √ | √ | √ |
-| Baichuan-7b | √ | √ | √ |
-| Baichuan2-7B-Chat | √ | √ | √ |
-| bitnet_b1_58-large | √ | √ | √ |
-| bloom-560m | √ | x | √ |
-| bloomz-alpaca-560m | √ | x | √ |
-| c4ai-command-r-35B-v01 | x | x | x |
-| chatglm3-6B | x | x | x |
-| chinese-alpaca-2-1.3b | √ | √ | √ |
-| CodeShell-7B | √ | √ | √ |
-| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
-| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
-| deepseek-coder-6.7B-instruct | x | x | x |
-| DeepSeek-V2-Lite-64x1.5B | x | x | x |
-| falcon-7b-instruct | √ | √ | √ |
-| flan-t5-large | √ | √ | √ |
-| gemma-2-9b-it | √ | √ | √ |
-| glm-4-9B | x | x | x |
-| gpt2 | √ | √ | √ |
-| Gpt2-163M | √ | √ | √ |
-| granite-3B-code-instruct | √ | √ | √ |
+| Llama-2 | √ | √ | √ |
+| Llama-3 | √ | √ | √ |
+| Mistral-7B | √ | √ | √ |
+| Mistral MOE | √ | √ | √ |
+| DBRX | - | - | - |
+| Falcon | √ | √ | √ |
+| Chinese LLaMA/Alpaca | √ | √ | √ |
+| Vigogne(French) | √ | √ | √ |
+| BERT | x | x | x |
+| Koala | √ | √ | √ |
+| Baichuan | √ | √ | √ |
+| Aquila 1 & 2 | √ | √ | √ |
+| Starcoder models | √ | √ | √ |
+| Refact | √ | √ | √ |
+| MPT | √ | √ | √ |
+| Bloom | √ | √ | √ |
+| Yi models | √ | √ | √ |
+| stablelm models | √ | √ | √ |
+| DeepSeek models | x | x | x |
+| Qwen models | √ | √ | √ |
+| PLaMo-13B | √ | √ | √ |
+| Phi models | √ | √ | √ |
+| PhiMoE | √ | √ | √ |
+| GPT-2 | √ | √ | √ |
+| Orion | √ | √ | √ |
+| InternlLM2 | √ | √ | √ |
+| CodeShell | √ | √ | √ |
+| Gemma | √ | √ | √ |
+| Mamba | √ | √ | √ |
+| Xverse | √ | √ | √ |
+| command-r models | √ | √ | √ |
+| Grok-1 | - | - | - |
+| SEA-LION | √ | √ | √ |
 | GritLM-7B | √ | √ | √ |
-| internlm2_5-7b-chat | √ | √ | √ |
-| koala-7B-HF | √ | √ | √ |
-| Llama-2-7b-chat-hf | √ | √ | √ |
-| Llama-3-Smaug-8B | √ | √ | √ |
-| Llama2-Chinese-7b-Chat | √ | √ | √ |
-| Llama3-8B | √ | √ | √ |
-| Llama3-8b-chinese | √ | √ | √ |
-| mamba-130m-hf | √ | √ | √ |
-| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
-| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
-| mpt-7B | √ | √ | √ |
-| OLMo-1B-hf | √ | √ | √ |
-| OpenELM-3B-Instruct | √ | √ | √ |
-| Orion-14b-base | √ | √ | √ |
-| phi1 | x | x | x |
-| phi2 | x | x | x |
-| Phi-3-mini-4k-instruct | √ | √ | √ |
-| plamo-13b | √ | √ | √ |
-| pythia-70M | x | x | x |
-| Qwen-7B | √ | √ | √ |
-| Qwen2-1.5B-Instruct | √ | x | √ |
-| Refact-1_6B-fim | √ | √ | √ |
-| SmolLM-135M | √ | √ | √ |
-| stablelm-zephyr | x | x | x |
-| stablelm-2-zephyr-1_6b | x | x | x |
-| starcoderbase-1b | √ | √ | √ |
-| starcoder2-3b | √ | √ | √ |
-| vigogne-7b-chat | √ | √ | √ |
-| xverse-7b-chat | √ | √ | √ |
-| Yi-6b-Chat | √ | √ | √ |
+| OLMo | √ | √ | √ |
+| OLMo 2 | √ | √ | √ |
+| OLMoE | √ | √ | √ |
+| Granite models | √ | √ | √ |
+| GPT-NeoX | √ | √ | √ |
+| Pythia | √ | √ | √ |
+| Snowflake-Arctic MoE | - | - | - |
+| Smaug | √ | √ | √ |
+| Poro 34B | √ | √ | √ |
+| Bitnet b1.58 models | √ | x | x |
+| Flan-T5 | √ | √ | √ |
+| Open Elm models | x | √ | √ |
+| chatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b | √ | √ | √ |
+| GLM-4-0414 | √ | √ | √ |
+| SmolLM | √ | √ | √ |
+| EXAONE-3.0-7.8B-Instruct | √ | √ | √ |
+| FalconMamba Models | √ | √ | √ |
+| Jais Models | - | x | x |
+| Bielik-11B-v2.3 | √ | √ | √ |
+| RWKV-6 | - | √ | √ |
+| QRWKV-6 | √ | √ | √ |
+| GigaChat-20B-A3B | x | x | x |
+| Trillion-7B-preview | √ | √ | √ |
+| Ling models | √ | √ | √ |
+
+
+**Multimodal**
+| Model Name | FP16 | Q4_0 | Q8_0 |
+|:----------------------------|:-----:|:----:|:----:|
+| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
+| BakLLaVA | √ | √ | √ |
+| Obsidian | √ | - | - |
+| ShareGPT4V | x | - | - |
+| MobileVLM 1.7B/3B models | - | - | - |
+| Yi-VL | - | - | - |
+| Mini CPM | √ | √ | √ |
+| Moondream | √ | √ | √ |
+| Bunny | √ | - | - |
+| GLM-EDGE | √ | √ | √ |
+| Qwen2-VL | √ | √ | √ |
