Commit c5a0176

Merge branch 'master' into xsn/private_batch_api

2 parents 30f1db9 + 960e726

36 files changed: +2550, -721 lines

.github/workflows/build.yml

Lines changed: 29 additions & 0 deletions
@@ -676,6 +676,35 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

+  macOS-latest-cmake-visionos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=visionOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
   macOS-latest-swift:
     runs-on: macos-latest

build-xcframework.sh

Lines changed: 4 additions & 4 deletions
@@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xros \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
     -S .
 cmake --build build-visionos --config Release -- -quiet

@@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xrsimulator \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
     -S .
 cmake --build build-visionos-sim --config Release -- -quiet

convert_hf_to_gguf.py

Lines changed: 39 additions & 23 deletions
@@ -180,7 +180,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
             if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
             else:
                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                  f"Missing tensors: {missing}\n"
@@ -528,6 +529,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
@@ -537,13 +540,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
                 if token in added_vocab:
                     # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
                     # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not tokenizer.added_tokens_decoder[i].normalized:
+                    if not added_tokens_decoder[i].normalized:
                         previous_token = token
                         token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
                         if previous_token != token:
                             logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

-                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
                         # NOTE: this was added for Gemma.
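
Note on the `added_tokens_decoder` change above: in Hugging Face tokenizers this attribute is a property, so hoisting it into a local before the vocab loop avoids re-materializing the mapping on every token. Below is a minimal, self-contained sketch of that pattern; `FakeTokenizer` and the counts are made up for illustration and are not part of the converter.

```python
# Illustrative only: a stand-in for a tokenizer whose `added_tokens_decoder`
# is a property that rebuilds its mapping on every access.
class FakeTokenizer:
    def __init__(self) -> None:
        self._added = {0: "<pad>", 1: "<bos>"}
        self.lookups = 0

    @property
    def added_tokens_decoder(self) -> dict[int, str]:
        self.lookups += 1              # each access pays the construction cost
        return dict(self._added)       # returns a fresh dict every time

tok = FakeTokenizer()

# before: the property is read on every iteration
for i in range(1000):
    _ = tok.added_tokens_decoder.get(i)
print(tok.lookups)   # 1000

# after: read once, reuse the cached mapping inside the loop
tok.lookups = 0
added_tokens_decoder = tok.added_tokens_decoder
for i in range(1000):
    _ = added_tokens_decoder.get(i)
print(tok.lookups)   # 1
```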
@@ -1099,13 +1102,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

         tensors.append((self.map_tensor_name(name), data_torch))

-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


@@ -1747,6 +1743,25 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("Mistral3ForConditionalGeneration")
+class Mistral3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    # we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = Model.load_hparams(kwargs["dir_model"])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        if "multi_modal_projector" in name or "vision_tower" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("DeciLMForCausalLM")
 class DeciModel(Model):
     model_arch = gguf.MODEL_ARCH.DECI
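
The new `Mistral3Model` above reuses `LlamaModel` but flattens the nested `text_config` into the root of `hparams` and drops vision-tower tensors. A small sketch of how that merge and filtering behave, using an invented config dict (the key names and values are illustrative, not taken from a real Mistral 3 checkpoint):

```python
# Illustrative config: the nested "text_config" is copied up to the root,
# mirroring `hparams = {**hparams, **hparams["text_config"]}` in the converter.
hparams = {
    "model_type": "mistral3",
    "text_config": {"hidden_size": 5120, "num_hidden_layers": 40},
    "vision_config": {"hidden_size": 1024},
}

if "text_config" in hparams:
    hparams = {**hparams, **hparams["text_config"]}

print(hparams["hidden_size"])        # 5120 -- now readable at the root level
print("text_config" in hparams)      # True -- the nested dict is kept, just copied up

# Vision tensors are skipped the same way modify_tensors() does it:
def keep(name: str) -> bool:
    name = name.replace("language_model.", "")
    return not ("multi_modal_projector" in name or "vision_tower" in name)

print(keep("language_model.model.layers.0.self_attn.q_proj.weight"))  # True
print(keep("vision_tower.encoder.layers.0.attn.qkv.weight"))          # False
```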
@@ -2404,10 +2419,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

         tensors.append((new_name, data_torch))

-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


@@ -2737,21 +2748,26 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)

+    _has_tok_embd = False
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

-        new_name = self.map_tensor_name(name)
-
-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)

-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            assert self.tensor_names is not None
+        new_name = self.map_tensor_name(name)

-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                # copy tok_embd.weight to output.weight
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True

-        return tensors
+        return [(new_name, data_torch)]


 @Model.register("InternLM2ForCausalLM")
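
This hunk, together with the two deletions earlier in the file, stops writing a duplicated `output.weight` into the GGUF and instead assumes the runtime ties it to the token embedding; when `output.weight` arrives before `token_embd.weight`, the converter drops `transformer.wte.weight` from the expected tensor names so the missing-tensor check still passes. A hedged, stand-alone sketch of that bookkeeping (the tensor names and ordering below are hypothetical):

```python
# Sketch of the tied-embedding bookkeeping, not the converter itself.
# expected:   tensor names promised by the weight map
# seen_order: names actually streamed from the shard files, in order
expected = {"transformer.wte.weight", "lm_head.weight", "transformer.h.0.attn.weight"}
seen_order = ["lm_head.weight", "transformer.h.0.attn.weight"]

has_tok_embd = False
for name in seen_order:
    is_output = name == "lm_head.weight"
    is_tok_embd = name == "transformer.wte.weight"
    if not has_tok_embd and is_output:
        # output arrived before the embedding: assume the weights are tied and
        # stop expecting a separate wte tensor, so the missing-tensor check passes
        expected.discard("transformer.wte.weight")
    elif is_tok_embd:
        has_tok_embd = True

missing = expected.difference(seen_order)
print(missing)   # set() -- wte is no longer reported as a missing tensor
```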

docs/backend/SYCL.md

Lines changed: 11 additions & 2 deletions
@@ -237,6 +237,15 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
 cmake --build buildWithCublas --config Release
 ```

+**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
+
+```sh
+git clone https://github.com/oneapi-src/oneDNN.git
+cd oneDNN
+cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake --build build-nvidia --config Release
+```
+
 - **Adding support to AMD GPUs**

   **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.

@@ -327,10 +336,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl

 # build all binary
 cmake --build build --config Release -j -v

examples/server/public/index.html.gz

90 Bytes
Binary file not shown.

examples/server/webui/src/components/ChatScreen.tsx

Lines changed: 53 additions & 25 deletions
@@ -99,13 +99,9 @@ export default function ChatScreen() {
     canvasData,
     replaceMessageAndGenerate,
   } = useAppContext();
-  const [inputMsg, setInputMsg] = useState(prefilledMsg.content());
-  const inputRef = useRef<HTMLTextAreaElement>(null);
+  const textarea = useOptimizedTextarea(prefilledMsg.content());

-  const { extraContext, clearExtraContext } = useVSCodeContext(
-    inputRef,
-    setInputMsg
-  );
+  const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
   // TODO: improve this when we have "upload file" feature
   const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;

@@ -135,9 +131,10 @@ export default function ChatScreen() {
   };

   const sendNewMessage = async () => {
-    if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return;
-    const lastInpMsg = inputMsg;
-    setInputMsg('');
+    const lastInpMsg = textarea.value();
+    if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? ''))
+      return;
+    textarea.setValue('');
     scrollToBottom(false);
     setCurrNodeId(-1);
     // get the last message node
@@ -146,13 +143,13 @@ export default function ChatScreen() {
       !(await sendMessage(
         currConvId,
         lastMsgNodeId,
-        inputMsg,
+        lastInpMsg,
         currExtra,
         onChunk
       ))
     ) {
       // restore the input message if failed
-      setInputMsg(lastInpMsg);
+      textarea.setValue(lastInpMsg);
     }
     // OK
     clearExtraContext();
@@ -195,16 +192,13 @@ export default function ChatScreen() {
       // send the prefilled message if needed
       sendNewMessage();
     } else {
-      // otherwise, focus on the input and move the cursor to the end
-      if (inputRef.current) {
-        inputRef.current.focus();
-        inputRef.current.selectionStart = inputRef.current.value.length;
-      }
+      // otherwise, focus on the input
+      textarea.focus();
     }
     prefilledMsg.clear();
     // no need to keep track of sendNewMessage
     // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [inputRef]);
+  }, [textarea.ref]);

   // due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
   const pendingMsgDisplay: MessageDisplay[] =
@@ -258,9 +252,7 @@ export default function ChatScreen() {
           <textarea
             className="textarea textarea-bordered w-full"
            placeholder="Type a message (Shift+Enter to add a new line)"
-            ref={inputRef}
-            value={inputMsg}
-            onChange={(e) => setInputMsg(e.target.value)}
+            ref={textarea.ref}
             onKeyDown={(e) => {
               if (e.nativeEvent.isComposing || e.keyCode === 229) return;
               if (e.key === 'Enter' && e.shiftKey) return;
@@ -280,11 +272,7 @@ export default function ChatScreen() {
             Stop
           </button>
         ) : (
-          <button
-            className="btn btn-primary ml-2"
-            onClick={sendNewMessage}
-            disabled={inputMsg.trim().length === 0}
-          >
+          <button className="btn btn-primary ml-2" onClick={sendNewMessage}>
             Send
           </button>
         )}
@@ -298,3 +286,43 @@ export default function ChatScreen() {
     </div>
   );
 }
+
+export interface OptimizedTextareaValue {
+  value: () => string;
+  setValue: (value: string) => void;
+  focus: () => void;
+  ref: React.RefObject<HTMLTextAreaElement>;
+}
+
+// This is a workaround to prevent the textarea from re-rendering when the inner content changes
+// See https://github.com/ggml-org/llama.cpp/pull/12299
+function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
+  const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
+  const textareaRef = useRef<HTMLTextAreaElement>(null);
+
+  useEffect(() => {
+    if (textareaRef.current && savedInitValue) {
+      textareaRef.current.value = savedInitValue;
+      setSavedInitValue('');
+    }
+  }, [textareaRef, savedInitValue, setSavedInitValue]);
+
+  return {
+    value: () => {
+      return textareaRef.current?.value ?? savedInitValue;
+    },
+    setValue: (value: string) => {
+      if (textareaRef.current) {
+        textareaRef.current.value = value;
+      }
+    },
+    focus: () => {
+      if (textareaRef.current) {
+        // focus and move the cursor to the end
+        textareaRef.current.focus();
+        textareaRef.current.selectionStart = textareaRef.current.value.length;
+      }
+    },
+    ref: textareaRef,
+  };
+}

examples/server/webui/src/utils/llama-vscode.ts

Lines changed: 5 additions & 7 deletions
@@ -1,5 +1,6 @@
 import { useEffect, useState } from 'react';
 import { MessageExtraContext } from './types';
+import { OptimizedTextareaValue } from '../components/ChatScreen';

 // Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
 // Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -14,10 +15,7 @@ interface SetTextEvData {
  * window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
  */

-export const useVSCodeContext = (
-  inputRef: React.RefObject<HTMLTextAreaElement>,
-  setInputMsg: (text: string) => void
-) => {
+export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
   const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
     null
   );
@@ -27,20 +25,20 @@ export const useVSCodeContext = (
     const handleMessage = (event: MessageEvent) => {
       if (event.data?.command === 'setText') {
         const data: SetTextEvData = event.data;
-        setInputMsg(data?.text);
+        textarea.setValue(data?.text);
         if (data?.context && data.context.length > 0) {
           setExtraContext({
             type: 'context',
             content: data.context,
           });
         }
-        inputRef.current?.focus();
+        textarea.focus();
       }
     };

     window.addEventListener('message', handleMessage);
     return () => window.removeEventListener('message', handleMessage);
-  }, [inputRef, setInputMsg]);
+  }, [textarea]);

   // Add a keydown listener that sends the "escapePressed" message to the parent window
   useEffect(() => {

examples/speculative/speculative.cpp

Lines changed: 1 addition & 1 deletion
@@ -333,7 +333,7 @@ int main(int argc, char ** argv) {
             }

             active_seqs.erase(s);
-            for(int i = 0; i < n_seq_dft; i++) {
+            for (int i = 0; i < n_seq_dft; i++) {
                 if (i == s) {
                     continue;
                 }