Skip to content

Commit 01aec4a

Browse files
authored
server : add Speech Recognition & Synthesis to UI (#8679)
* server : add Speech Recognition & Synthesis to UI * server : add Speech Recognition & Synthesis to UI (fixes)
1 parent 41cd47c commit 01aec4a

File tree

1 file changed

+164
-16
lines changed

1 file changed

+164
-16
lines changed

examples/server/public/index.html

Lines changed: 164 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
<html>
2-
32
<head>
43
<meta charset="UTF-8">
54
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
@@ -132,12 +131,20 @@
132131
align-items: stretch;
133132
}
134133

135-
.right {
134+
.message-controls {
136135
display: flex;
137-
flex-direction: row;
138-
gap: 0.5em;
139136
justify-content: flex-end;
140137
}
138+
.message-controls > div:nth-child(2) {
139+
display: flex;
140+
flex-direction: column;
141+
gap: 0.5em;
142+
}
143+
.message-controls > div:nth-child(2) > div {
144+
display: flex;
145+
margin-left: auto;
146+
gap: 0.5em;
147+
}
141148

142149
fieldset {
143150
border: none;
@@ -276,6 +283,7 @@
276283

277284
import { llama } from './completion.js';
278285
import { SchemaConverter } from './json-schema-to-grammar.mjs';
286+
279287
let selected_image = false;
280288
var slot_id = -1;
281289

@@ -447,6 +455,9 @@
447455

448456
/* END: Support for storing prompt templates and parameters in browsers LocalStorage */
449457

458+
// Browser speech-synthesis controller, used for reading bot messages aloud.
const tts = window.speechSynthesis;
// Name of the voice chosen in the "Bot Voice" selector; starts out as null.
const ttsVoice = signal(null);
460+
450461
const llamaStats = signal(null)
451462
const controller = signal(null)
452463

@@ -596,8 +607,51 @@
596607
});
597608
}
598609

610+
// Speech-to-text constructor: the standard name first, the WebKit-prefixed one as fallback.
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
// One recognizer instance for the page, or null when neither constructor exists.
const talkRecognition = !SpeechRecognition ? null : new SpeechRecognition();
599612
function MessageInput() {
600-
const message = useSignal("")
613+
const message = useSignal("");
614+
615+
const talkActive = useSignal(false);
616+
const sendOnTalk = useSignal(false);
617+
const talkStop = (e) => {
618+
if (e) e.preventDefault();
619+
620+
talkActive.value = false;
621+
talkRecognition?.stop();
622+
}
623+
// Click handler for the "Talk" button: start listening, or tell the user
// that the browser has no speech recognition.
const talk = (e) => {
    e.preventDefault();

    if (!talkRecognition) {
        alert("Speech recognition is not supported by this browser.");
        return;
    }
    talkRecognition.start();
};
631+
// Wire the shared recognizer to this component's state. Nothing to do when the
// browser has no speech recognition (talkRecognition is null).
if (talkRecognition) {
    // Recognition actually started: reflect it in the UI.
    talkRecognition.onstart = () => {
        talkActive.value = true;
    };

    // A transcript arrived: put the first alternative of the first result into
    // the message box and, if requested, send it right away.
    talkRecognition.onresult = (e) => {
        // Read from the handler's own argument, not the deprecated global `event`.
        if (e.results.length === 0) return;

        message.value = e.results[0][0].transcript;
        if (sendOnTalk.value) {
            // `submit` is declared later in this component; that is fine because
            // this callback only runs after the component body has finished.
            submit(e);
        }
    };

    // The speaker went quiet: leave "talking" mode.
    talkRecognition.onspeechend = () => {
        talkStop();
    };
}
647+
648+
// Voices currently reported by the browser; an empty list when `tts` is unavailable.
const ttsVoices = useSignal(tts?.getVoices() || []);
// The voice flagged as default, if there is one.
const ttsVoiceDefault = computed(() => ttsVoices.value.find((voice) => voice.default));
if (tts) {
    // Re-read the list whenever the browser signals that its voices changed.
    tts.onvoiceschanged = () => {
        ttsVoices.value = tts.getVoices();
    };
}
601655

602656
const submit = (e) => {
603657
stop(e);
@@ -624,11 +678,45 @@
624678
value="${message}"
625679
/>
626680
</div>
627-
<div class="right">
628-
<button type="submit" disabled=${generating.value}>Send</button>
629-
<button onclick=${uploadImage}>Upload Image</button>
630-
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
631-
<button onclick=${reset}>Reset</button>
681+
<div class="message-controls">
682+
<div> </div>
683+
<div>
684+
<div>
685+
<button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
686+
<button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
687+
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
688+
<button onclick=${reset}>Reset</button>
689+
</div>
690+
<div>
691+
<a href="#" style="cursor: help;" title="Help" onclick=${e => {
692+
e.preventDefault();
693+
alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
694+
`(TTS and speech recognition are not provided by llama.cpp)\n` +
695+
`Note: STT requires HTTPS to work.`);
696+
}}>[?]</a>
697+
<button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
698+
<div>
699+
<input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
700+
<label for="send-on-talk" style="line-height: initial;">Send after talking</label>
701+
</div>
702+
</div>
703+
<div>
704+
<a href="#" style="cursor: help;" title="Help" onclick=${e => {
705+
e.preventDefault();
706+
alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
707+
}}>[?]</a>
708+
<label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
709+
<select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
710+
<option value="" selected="${!ttsVoice.value}">None</option>
711+
${[
712+
...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
713+
...ttsVoices.value.filter(v => !v.default),
714+
].map(
715+
v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
716+
)}
717+
</select>
718+
</div>
719+
</div>
632720
</div>
633721
</form>
634722
`
@@ -659,26 +747,86 @@
659747
}
660748
}, [messages])
661749

750+
// Index of the chat line currently being spoken, or undefined when nothing is playing.
const ttsChatLineActiveIx = useSignal(undefined);

/**
 * Speak a chat line with the selected voice, or stop it when it is the line
 * already playing (the play/pause button toggles through this).
 *
 * Does nothing when speech synthesis is unavailable, no voice is selected, or
 * the selected voice is not among the browser's voices.
 *
 * @param {Event|null} e   Click event (its default action is prevented), or null
 *                         when called programmatically.
 * @param {number} ix      Index of the chat line.
 * @param {string} msg     Text to speak.
 */
const ttsChatLine = (e, ix, msg) => {
    if (e) e.preventDefault();

    if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;

    const voice = tts.getVoices().find(v => v.name === ttsVoice.value);
    if (!voice) return;

    if (ttsChatLineActiveIx.value !== undefined) {
        tts.cancel();
        // Same line clicked again: this was a "stop" request, so do not restart it.
        if (ttsChatLineActiveIx.value === ix) {
            ttsChatLineActiveIx.value = undefined;
            return;
        }
    }

    ttsChatLineActiveIx.value = ix;
    const ttsUtter = new SpeechSynthesisUtterance(msg);
    ttsUtter.voice = voice;

    // Reset the indicator only if this line is still the active one: a late
    // end/error event from a cancelled utterance must not clear the marker of
    // the line that replaced it.
    const clearIfCurrent = () => {
        if (ttsChatLineActiveIx.value === ix) {
            ttsChatLineActiveIx.value = undefined;
        }
    };
    ttsUtter.onend = clearIfCurrent;
    // Without this, a failed utterance would leave the button stuck on "Pause".
    ttsUtter.onerror = clearIfCurrent;

    tts.speak(ttsUtter);
};
776+
662777
const isCompletionMode = session.value.type === 'completion'
778+
779+
// Try to play the last bot message once generation has finished.
// lastCharChatLinesIxs collects the indices of bot lines as chatLine renders them;
// lastCharChatLinesIxsOld is the snapshot taken the last time this effect handled them.
const lastCharChatLinesIxs = useSignal([]);
const lastCharChatLinesIxsOld = useSignal([]);
useEffect(() => {
    if (isCompletionMode || generating.value) return;
    // Same number of bot lines as last time: nothing new to read out.
    if (lastCharChatLinesIxs.value.length === lastCharChatLinesIxsOld.value.length) return;

    const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
    if (ix !== undefined) {
        // Entries are [user, data] pairs (see chatLine); data is either an array
        // of chunks carrying `content`, or plain text.
        const entry = messages[ix];
        let text = entry;
        if (Array.isArray(entry)) {
            const data = entry[1];
            text = Array.isArray(data) ? data.map(m => m.content).join('') : data;
        }
        // Skip when the entry is missing, rather than speaking "undefined".
        if (text != null) {
            ttsChatLine(null, ix, text);
        }
    }

    lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
}, [generating.value]);
797+
663798
/**
 * Render one transcript entry.
 *
 * @param {[string, (string|Array)]} entry  [user, data] pair; data is either plain
 *                                          text or an array of chunks with `content`.
 * @param {number} index                    Position of the entry, used as the list key.
 * @returns The rendered line. Bot lines also get a play/pause button when a
 *          voice is selected.
 */
const chatLine = ([user, data], index) => {
    let message;
    const isArrayMessage = Array.isArray(data);
    // Plain text of the line; also what the play button reads aloud.
    const text = isArrayMessage ?
        data.map(msg => msg.content).join('') :
        data;
    if (params.value.n_probs > 0 && isArrayMessage) {
        message = html`<${Probabilities} data=${data} />`;
    } else {
        message = isCompletionMode ?
            text :
            html`<${Markdownish} text=${template(text)} />`;
    }

    // Remember which lines come from the bot so the newest one can be auto-played.
    const fromBot = user === '{{char}}';
    if (fromBot && !lastCharChatLinesIxs.value.includes(index)) {
        lastCharChatLinesIxs.value.push(index);
    }

    if (user) {
        // The key belongs on the outermost element of the list item.
        return html`
            <div key=${index}>
                <p><strong>${template(user)}:</strong> ${message}</p>
                ${
                    fromBot && ttsVoice.value
                    && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</button>`
                }
            </div>
        `;
    } else {
        return isCompletionMode ?
            html`<span key=${index}>${message}</span>` :
            html`<div key=${index}><p>${message}</p></div>`;
    }
};
684832

0 commit comments

Comments
 (0)