Skip to content

Commit 755f9fa

Browse files
authored
Merge pull request #118 from SagsMug/main
Fix UnicodeDecodeError permanently
2 parents 523825e + 18a0c10 commit 755f9fa

File tree

4 files changed

+59
-10
lines changed

4 files changed

+59
-10
lines changed

examples/low_level_api/low_level_api_chat_cpp.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ def __init__(self, params: GptParams) -> None:
201201
# tokenize a prompt
202202
def _tokenize(self, prompt, bos=True):
203203
_arr = (llama_cpp.llama_token * (len(prompt) + 1))()
204-
_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos)
204+
_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
205205
return _arr[:_n]
206206

207207
def set_color(self, c):
@@ -342,7 +342,7 @@ def exit(self):
342342
# return past text
343343
def past(self):
344344
for id in self.last_n_tokens[-self.n_past:]:
345-
yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8")
345+
yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore")
346346

347347
# write input
348348
def input(self, prompt: str):
@@ -356,7 +356,7 @@ def input(self, prompt: str):
356356
def output(self):
357357
self.remaining_tokens = self.params.n_predict
358358
for id in self.generate():
359-
yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8")
359+
yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore")
360360

361361
# read user input
362362
def read_input(self):

examples/low_level_api/low_level_api_llama_cpp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
if not input_noecho:
7171
for id in embd:
7272
print(
73-
llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"),
73+
llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"),
7474
end="",
7575
flush=True,
7676
)

llama_cpp/llama.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,7 @@ def _create_completion(
446446
self.load_state(self.cache[prompt_tokens])
447447

448448
finish_reason = "length"
449+
multibyte_fix = 0
449450
for token in self.generate(
450451
prompt_tokens,
451452
top_k=top_k,
@@ -467,6 +468,20 @@ def _create_completion(
467468
completion_tokens.append(token)
468469

469470
all_text = self.detokenize(completion_tokens)
471+
472+
# Contains multi-byte UTF8
473+
for k,char in enumerate(all_text[-3:]):
474+
k = 3 - k
475+
for num,pattern in [(2, 192), (3, 224), (4, 240)]:
476+
# Bitwise AND check
477+
if (num > k and pattern & char == pattern):
478+
multibyte_fix = num - k
479+
480+
# Stop incomplete bytes from passing
481+
if (multibyte_fix > 0):
482+
multibyte_fix -= 1
483+
continue
484+
470485
any_stop = [s for s in stop_sequences if s in all_text]
471486
if len(any_stop) > 0:
472487
first_stop = any_stop[0]
@@ -495,7 +510,7 @@ def _create_completion(
495510
"model": self.model_path,
496511
"choices": [
497512
{
498-
"text": text[start:].decode("utf-8"),
513+
"text": text[start:].decode("utf-8", errors="ignore"),
499514
"index": 0,
500515
"logprobs": None,
501516
"finish_reason": None,
@@ -516,7 +531,7 @@ def _create_completion(
516531
"model": self.model_path,
517532
"choices": [
518533
{
519-
"text": text[returned_characters:].decode("utf-8"),
534+
"text": text[returned_characters:].decode("utf-8", errors="ignore"),
520535
"index": 0,
521536
"logprobs": None,
522537
"finish_reason": finish_reason,
@@ -525,7 +540,7 @@ def _create_completion(
525540
}
526541
return
527542

528-
text_str = text.decode("utf-8")
543+
text_str = text.decode("utf-8", errors="ignore")
529544

530545
if echo:
531546
text_str = prompt + text_str
@@ -543,7 +558,7 @@ def _create_completion(
543558

544559
all_tokens = prompt_tokens + completion_tokens
545560
all_token_strs = [
546-
self.detokenize([token]).decode("utf-8") for token in all_tokens
561+
self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens
547562
]
548563
all_logprobs = [
549564
[Llama.logit_to_logprob(logit) for logit in row]
@@ -562,7 +577,7 @@ def _create_completion(
562577
)
563578
token_logprobs.append(sorted_logprobs[int(token)][0])
564579
top_logprob = {
565-
self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob
580+
self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8", errors="ignore"): logprob
566581
for logprob, i in sorted_logprobs[:logprobs]
567582
}
568583
top_logprob.update({token_str: sorted_logprobs[int(token)][0]})

tests/test_llama.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,4 +93,38 @@ def test_llama_pickle():
9393

9494
text = b"Hello World"
9595

96-
assert llama.detokenize(llama.tokenize(text)) == text
96+
assert llama.detokenize(llama.tokenize(text)) == text
97+
98+
def test_utf8(monkeypatch):
99+
llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
100+
101+
## Set up mock function
102+
def mock_eval(*args, **kwargs):
103+
return 0
104+
105+
monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval)
106+
107+
output_text = "😀"
108+
output_tokens = llama.tokenize(output_text.encode("utf-8"))
109+
token_eos = llama.token_eos()
110+
n = 0
111+
112+
def mock_sample(*args, **kwargs):
113+
nonlocal n
114+
if n < len(output_tokens):
115+
n += 1
116+
return output_tokens[n - 1]
117+
else:
118+
return token_eos
119+
120+
monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample)
121+
122+
## Test basic completion with utf8 multibyte
123+
n = 0 # reset
124+
completion = llama.create_completion("", max_tokens=4)
125+
assert completion["choices"][0]["text"] == output_text
126+
127+
## Test basic completion with incomplete utf8 multibyte
128+
n = 0 # reset
129+
completion = llama.create_completion("", max_tokens=1)
130+
assert completion["choices"][0]["text"] == ""

0 commit comments

Comments (0)