Skip to content

Commit 2a20f48

Browse files
authored
Fix UTF-8 handling (including colors) (#79)
1 parent d1f2247 commit 2a20f48

File tree

2 files changed

+27
-11
lines changed

2 files changed

+27
-11
lines changed

convert-pth-to-ggml.py

Lines changed: 22 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
import struct
2323
import numpy as np
2424
import torch
25-
2625
from sentencepiece import SentencePieceProcessor
2726

2827
if len(sys.argv) < 3:
@@ -101,12 +100,28 @@ def get_n_parts(dim):
101100

102101
# Is this correct??
103102
for i in range(32000):
104-
# TODO: this is probably wrong - not sure how this tokenizer works
105-
text = tokenizer.decode([29889, i]).encode('utf-8')
106-
# remove the first byte (it's always '.')
107-
text = text[1:]
108-
fout.write(struct.pack("i", len(text)))
109-
fout.write(text)
103+
if tokenizer.is_unknown(i):
104+
# "<unk>" token (written out as U+2047 DOUBLE QUESTION MARK with a space on each side)
105+
text = " \u2047 ".encode("utf-8")
106+
fout.write(struct.pack("i", len(text)))
107+
fout.write(text)
108+
elif tokenizer.is_control(i):
109+
# "<s>"/"</s>" tokens
110+
fout.write(struct.pack("i", 0))
111+
elif tokenizer.is_byte(i):
112+
# "<U+XX>" tokens (which may be invalid UTF-8)
113+
piece = tokenizer.id_to_piece(i)
114+
if len(piece) != 6:
115+
print("Invalid token: " + piece)
116+
sys.exit(1)
117+
byte_value = int(piece[3:-1], 16)
118+
fout.write(struct.pack("i", 1))
119+
fout.write(struct.pack("B", byte_value))
120+
else:
121+
# normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
122+
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
123+
fout.write(struct.pack("i", len(text)))
124+
fout.write(text)
110125

111126
for k, v in model.items():
112127
name = k

main.cpp

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -939,17 +939,18 @@ int main(int argc, char ** argv) {
939939
break;
940940
}
941941
}
942+
943+
// reset color to default if there is no pending user input
944+
if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) {
945+
printf(ANSI_COLOR_RESET);
946+
}
942947
}
943948

944949
// display text
945950
if (!input_noecho) {
946951
for (auto id : embd) {
947952
printf("%s", vocab.id_to_token[id].c_str());
948953
}
949-
// reset color to default if we there is no pending user input
950-
if (params.use_color && embd_inp.size() <= input_consumed) {
951-
printf(ANSI_COLOR_RESET);
952-
}
953954
fflush(stdout);
954955
}
955956

0 commit comments

Comments
 (0)