Commit ec1b100

llama : tokenizer fixes (#2549)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
1 parent 8af3a99 commit ec1b100

17 files changed: +611 -146 lines changed

convert.py

Lines changed: 61 additions & 42 deletions
@@ -238,22 +238,58 @@ def load(model_plus: 'ModelPlus') -> 'Params':
         return params


-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
-        self.vocabtype = vocabtype
-        if self.vocabtype == "bpe":
-            self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
-        else:
-            self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens))
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
-        if self.vocabtype == "bpe":
-            vocab_size: int = len(self.sentencepiece_tokenizer)
+        vocab_size: int = len(self.bpe_tokenizer)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.bpe_tokenizer
+        from transformers.models.gpt2 import tokenization_gpt2
+        byte_encoder = tokenization_gpt2.bytes_to_unicode()
+        byte_decoder = {v: k for k, v in byte_encoder.items()}
+        for i, item in enumerate(tokenizer):
+            text: bytes = item.encode("utf-8")
+            score: float = -i
+            yield text, score
+
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
-            vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+            added_tokens = {}
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -267,32 +303,11 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vo

     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        if self.vocabtype == "bpe":
-          from transformers.models.gpt2 import tokenization_gpt2
-          byte_encoder = tokenization_gpt2.bytes_to_unicode()
-          byte_decoder = {v: k for k, v in byte_encoder.items()}
-          for i, item in enumerate(tokenizer):
-            text: bytes
-            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-            score: float = -i
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
             yield text, score
-        else:
-          for i in range(tokenizer.vocab_size()):
-            text: bytes
-            if tokenizer.is_unknown(i):
-              text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-              text = b""
-            elif tokenizer.is_byte(i):
-              piece = tokenizer.id_to_piece(i)
-              if len(piece) != 6:
-                raise Exception(f"Invalid token: {piece}")
-              byte_value = int(piece[3:-1], 16)
-              text = struct.pack("B", byte_value)
-            else:
-              text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
-            yield text, score

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
@@ -319,7 +334,7 @@ def __repr__(self) -> str:
         return f"<GGMLVocab with {self.vocab_size} tokens>"


-Vocab = Union[SentencePieceVocab, GGMLVocab]
+Vocab = Union[BpeVocab, SentencePieceVocab, GGMLVocab]


 def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
@@ -1044,7 +1059,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
         # GGMLVocab comes from the same file as the model so shouldn't mismatch:
-        assert isinstance(vocab, SentencePieceVocab)
+        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
@@ -1093,7 +1108,7 @@ def write_vocab(self, vocab: Vocab) -> None:
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
+        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, n_kv_head=None)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
@@ -1228,7 +1243,7 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}


-def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
     print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory. Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
@@ -1250,8 +1265,12 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
             "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
-                              vocabtype)
+    if vocabtype == "bpe":
+        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    elif vocabtype == "spm":
+        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    else:
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")


 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:

examples/common.cpp

Lines changed: 0 additions & 11 deletions
@@ -633,17 +633,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     return "The";
 }

-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();

examples/common.h

Lines changed: 1 addition & 6 deletions
@@ -2,6 +2,7 @@

 #pragma once

+#define LLAMA_API_CPP // TODO: eliminate me
 #include "llama.h"

 #include <string>
@@ -100,12 +101,6 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
 //
 // Model utils
 //
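
Note: the two hunks above drop the examples' local llama_tokenize helper and its declaration, and add a LLAMA_API_CPP define ahead of #include "llama.h", which suggests the std::vector / std::string based helpers are now expected to come from llama.h itself. The new declarations are not part of this diff; the following is only a sketch of the kind of wrapper being removed here, assuming the replacement keeps the same shape (tokenize into a worst-case buffer, assert success, shrink to fit). The name tokenize_to_vector is made up for illustration and is not an actual llama.cpp symbol.

#include "llama.h"

#include <cassert>
#include <string>
#include <vector>

// Sketch only: a vector-returning tokenize wrapper in the spirit of the helper
// deleted from examples/common.cpp above. Not the actual llama.h API.
static std::vector<llama_token> tokenize_to_vector(llama_context * ctx, const std::string & text, bool add_bos) {
    // worst case: one token per byte of input, plus the optional BOS token
    std::vector<llama_token> res(text.size() + (add_bos ? 1 : 0));
    const int n = llama_tokenize(ctx, text.c_str(), res.data(), (int) res.size(), add_bos);
    assert(n >= 0);   // the buffer above is always large enough
    res.resize(n);    // keep only the tokens actually produced
    return res;
}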

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }

examples/main/main.cpp

Lines changed: 4 additions & 8 deletions
@@ -191,10 +191,6 @@ int main(int argc, char ** argv) {

     // tokenize the prompt
     std::vector<llama_token> embd_inp;
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
@@ -278,22 +274,22 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }

         if (ctx_guidance) {
             fprintf(stderr, "\n");
             fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
             fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
             }
         }

         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
             }
             fprintf(stderr, "'\n");
         }
@@ -662,7 +658,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
             }
             fflush(stdout);
         }

examples/quantize-stats/quantize-stats.cpp

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 #include "ggml.h"
 #include "build-info.h"

+#define LLAMA_API_CPP // TODO: eliminate me
 #define LLAMA_API_INTERNAL
 #include "llama.h"

examples/save-load-state/save-load-state.cpp

Lines changed: 4 additions & 5 deletions
@@ -45,9 +45,8 @@ int main(int argc, char ** argv) {
         llama_free_model(model);
         return 1;
     }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
-
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
         llama_free(ctx);
@@ -92,7 +91,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx);
@@ -152,7 +151,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx2);

examples/simple/simple.cpp

Lines changed: 2 additions & 2 deletions
@@ -62,7 +62,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "\n\n");

     for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
     }

     fflush(stderr);
@@ -109,7 +109,7 @@ int main(int argc, char ** argv) {
         }

         // print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id));
+        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
         fflush(stdout);

         // push this new token for next evaluation

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 10 additions & 10 deletions
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "common.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@@ -1961,7 +1962,7 @@ void print_matrix(struct ggml_tensor * probs) {


 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str());
 }

 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2188,29 +2189,28 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     f.read_raw(buf.data(), f.size);
     buf[f.size] = '\0';

-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
     }

     bool verify = false;
     if (verify) {
         const char * in = buf.data();
         const char * end = buf.data() + buf.size();
         for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_str(lctx, out[i]);
+            int len = s.length();
             if (in >= end) {
                 printf("%s: unexpected end of original text.\n", __func__);
                 break;
             }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
             if (matches) {
                 in += len;
             } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
             }
         }
     }
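
The tokenize_file hunk above replaces the old "resize to the byte count up front" logic with a retry: llama_tokenize is first called with whatever capacity out already has, and if it reports a negative count the buffer is resized to -n_tokens and the call is repeated. The sketch below isolates that pattern; it assumes, as the hunk itself does, that llama_tokenize signals an undersized buffer by returning the negated number of tokens required. The function name is hypothetical.

#include "llama.h"

#include <string>
#include <vector>

// Sketch of the resize-and-retry idiom used by tokenize_file above.
// Assumption (implied by the hunk): on overflow, llama_tokenize returns
// -(number of tokens required) rather than writing past the buffer.
static std::vector<llama_token> tokenize_with_retry(llama_context * lctx, const std::string & text, bool add_bos) {
    std::vector<llama_token> out(text.size());   // initial guess: one token per byte
    int n_tokens = llama_tokenize(lctx, text.c_str(), out.data(), (int) out.size(), add_bos);
    if (n_tokens < 0) {
        out.resize(-n_tokens);                   // grow to the reported requirement
        n_tokens = llama_tokenize(lctx, text.c_str(), out.data(), (int) out.size(), add_bos);
    }
    out.resize(n_tokens);                        // shrink to the actual token count
    return out;
}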
