Commit bfccc62

Use some tricks to eliminate the necessity for a new format
1 parent aa90c83 commit bfccc62
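
This commit walks back the GGJT_V4 format bump: the added/special-token support now rides on the existing GGJT v3 layout. Two tricks make that possible. First, the base vocabulary size is packed into the obsolete rot header field and flagged with the high bits 0xF0000000, so the header keeps its old shape and older files remain readable. Second, the special-token id list is no longer serialized at all; the loader derives it from the vocabulary layout in set_vocab_sp().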

3 files changed: +35 additions, -43 deletions

convert.py

Lines changed: 9 additions & 14 deletions
@@ -133,7 +133,7 @@ def make_tensors_list() -> List[str]:
 @dataclass
 class Params:
     n_vocab: int
-    n_vocab_sp: int
+    n_vocab_base: int
     n_embd: int
     n_mult: int
     n_head: int
@@ -146,7 +146,7 @@ def guessed(model: 'LazyModel', vocab: 'Vocab', file_type: GGMLFileType) -> 'Par
 
         return Params(
             n_vocab=n_vocab,
-            n_vocab_sp=vocab.vocab_special_size,
+            n_vocab_base=vocab.vocab_size_base,
             n_embd=n_embd,
             n_mult=256,
             n_head=n_embd // 128,
@@ -190,7 +190,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fn
         else:
             tokenizer_config = {}
         for key, value in tokenizer_config.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1:
@@ -203,15 +203,13 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fn
         else:
             special_tokens = {}
         for key, value in special_tokens.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1 or token_id in self.special_tokens_map:
                 continue
             self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
 
-        self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)
-
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         special_tokens = [tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id()]
@@ -258,7 +256,7 @@ def __init__(self, tokens: List[Tuple[bytes, float]]):
         self.tokens = tokens
         self.special_tokens = []
         self.vocab_size = len(tokens)
-        self.vocab_special_size = 0
+        self.vocab_size_base = 0
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         return self.tokens
@@ -976,17 +974,16 @@ def __init__(self, fname_out: Path) -> None:
     def write_file_header(self, params: Params) -> None:
         self.fout.write(b"ggjt"[::-1])  # magic
         values = [
-            4,  # file version
+            1,  # file version
             params.n_vocab,
-            params.n_vocab_sp,
             params.n_embd,
             params.n_mult,
             params.n_head,
             params.n_layer,
-            params.n_embd // params.n_head,  # rot (obsolete)
+            params.n_vocab_base | 0xF0000000,  # reuse obsolete rot value to store vocab_base
            params.file_type.value,
         ]
-        self.fout.write(struct.pack("i" * len(values), *values))
+        self.fout.write(struct.pack("I" * len(values), *values))
 
     def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
         sname = name.encode('utf-8')
@@ -1000,13 +997,11 @@ def write_vocab(self, vocab: Vocab) -> None:
             self.fout.write(struct.pack("i", len(text)))
             self.fout.write(text)
             self.fout.write(struct.pack("f", score))
-        for token_id in vocab.all_special_tokens():
-            self.fout.write(struct.pack("i", token_id))
 
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_vocab_sp=vocab.vocab_special_size, n_embd=0, n_mult=0,
+        params = Params(n_vocab=vocab.vocab_size, n_vocab_base=vocab.vocab_size_base, n_embd=0, n_mult=0,
                         n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
         of = OutputFile(fname_out)
         of.write_file_header(params)
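
The write_file_header() change above is the first trick in action. The writer drops the extra n_vocab_sp header field and instead smuggles n_vocab_base into the slot that previously carried the obsolete rot value, tagging it with the high bits 0xF0000000; since the tagged value no longer fits in a signed 32-bit integer, the pack format switches from "i" to "I". A minimal sketch of the round trip (the 32000 is illustrative, not from a real model):

    import struct

    n_vocab_base = 32000  # illustrative base vocabulary size

    # Tag the value so a reader can distinguish it from a legacy rot
    # value, which is n_embd // n_head and nowhere near 0xF0000000.
    packed = n_vocab_base | 0xF0000000

    # struct.pack("i", packed) would raise struct.error because the
    # tagged value exceeds INT32_MAX; hence the move to unsigned "I".
    header_word = struct.pack("I", packed)

    # The reader masks the tag off again to recover the original value.
    assert (struct.unpack("I", header_word)[0] & ~0xF0000000) == n_vocab_base

Because a plausible rot value never has those top bits set, the one header word can serve both meanings without a format version bump.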

llama.cpp

Lines changed: 25 additions & 28 deletions
@@ -128,13 +128,12 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
-    uint32_t n_vocab_sp = 0;
+    uint32_t n_vocab_base = 32000;
     uint32_t n_ctx   = 512;   // this is provided as user input?
     uint32_t n_embd  = 4096;
     uint32_t n_mult  = 256;
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -460,7 +459,6 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
     LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
-    LLAMA_FILE_VERSION_GGJT_V4, // improved support for added/special tokens
 };
 
 struct llama_file_loader {
@@ -476,6 +474,7 @@ struct llama_file_loader {
         read_hparams();
         read_vocab();
         read_tensor_metadata(file_idx, tensors_map);
+        set_vocab_sp();
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -498,7 +497,6 @@ struct llama_file_loader {
             case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
             case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
             case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
-            case 4: file_version = LLAMA_FILE_VERSION_GGJT_V4; return;
         }
     }
 
@@ -507,12 +505,12 @@ struct llama_file_loader {
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
-        hparams.n_vocab_sp = file_version >= LLAMA_FILE_VERSION_GGJT_V4 ? file.read_u32() : 0;
         hparams.n_embd = file.read_u32();
         hparams.n_mult = file.read_u32();
         hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot = file.read_u32();
+        hparams.n_vocab_base = file.read_u32();
+        hparams.n_vocab_base = (hparams.n_vocab_base & 0xF0000000) == 0 ? hparams.n_vocab : (hparams.n_vocab_base & ~0xF0000000); // this bitwise operation is necessary for compatibility with older models
         hparams.ftype = (enum llama_ftype) file.read_u32();
     }
     void read_vocab() {
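
read_hparams() holds the matching read-side branch: if the tag bits are clear, the word is a plain rot value from an older file and no added tokens were recorded, so n_vocab_base falls back to n_vocab; otherwise the tag is masked off. A rough Python model of that decision (decode_vocab_base is a hypothetical helper, not code from this patch):

    def decode_vocab_base(raw: int, n_vocab: int) -> int:
        """Interpret the header word that older files used for n_rot."""
        if (raw & 0xF0000000) == 0:
            # Old file: a genuine rot value; treat the whole vocabulary
            # as base tokens, since no added/special tokens were stored.
            return n_vocab
        return raw & ~0xF0000000  # new file: strip the tag bits

    assert decode_vocab_base(64, 32000) == 32000                  # legacy n_rot
    assert decode_vocab_base(32000 | 0xF0000000, 32016) == 32000  # tagged base size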
@@ -533,20 +531,6 @@ struct llama_file_loader {
             tok_score.tok = std::move(word);
             tok_score.score = score;
         }
-
-        vocab.special_token_to_id.reserve(hparams.n_vocab_sp);
-
-        for (uint32_t i = 0; i < hparams.n_vocab_sp; i++) {
-            llama_vocab::id token_id = file.read_u32();
-            const auto & word = vocab.id_to_token[token_id].tok;
-
-            vocab.special_token_trie.add(word);
-            vocab.special_token_to_id[word] = token_id;
-
-            if (vocab.max_special_token_length < word.size()) {
-                vocab.max_special_token_length = word.size();
-            }
-        }
     }
     void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
@@ -601,6 +585,24 @@ struct llama_file_loader {
             tensors_map.tensors.at(idx).shards.push_back(shard);
         }
     }
+    void set_vocab_sp() {
+        uint32_t vocab_sp = 3 + hparams.n_vocab - hparams.n_vocab_base;
+        vocab.special_token_to_id.reserve(vocab_sp);
+        for (uint32_t i = 0; i < vocab_sp; i++) {
+            llama_vocab::id token_id = i > 2 ? hparams.n_vocab_base + i - 3 : i;
+            const auto & word = vocab.id_to_token[token_id].tok;
+            if (word.empty()) {
+                continue;
+            }
+
+            vocab.special_token_trie.add(word);
+            vocab.special_token_to_id[word] = token_id;
+
+            if (vocab.max_special_token_length < word.size()) {
+                vocab.max_special_token_length = word.size();
+            }
+        }
+    }
 };
 
 struct llama_file_saver {
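
The new set_vocab_sp() above is the second trick: rather than reading a serialized id list, it reconstructs the special tokens from positions alone, assuming ids 0-2 are the three sentencepiece control tokens and that added tokens occupy the tail of the vocabulary from n_vocab_base through n_vocab - 1 (entries with empty text are skipped). A small Python sketch of the id derivation (the function name is illustrative):

    def derive_special_ids(n_vocab: int, n_vocab_base: int) -> list[int]:
        # Three control tokens plus every slot past the base vocabulary.
        vocab_sp = 3 + n_vocab - n_vocab_base
        return [i if i <= 2 else n_vocab_base + i - 3 for i in range(vocab_sp)]

    # A 32000-token base vocabulary with 16 added tokens yields
    # ids 0, 1, 2 plus 32000 through 32015.
    assert derive_special_ids(32016, 32000) == [0, 1, 2] + list(range(32000, 32016))

The design choice this enables: since the id list is fully determined by n_vocab and n_vocab_base, nothing beyond the repurposed header word needs to be written to the file.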
@@ -620,12 +622,11 @@ struct llama_file_saver {
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         file.write_u32(hparams.n_vocab);
-        file.write_u32(hparams.n_vocab_sp);
         file.write_u32(hparams.n_embd);
         file.write_u32(hparams.n_mult);
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
-        file.write_u32(hparams.n_rot);
+        file.write_u32(hparams.n_vocab_base | 0xF0000000); // this bitwise operation is necessary for compatibility with older models
         file.write_u32(new_ftype);
     }
     void write_vocab() {
@@ -639,9 +640,6 @@ struct llama_file_saver {
             file.write_raw(token_score.tok.data(), token_score.tok.size());
             file.write_raw(&token_score.score, sizeof(token_score.score));
         }
-        for (const auto & pair : any_file_loader->vocab.special_token_to_id) {
-            file.write_u32(pair.second);
-        }
     }
     void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
         switch (new_type) {
@@ -1015,8 +1013,7 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
         case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
-        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (pre #1931)";
-        case LLAMA_FILE_VERSION_GGJT_V4: return "ggjt v4 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
 
     return "unknown";
@@ -1113,7 +1110,7 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: n_mult     = %u\n", __func__, hparams.n_mult);
     fprintf(stderr, "%s: n_head     = %u\n", __func__, hparams.n_head);
     fprintf(stderr, "%s: n_layer    = %u\n", __func__, hparams.n_layer);
-    fprintf(stderr, "%s: n_rot      = %u\n", __func__, hparams.n_rot);
+    fprintf(stderr, "%s: n_rot      = %u\n", __func__, hparams.n_embd/hparams.n_head);
     fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
     fprintf(stderr, "%s: n_ff       = %u\n", __func__, n_ff);
     fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());

llama.h

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@
 #define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
 #define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
 
-#define LLAMA_FILE_VERSION           4
+#define LLAMA_FILE_VERSION           3
 #define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
 #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
 #define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
