
Commit cd00be8

chore: Add model metadata
1 parent 1957ca4 commit cd00be8

1 file changed

gguf-py/gguf/constants.py

Lines changed: 103 additions & 50 deletions
@@ -992,10 +992,14 @@ class HFModelFileType(IntEnum):
 )
 
 # NOTE: GPT-2 is the standard default pre-tokenizer for all models
+# NOTE: BERT models inherit from the Byte Level Pre-tokenizer.
+# https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L117
+# https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/bert.rs#L13
 BPE_PRE_TOKENIZERS = {
     # gpt2, olmo, phi (1, 1_5, 2, 3, ...)
     "gpt2": (GPT_PRE_TOKENIZER_DEFAULT,),
     # dbrx
+    # NOTE: PR#6920: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
     "llama3": (
         "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
     ),
@@ -1033,7 +1037,7 @@ class HFModelFileType(IntEnum):
 # This will get out of control if not properly managed.
 # This needs a proper solution. The short-term solution is to manually build a map here.
 # A proper long-term solution would be to build a dynamic registry.
-# The issue is that this requires a mapping or a database.
+# The issue is that this requires a dynamically persistent mapping or a database.
 # Possible solutions are to use JSON, HDF5, or SQLite.
 # Some of these mappings could be dynamically generated, but it's sketchy at best.
 # Model versions should be included along with the model name to mitigate name conflicts.
@@ -1060,14 +1064,14 @@ class HFModelFileType(IntEnum):
 # - Possible algorithms are WordLevel, BPE, WordPiece, or Unigram
 # - Possible LLaMa tokenizer model types are: None, SPM, BPE, or WPM
 HF_MODEL_MAP = (
-    # Sentence Piece Models
+    # SPM (Sentence Piece Models): Default to Byte Level Pre-tokenization.
     {
         "model_repo": "meta-llama/Llama-2-7b-hf",
         "model_arch": MODEL_ARCH_NAMES[MODEL_ARCH.LLAMA],
         "model_parts": 2,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
     {
@@ -1076,7 +1080,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 3,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
     {
@@ -1085,7 +1089,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 8,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
     {
@@ -1094,35 +1098,37 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
-    # Word Piece Models
+    # WPM (Word Piece Models): Default to Byte Level Pre-tokenization.
+    # NOTE: BERT Normalization and Pre-tokenization rules differ from Byte Level Pre-tokenization.
     {
         "model_repo": "BAAI/bge-small-en-v1.5",
         "model_arch": MODEL_ARCH_NAMES[MODEL_ARCH.BERT],
         "model_parts": 1,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.WPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "jinaai/jina-embeddings-v2-base-en",
-        "model_arch": MODEL_ARCH.JINA_BERT_V2,
+        "model_arch": MODEL_ARCH_NAMES[MODEL_ARCH.JINA_BERT_V2],
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.WPM,
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    # Byte Pair Encoding Models
+    # BPE (Byte Pair Encoding Models): Default is Byte Level Pre-tokenization
     {
         "model_repo": "meta-llama/Meta-Llama-3-8B",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 4,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
-        # PR#6920: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
-        "vocab_pre": (
-            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["llama3"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
@@ -1131,7 +1137,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": BPE_PRE_TOKENIZERS[MODEL_ARCH_NAMES[MODEL_ARCH.FALCON]],
+        "vocab_pre": BPE_PRE_TOKENIZERS["falcon"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
@@ -1140,14 +1146,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "[\r\n]",
-            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
-            "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
-            "\\s+$",
-            "[一-龥ࠀ-一가-퟿]+",
-            "\\p{N}+",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["deepseek"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
@@ -1156,13 +1155,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "[\r\n]",
-            "\\s?\\p{L}+",
-            "\\s?\\p{P}+",
-            "[一-龥ࠀ-一가-퟿]+",
-            "\\p{N}",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["deepseek-coder"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
@@ -1171,74 +1164,134 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "\\s?\\p{L}+",
-            "\\s?\\p{P}+",
-            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["mpt"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
+    #
+    # BPE: STARCODER
+    #
     {
         "model_repo": "bigcode/starcoder2-3b",
         "model_arch": MODEL_ARCH.STARCODER2,
         "model_parts": 1,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "\\p{N}",
-            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["starcoder"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
-        "model_repo": "openai-community/gpt2",
-        "model_arch": MODEL_ARCH.GPT2,
-        "vocab_type": LLaMaVocabType.BPE,
-    },
     {
         "model_repo": "smallcloudai/Refact-1_6-base",
         "model_arch": MODEL_ARCH.REFACT,
+        "model_parts": 1,
+        "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["starcoder"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "CohereForAI/c4ai-command-r-v01",
         "model_arch": MODEL_ARCH.COMMAND_R,
+        "model_parts": 15,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["starcoder"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
+    #
+    # BPE: QWEN
+    #
     {
         "model_repo": "Qwen/Qwen1.5-7B",
         "model_arch": MODEL_ARCH.QWEN2,
+        "model_parts": 4,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["qwen"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
+    },
+    {
+        "model_repo": "stabilityai/stablelm-2-zephyr-1_6b",
+        "model_arch": MODEL_ARCH.STABLELM,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
+        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["qwen"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
+    },
+    #
+    # BPE: GPT-2
+    #
+    {
+        "model_repo": "openai-community/gpt2",
+        "model_arch": MODEL_ARCH.GPT2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
+        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "allenai/OLMo-1.7-7B-hf",
         "model_arch": MODEL_ARCH.OLMO,
+        "model_parts": 6,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
+    { # NOTE: I don't have access to this model
         "model_repo": "databricks/dbrx-base",
         "model_arch": MODEL_ARCH.DBRX,
+        "model_parts": 0,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
+    { # NOTE: RoBERTa post processor
         "model_repo": "jinaai/jina-embeddings-v2-base-es",
         "model_arch": MODEL_ARCH.JINA_BERT_V2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
+    { # NOTE: RoBERTa post processor
         "model_repo": "jinaai/jina-embeddings-v2-base-de",
         "model_arch": MODEL_ARCH.JINA_BERT_V2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
+    { # NOTE: Phi-1 is compatible with GPT-2 arch and vocab
         "model_repo": "microsoft/phi-1",
         "model_arch": MODEL_ARCH.PHI2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
-        "model_repo": "stabilityai/stablelm-2-zephyr-1_6b",
-        "model_arch": MODEL_ARCH.STABLELM,
+        "model_repo": "microsoft/phi-1_5",
+        "model_arch": MODEL_ARCH.PHI2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
+    },
+    {
+        "model_repo": "microsoft/phi-2",
+        "model_arch": MODEL_ARCH.PHI2,
+        "model_parts": 2,
+        "model_type": HFModelFileType.SFT,
+        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
 )

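For context on how this metadata might be consumed, here is a minimal lookup sketch. It assumes the HF_MODEL_MAP, BPE_PRE_TOKENIZERS, and GPT_PRE_TOKENIZER_DEFAULT structures shown in the diff above (with GPT_PRE_TOKENIZER_DEFAULT being a single regex string, as its use inside BPE_PRE_TOKENIZERS suggests); the helper names are hypothetical and not part of this commit.

from gguf.constants import (
    GPT_PRE_TOKENIZER_DEFAULT,
    HF_MODEL_MAP,
)

def get_model_metadata(model_repo: str) -> dict:
    # Linear scan is fine here: HF_MODEL_MAP is a small, hand-maintained tuple of dicts.
    for entry in HF_MODEL_MAP:
        if entry["model_repo"] == model_repo:
            return entry
    raise KeyError(f"no metadata registered for {model_repo}")

def resolve_pre_tokenizer(entry: dict) -> tuple:
    # "vocab_pre" is either a single regex string (GPT_PRE_TOKENIZER_DEFAULT) or a
    # tuple of regex strings taken from BPE_PRE_TOKENIZERS (e.g. BPE_PRE_TOKENIZERS["llama3"]).
    vocab_pre = entry.get("vocab_pre") or GPT_PRE_TOKENIZER_DEFAULT
    return (vocab_pre,) if isinstance(vocab_pre, str) else tuple(vocab_pre)

Under those assumptions, resolve_pre_tokenizer(get_model_metadata("meta-llama/Meta-Llama-3-8B")) yields the llama3 regex tuple, while the SPM entries resolve to the GPT-2 default regex.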
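The comment block in the diff also floats a longer-term direction: replacing the hand-maintained map with a dynamically persistent registry backed by JSON, HDF5, or SQLite. Purely as an illustration of that idea (not part of this commit; the file name and function names below are invented), a JSON-backed variant could look like:

import json
from pathlib import Path

REGISTRY_PATH = Path("hf_model_registry.json")  # illustrative location only

def save_registry(model_map) -> None:
    # IntEnum fields and tuples serialize natively; any other enum values
    # fall back to their string form via default=str.
    REGISTRY_PATH.write_text(json.dumps(list(model_map), indent=2, default=str))

def load_registry() -> list[dict]:
    # Callers must map the serialized enum values back to HFModelFileType,
    # LLaMaVocabType, etc., and convert lists back to tuples where needed.
    return json.loads(REGISTRY_PATH.read_text())

A real registry would also carry model versions alongside the repo names, as the comment notes, to mitigate name conflicts.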