@@ -992,10 +992,14 @@ class HFModelFileType(IntEnum):
 )

 # NOTE: GPT-2 is the standard default pre-tokenizer for all models
+# NOTE: BERT models inherit from the Byte Level Pre-tokenizer.
+# https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L117
+# https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/bert.rs#L13
 BPE_PRE_TOKENIZERS = {
     # gpt2, olmo, phi (1, 1_5, 2, 3, ...)
     "gpt2": (GPT_PRE_TOKENIZER_DEFAULT,),
     # dbrx
+    # NOTE: PR#6920: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
     "llama3": (
         "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
     ),
@@ -1033,7 +1037,7 @@ class HFModelFileType(IntEnum):
 # This will get out of control if not properly managed.
 # This needs a proper solution. The short-term solution is to manually build a map here.
 # A proper long-term solution would be to build a dynamic registry.
-# The issue is that this requires a mapping or a database.
+# The issue is that this requires a dynamically persistent mapping or a database.
 # Possible solutions are to use JSON, HDF5, or SQLite.
 # Some of these mappings could be dynamically generated, but it's sketchy at best.
 # Model versions should be included along with the model name to mitigate name conflicts.
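As a hypothetical illustration of the SQLite option floated in the comment block above (not part of this commit; the table layout, column names, and the `create_registry`/`register` helpers are invented for this sketch):

```python
import json
import sqlite3


def create_registry(path: str = "registry.db") -> sqlite3.Connection:
    """Create (or open) a minimal persistent registry for tokenizer metadata."""
    conn = sqlite3.connect(path)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS models ("
        " model_repo TEXT PRIMARY KEY,"  # e.g. "meta-llama/Llama-2-7b-hf"
        " model_arch TEXT NOT NULL,"
        " vocab_type TEXT NOT NULL,"     # SPM, BPE, or WPM
        " vocab_pre  TEXT NOT NULL"      # JSON-encoded regex string or tuple
        ")"
    )
    return conn


def register(conn: sqlite3.Connection, entry: dict) -> None:
    """Insert or update one HF_MODEL_MAP-style entry."""
    conn.execute(
        "INSERT OR REPLACE INTO models VALUES (?, ?, ?, ?)",
        (
            entry["model_repo"],
            str(entry["model_arch"]),
            getattr(entry["vocab_type"], "name", str(entry["vocab_type"])),
            json.dumps(entry.get("vocab_pre", ())),
        ),
    )
    conn.commit()
```

Versioned repository names (e.g. "meta-llama/Llama-2-7b-hf") double as natural primary keys, which is one way to address the name-conflict concern raised above.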
@@ -1060,14 +1064,14 @@ class HFModelFileType(IntEnum):
 # - Possible algorithms are WordLevel, BPE, WordPiece, or Unigram
 # - Possible LLaMa tokenizer model types are: None, SPM, BPE, or WPM
 HF_MODEL_MAP = (
-    # Sentence Piece Models
+    # SPM (Sentence Piece Models): Default to Byte Level Pre-tokenization.
     {
         "model_repo": "meta-llama/Llama-2-7b-hf",
         "model_arch": MODEL_ARCH_NAMES[MODEL_ARCH.LLAMA],
         "model_parts": 2,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
     {
@@ -1076,7 +1080,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 3,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
     {
@@ -1085,7 +1089,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 8,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
     {
@@ -1094,35 +1098,37 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
-    # Word Piece Models
+    # WPM (Word Piece Models): Default to Byte Level Pre-tokenization.
+    # NOTE: BERT Normalization and Pre-tokenization rules differ from Byte Level Pre-tokenization.
     {
         "model_repo": "BAAI/bge-small-en-v1.5",
         "model_arch": MODEL_ARCH_NAMES[MODEL_ARCH.BERT],
         "model_parts": 1,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.WPM,
-        "vocab_pre": (),
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "jinaai/jina-embeddings-v2-base-en",
-        "model_arch": MODEL_ARCH.JINA_BERT_V2,
+        "model_arch": MODEL_ARCH_NAMES[MODEL_ARCH.JINA_BERT_V2],
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.WPM,
+        "vocab_pre": GPT_PRE_TOKENIZER_DEFAULT,
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    # Byte Pair Encoding Models
+    # BPE (Byte Pair Encoding Models): Default is Byte Level Pre-tokenization
     {
         "model_repo": "meta-llama/Meta-Llama-3-8B",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 4,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
-        # PR#6920: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
-        "vocab_pre": (
-            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["llama3"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
@@ -1131,7 +1137,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": BPE_PRE_TOKENIZERS[MODEL_ARCH_NAMES[MODEL_ARCH.FALCON]],
+        "vocab_pre": BPE_PRE_TOKENIZERS["falcon"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
@@ -1140,14 +1146,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "[\r\n]",
-            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
-            "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
-            "\\s+$",
-            "[一-龥ࠀ-一가-]+",
-            "\\p{N}+",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["deepseek"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
@@ -1156,13 +1155,7 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "[\r\n]",
-            "\\s?\\p{L}+",
-            "\\s?\\p{P}+",
-            "[一-龥ࠀ-一가-]+",
-            "\\p{N}",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["deepseek-coder"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
@@ -1171,74 +1164,134 @@ class HFModelFileType(IntEnum):
         "model_parts": 2,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "\\s?\\p{L}+",
-            "\\s?\\p{P}+",
-            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["mpt"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
+    #
+    # BPE: STARCODER
+    #
     {
         "model_repo": "bigcode/starcoder2-3b",
         "model_arch": MODEL_ARCH.STARCODER2,
         "model_parts": 1,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "\\p{N}",
-            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-        ),
+        "vocab_pre": BPE_PRE_TOKENIZERS["starcoder"],
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
-        "model_repo": "openai-community/gpt2",
-        "model_arch": MODEL_ARCH.GPT2,
-        "vocab_type": LLaMaVocabType.BPE,
-    },
     {
         "model_repo": "smallcloudai/Refact-1_6-base",
         "model_arch": MODEL_ARCH.REFACT,
+        "model_parts": 1,
+        "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["starcoder"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "CohereForAI/c4ai-command-r-v01",
         "model_arch": MODEL_ARCH.COMMAND_R,
+        "model_parts": 15,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["starcoder"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
+    #
+    # BPE: QWEN
+    #
     {
         "model_repo": "Qwen/Qwen1.5-7B",
         "model_arch": MODEL_ARCH.QWEN2,
+        "model_parts": 4,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["qwen"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
+    },
+    {
+        "model_repo": "stabilityai/stablelm-2-zephyr-1_6b",
+        "model_arch": MODEL_ARCH.STABLELM,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
+        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["qwen"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
+    },
+    #
+    # BPE: GPT-2
+    #
+    {
+        "model_repo": "openai-community/gpt2",
+        "model_arch": MODEL_ARCH.GPT2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
+        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "allenai/OLMo-1.7-7B-hf",
         "model_arch": MODEL_ARCH.OLMO,
+        "model_parts": 6,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
+    {  # NOTE: I don't have access to this model
         "model_repo": "databricks/dbrx-base",
         "model_arch": MODEL_ARCH.DBRX,
+        "model_parts": 0,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
+    {  # NOTE: RoBERTa post processor
         "model_repo": "jinaai/jina-embeddings-v2-base-es",
         "model_arch": MODEL_ARCH.JINA_BERT_V2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
+    {  # NOTE: RoBERTa post processor
         "model_repo": "jinaai/jina-embeddings-v2-base-de",
         "model_arch": MODEL_ARCH.JINA_BERT_V2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    {
+    {  # NOTE: Phi-1 is compatible with GPT-2 arch and vocab
         "model_repo": "microsoft/phi-1",
         "model_arch": MODEL_ARCH.PHI2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
-        "model_repo": "stabilityai/stablelm-2-zephyr-1_6b",
-        "model_arch": MODEL_ARCH.STABLELM,
+        "model_repo": "microsoft/phi-1_5",
+        "model_arch": MODEL_ARCH.PHI2,
+        "model_parts": 1,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
+    },
+    {
+        "model_repo": "microsoft/phi-2",
+        "model_arch": MODEL_ARCH.PHI2,
+        "model_parts": 2,
+        "model_type": HFModelFileType.SFT,
+        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS["gpt2"],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
 )
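For orientation, here is a hypothetical usage sketch (not part of this commit) showing how the resulting HF_MODEL_MAP and BPE_PRE_TOKENIZERS tables could be consumed; the helper name `resolve_vocab_pre` and its signature are invented for illustration:

```python
from typing import Iterable, Tuple


def resolve_vocab_pre(model_repo: str, model_map: Iterable[dict]) -> Tuple[str, ...]:
    """Return the pre-tokenizer regexes recorded for a given repository."""
    for entry in model_map:
        if entry.get("model_repo") == model_repo:
            vocab_pre = entry.get("vocab_pre", ())
            # Entries hold either a single regex string (the GPT-2 default)
            # or a tuple of regexes taken from BPE_PRE_TOKENIZERS.
            if isinstance(vocab_pre, str):
                return (vocab_pre,)
            return tuple(vocab_pre)
    raise KeyError(f"unknown model repo: {model_repo}")


# Example (assuming the map defined in the diff above is importable):
# resolve_vocab_pre("meta-llama/Meta-Llama-3-8B", HF_MODEL_MAP)
# -> BPE_PRE_TOKENIZERS["llama3"]
```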