
Commit 847d49f

Add Phi4 test and fix regex parsing.
Differential Revision: D83641294
Pull Request resolved: #130
1 parent 65e41a9 commit 847d49f

4 files changed (+68, -16)

src/hf_tokenizer.cpp (9 additions, 9 deletions)
@@ -56,7 +56,7 @@ Error HFTokenizer::load(const std::string& path) {
   json parsed_json;
   try {
     parsed_json = json::parse(contents);
-  } catch (const json::exception& e) {
+  } catch (const std::exception& e) {
     TK_LOG(Error, "Error parsing json file: %s", e.what());
     return Error::LoadFailure;
   }
@@ -76,7 +76,7 @@ Error HFTokenizer::load(const std::string& path) {

     // Store for future use.
     special_token_map_.emplace(std::move(special_token_map));
-  } catch (const json::out_of_range& e) {
+  } catch (const std::exception& e) {
     TK_LOG(Info, "Could not parse special tokens: %s", e.what());
     return Error::LoadFailure;
   }
@@ -96,7 +96,7 @@ Error HFTokenizer::load(const std::string& path) {

     auto token_map = TK_UNWRAP(detail::build_token_map(std::move(token_pairs)));
     token_map_.emplace(std::move(token_map));
-  } catch (const json::out_of_range& e) {
+  } catch (const std::exception& e) {
     TK_LOG(Info, "Could not parse tokens: %s", e.what());
     return Error::LoadFailure;
   }
@@ -114,7 +114,7 @@ Error HFTokenizer::load(const std::string& path) {
     } else {
       TK_LOG(Info, "Normalizer field is null, skipping");
     }
-  } catch (const json::out_of_range& e) {
+  } catch (const std::exception& e) {
     // No "Normalizer" field found
     TK_LOG(
         Info,
@@ -129,7 +129,7 @@ Error HFTokenizer::load(const std::string& path) {
             .parse_json(parsed_json.at("pre_tokenizer"))
             .create();
     TK_LOG(Info, "Pretokenizer set up");
-  } catch (const json::out_of_range& e) {
+  } catch (const std::exception& e) {
     TK_LOG(Info, "Could not parse pre_tokenizer: %s", e.what());
     return Error::LoadFailure;
   }
@@ -138,7 +138,7 @@ Error HFTokenizer::load(const std::string& path) {
   try {
     _decoder =
        TokenDecoderConfig().parse_json(parsed_json.at("decoder")).create();
-  } catch (const json::out_of_range&) {
+  } catch (const std::exception&) {
     // No decoder specified
   }

@@ -192,7 +192,7 @@ Error HFTokenizer::load(const std::string& path) {
         "Built merge ranks map with %" PRId64 " entries",
         static_cast<int64_t>(merge_ranks.size()));
     merge_ranks_.emplace(std::move(merge_ranks));
-  } catch (const json::out_of_range& e) {
+  } catch (const std::exception& e) {
     TK_LOG(Error, "Could not parse merges: %s", e.what());
     return Error::LoadFailure;
   }
@@ -211,7 +211,7 @@ Error HFTokenizer::load(const std::string& path) {
   json parsed_config_json;
   try {
     parsed_config_json = json::parse(config_contents);
-  } catch (const json::exception& e) {
+  } catch (const std::exception& e) {
     TK_LOG(Error, "Error parsing model config json json file: %s", e.what());
     return Error::LoadFailure;
   }
@@ -239,7 +239,7 @@ Error HFTokenizer::load(const std::string& path) {
     }
     bos_tok_ = *bos_res;
     eos_tok_ = *eos_res;
-  } catch (const json::out_of_range& e) {
+  } catch (const std::exception& e) {
     TK_LOG(Error, "Could not eos/bos from tokenizer config: %s", e.what());
     return Error::LoadFailure;
   }

src/pre_tokenizer.cpp (43 additions, 7 deletions)
@@ -48,14 +48,15 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
     }

     // Validate invert parameter
-    bool invert_flag = invert ? *invert : false;
-    if (invert_flag) {
+    const bool invert_flag = invert ? *invert : false;
+    const bool delimiter_flag = is_delimiter ? *is_delimiter : false;
+    if (invert_flag && delimiter_flag) {
       throw std::runtime_error(
-          "invert=true is not supported for Split PreTokenizer. Only invert=false is supported.");
+          "invert=true is not supported for Split PreTokenizer with a String pattern.");
     }

-    return PreTokenizer::Ptr(new RegexPreTokenizer(
-        *pattern, is_delimiter ? *is_delimiter : false, behavior_str));
+    return PreTokenizer::Ptr(
+        new RegexPreTokenizer(*pattern, delimiter_flag, behavior_str));
   }
   if (type == "Digits") {
     if (individual_digits) {
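
The first hunk loosens the Split validation: invert=true used to be rejected outright, but it is now rejected only in combination with is_delimiter (which, per the new error message, corresponds to a String pattern). So a config such as {"type": "Split", "pattern": {"Regex": "..."}, "invert": true} now passes validation, while {"type": "Split", "pattern": {"String": " "}, "invert": true} still throws.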
@@ -143,16 +144,51 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {

 // RegexPreTokenizer ///////////////////////////////////////////////////////////

+namespace {
+
+// Make Hugging Face Split patterns RE2-compatible by:
+// 1) removing the negative look-ahead "\s+(?!\S)" (→ "\s+$")
+// 2) expanding the inline case-insensitive contractions
+//    "(?i:'s|'t|'re|'ve|'m|'ll|'d)" into explicit alternations.
+static void replace_all_in_place(
+    std::string& input,
+    const std::string& needle,
+    const std::string& replacement) {
+  if (needle.empty()) {
+    return;
+  }
+  size_t search_pos = 0;
+  while ((search_pos = input.find(needle, search_pos)) != std::string::npos) {
+    input.replace(search_pos, needle.size(), replacement);
+    search_pos += replacement.size();
+  }
+}
+
+static std::string make_re2_compatible(std::string pattern) {
+  const std::string lookahead_trailing_space = R"(\s+(?!\S))";
+  const std::string trailing_space_replacement = R"(\s+$)";
+  replace_all_in_place(
+      pattern, lookahead_trailing_space, trailing_space_replacement);
+  const std::string ci_contractions = R"((?i:'s|'t|'re|'ve|'m|'ll|'d))";
+  const std::string contractions_expanded =
+      "(?:'s|'S|'t|'T|'re|'RE|'ve|'VE|'m|'M|'ll|'LL|'d|'D)";
+  replace_all_in_place(pattern, ci_contractions, contractions_expanded);
+  return pattern;
+}
+
+} // namespace
+
 std::unique_ptr<IRegex> RegexPreTokenizer::create_regex_(
     const std::string& pattern) {
   assert(!pattern.empty());
-  return TK_UNWRAP_THROW(create_regex(pattern));
+  return TK_UNWRAP_THROW(create_regex(make_re2_compatible(pattern)));
 }

 std::vector<std::string> RegexPreTokenizer::pre_tokenize(
     const std::string& input) const {
-  if (!regex_)
+  if (!regex_) {
     return {};
+  }

   std::vector<std::string> results;
   auto matches = regex_->find_all(input);
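
The second hunk is the regex-parsing fix from the commit title. RE2 has no look-around support, so a cl100k-style Split pattern like the one in Phi-4's tokenizer.json, which contains \s+(?!\S), cannot be compiled as-is; make_re2_compatible rewrites that piece to \s+$ and expands the (?i:...) contraction group into explicit alternations before the pattern reaches create_regex. A standalone sketch of the rewrite (the helpers mirror the diff above; the abbreviated sample pattern and the main() harness are ours):

#include <iostream>
#include <string>

static void replace_all_in_place(
    std::string& input,
    const std::string& needle,
    const std::string& replacement) {
  if (needle.empty()) {
    return;
  }
  size_t pos = 0;
  while ((pos = input.find(needle, pos)) != std::string::npos) {
    input.replace(pos, needle.size(), replacement);
    pos += replacement.size();
  }
}

static std::string make_re2_compatible(std::string pattern) {
  // Negative look-ahead -> plain trailing-whitespace anchor.
  replace_all_in_place(pattern, R"(\s+(?!\S))", R"(\s+$)");
  // Inline case-insensitive group -> explicit lower/upper alternations.
  replace_all_in_place(
      pattern,
      R"((?i:'s|'t|'re|'ve|'m|'ll|'d))",
      "(?:'s|'S|'t|'T|'re|'RE|'ve|'VE|'m|'M|'ll|'LL|'d|'D)");
  return pattern;
}

int main() {
  // Abbreviated GPT-4-style Split pattern with both RE2-incompatible pieces.
  const std::string hf_pattern =
      R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|\p{L}+|\s+(?!\S)|\s+)";
  std::cout << make_re2_compatible(hf_pattern) << "\n";
  // Prints: (?:'s|'S|'t|'T|'re|'RE|'ve|'VE|'m|'M|'ll|'LL|'d|'D)|\p{L}+|\s+$|\s+
}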

test/test_hf_tokenizer.py (13 additions, 0 deletions)
@@ -48,3 +48,16 @@ def test_llama3_2_1b(self) -> None:
         tokens = tokenizer.encode(PROMPT)
         cpp_tokens = cpp_tokenizer.encode(PROMPT, bos=1)
         self.assertEqual(tokens, cpp_tokens)
+
+    def test_phi_4_mini(self) -> None:
+        tokenizer = AutoTokenizer.from_pretrained(
+            "software-mansion/react-native-executorch-phi-4-mini"
+        )
+        tokenizer_path = tokenizer.save_pretrained(self.temp_dir.name)[-1]
+
+        cpp_tokenizer = CppHFTokenizer()
+        cpp_tokenizer.load(tokenizer_path)
+
+        tokens = tokenizer.encode(PROMPT)
+        cpp_tokens = cpp_tokenizer.encode(PROMPT)
+        self.assertEqual(tokens, cpp_tokens)
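
The new test mirrors test_llama3_2_1b: it round-trips PROMPT through both the Hugging Face tokenizer and the C++ HFTokenizer (via the CppHFTokenizer binding) and asserts identical token ids, exercising the new make_re2_compatible path end to end. Note it omits the bos argument that the Llama test passes. For orientation, a hypothetical C++ analogue; load() appears in the diff above, but the include path, the encode() signature, and the Result accessors here are assumptions, not quoted from the headers:

#include <pytorch/tokenizers/hf_tokenizer.h> // assumed include path

#include <cstdio>
#include <string>

int main() {
  tokenizers::HFTokenizer tok;
  // Path to the tokenizer.json that save_pretrained() writes in the test.
  if (tok.load("/tmp/phi-4-mini/tokenizer.json") != tokenizers::Error::Ok) {
    std::fprintf(stderr, "load failed\n");
    return 1;
  }
  // Assumed signature: encode(text, bos, eos) -> Result<std::vector<uint64_t>>.
  auto result = tok.encode("What is the capital of France?", /*bos=*/0, /*eos=*/0);
  if (!result.ok()) {
    std::fprintf(stderr, "encode failed\n");
    return 1;
  }
  std::printf("encoded %zu tokens\n", result.get().size());
  return 0;
}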

third-party/targets.bzl (3 additions, 0 deletions)
@@ -12,6 +12,9 @@ def define_common_targets():
         exported_headers = subdir_glob([
             ("llama.cpp-unicode/include", "*.h"),
         ]),
+        compiler_flags = [
+            "-Wno-error=deprecated-declarations",
+        ],
         visibility = ["@EXECUTORCH_CLIENTS", "//pytorch/tokenizers/..."],
     )
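
The Buck target for the vendored llama.cpp-unicode headers now passes -Wno-error=deprecated-declarations, which keeps -Wdeprecated-declarations as a warning rather than an error; presumably something these headers reference is marked deprecated under newer toolchains, and this stops -Werror builds from failing on vendored code.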
