Skip to content

Commit 8b99e2a

Browse files
authored
llama : handle unknown utf8 bytes (#7588)
1 parent 271ff3f commit 8b99e2a

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

llama.cpp

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17940,7 +17940,16 @@ static std::string llama_decode_text(const std::string & text) {
17940 17940

17941 17941
const auto cpts = unicode_cpts_from_utf8(text);
17942 17942
for (const auto cpt : cpts) {
17943-
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
17943+
const auto utf8 = unicode_cpt_to_utf8(cpt);
17944+
try {
17945+
decoded_text += unicode_utf8_to_byte(utf8);
17946+
} catch (const std::out_of_range & e) {
17947+
decoded_text += "[UNK_BYTE_0x";
17948+
for (const auto c : utf8) {
17949+
decoded_text += format("%02x", (uint8_t) c);
17950+
}
17951+
decoded_text += text + "]";
17952+
}
17944 17953
}
17945 17954

17946 17955
return decoded_text;

0 commit comments

Comments
 (0)