Skip to content

Commit e5a260e

Browse files
committed
Add support for UTF-16 in the JSON parser
The previous implementation only supported UTF-8 (up to \u007F) as characters, and all remaining characters up to \u00FF as integers. The new implementation supports all UTF-16 (single) characters.
1 parent b8539ce commit e5a260e

File tree

4 files changed

+64
-9
lines changed

4 files changed

+64
-9
lines changed

src/json/parser.y

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
%{
1919
#include "json_parser.h"
2020

21+
#include <util/unicode.h>
22+
2123
int yyjsonlex();
2224
extern char *yyjsontext;
2325
extern int yyjsonleng; // really an int, not a size_t
@@ -51,7 +53,7 @@ static std::string convert_TOK_STRING()
5153
// \uABCD, i.e. the following four digits are part of this character.
5254
assert(p + 4 < yyjsontext + len - 1);
5355
std::string hex(++p, 4);
54-
result += std::stoi(hex, nullptr, 16);
56+
result += utf16_hex_to_utf8(hex);
5557
p += 3;
5658
break;
5759
}

src/util/unicode.cpp

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ Author: Daniel Kroening, [email protected]
88

99
#include "unicode.h"
1010

11+
#include <codecvt>
12+
#include <cstdint>
1113
#include <cstring>
12-
#include <locale>
1314
#include <iomanip>
15+
#include <locale>
1416
#include <sstream>
15-
#include <cstdint>
17+
18+
#include "invariant.h"
1619

1720
#ifdef _WIN32
1821
#include <util/pragma_push.def>
@@ -315,3 +318,25 @@ std::string utf16_native_endian_to_java(const std::wstring &in)
315318
utf16_native_endian_to_java(ch, result, loc);
316319
return result.str();
317320
}
321+
322+
char16_t utf16_hex_to_utf16_native_endian(const std::string &hex)
323+
{
324+
PRECONDITION(hex.length() == 4);
325+
return std::strtol(hex.c_str(), nullptr, 16);
326+
}
327+
328+
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
329+
{
330+
return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}
331+
.to_bytes(utf16_str);
332+
}
333+
334+
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
335+
{
336+
return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
337+
}
338+
339+
std::string utf16_hex_to_utf8(const std::string &hex)
340+
{
341+
return utf16_native_endian_to_utf8(utf16_hex_to_utf16_native_endian(hex));
342+
}

src/util/unicode.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,17 @@ std::string utf16_native_endian_to_java(const std::wstring &in);
3131

3232
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
3333

34+
char16_t utf16_hex_to_utf16_native_endian(const std::string &hex);
35+
36+
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str);
37+
38+
std::string utf16_native_endian_to_utf8(char16_t utf16_char);
39+
40+
/// \param hex: a representation of a UTF-16 symbol as a four-digit string
41+
/// (e.g.\ "0041" for \\u0041)
42+
/// \return the UTF-8 encoding of the symbol.
43+
std::string utf16_hex_to_utf8(const std::string &hex);
44+
3445
template <typename It>
3546
std::vector<const char *> to_c_str_array(It b, It e)
3647
{

unit/json/json_parser.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,17 +84,21 @@ SCENARIO("Loading JSON files")
8484
}
8585
}
8686
}
87-
GIVEN("A JSON file containing a hexadecimal Unicode character")
87+
GIVEN("A JSON file containing hexadecimal Unicode symbols")
8888
{
8989
temporary_filet unicode_json_file("cbmc_unit_json_parser_unicode", ".json");
9090
const std::string unicode_json_path = unicode_json_file();
9191
{
9292
std::ofstream unicode_json_out(unicode_json_path);
9393
unicode_json_out << "{\n"
94-
<< " \"special character\": \"\\u0001\"\n"
94+
<< " \"one\": \"\\u0001\",\n"
95+
<< " \"latin\": \"\\u0042\",\n"
96+
<< " \"grave\": \"\\u00E0\",\n"
97+
<< " \"trema\": \"\\u00FF\",\n"
98+
<< " \"high\": \"\\uFFFF\"\n"
9599
<< "}\n";
96100
}
97-
WHEN("Loading the JSON file with the special character")
101+
WHEN("Loading the JSON file with the Unicode symbols")
98102
{
99103
jsont unicode_json;
100104
const auto unicode_parse_error =
@@ -105,9 +109,22 @@ SCENARIO("Loading JSON files")
105109
REQUIRE(unicode_json.is_object());
106110

107111
const json_objectt &json_object = to_json_object(unicode_json);
108-
REQUIRE(json_object.find("special character") != json_object.end());
109-
REQUIRE(json_object["special character"].value.size() == 1);
110-
REQUIRE(json_object["special character"].value == "\u0001");
112+
113+
REQUIRE(json_object.find("one") != json_object.end());
114+
REQUIRE(json_object["one"].value.size() == 1);
115+
REQUIRE(json_object["one"].value == "\u0001");
116+
117+
REQUIRE(json_object.find("latin") != json_object.end());
118+
REQUIRE(json_object["latin"].value == "B");
119+
120+
REQUIRE(json_object.find("grave") != json_object.end());
121+
REQUIRE(json_object["grave"].value == "à");
122+
123+
REQUIRE(json_object.find("trema") != json_object.end());
124+
REQUIRE(json_object["trema"].value == "ÿ");
125+
126+
REQUIRE(json_object.find("high") != json_object.end());
127+
REQUIRE(json_object["high"].value == "\uFFFF");
111128
}
112129
}
113130
}

0 commit comments

Comments
 (0)