Skip to content

Commit 040d31b

Browse files
committed
Add support for UTF-16 in the JSON parser
The previous implementation only supported UTF-8 (up to \u007F) as characters, and all remaining characters up to \u00FF as integers. The new implementation supports all UTF-16 (single) characters.
1 parent b8539ce commit 040d31b

File tree

4 files changed

+45
-9
lines changed

4 files changed

+45
-9
lines changed

src/json/parser.y

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
%{
1919
#include "json_parser.h"
2020

21+
#include <util/unicode.h>
22+
2123
int yyjsonlex();
2224
extern char *yyjsontext;
2325
extern int yyjsonleng; // really an int, not a size_t
@@ -51,7 +53,7 @@ static std::string convert_TOK_STRING()
5153
// \uABCD, i.e. the following four digits are part of this character.
5254
assert(p + 4 < yyjsontext + len - 1);
5355
std::string hex(++p, 4);
54-
result += std::stoi(hex, nullptr, 16);
56+
result += utf16_hex_to_utf8(hex);
5557
p += 3;
5658
break;
5759
}

src/util/unicode.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ Author: Daniel Kroening, [email protected]
88

99
#include "unicode.h"
1010

11+
#include <codecvt>
12+
#include <cstdint>
1113
#include <cstring>
12-
#include <locale>
1314
#include <iomanip>
15+
#include <locale>
1416
#include <sstream>
15-
#include <cstdint>
17+
18+
#include "invariant.h"
1619

1720
#ifdef _WIN32
1821
#include <util/pragma_push.def>
@@ -315,3 +318,12 @@ std::string utf16_native_endian_to_java(const std::wstring &in)
315318
utf16_native_endian_to_java(ch, result, loc);
316319
return result.str();
317320
}
321+
322+
std::string utf16_hex_to_utf8(const std::string &hex)
323+
{
324+
PRECONDITION(hex.length() == 4);
325+
const char16_t utf16_char = std::strtol(hex.c_str(), nullptr, 16);
326+
const std::u16string u16(1, utf16_char);
327+
return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}
328+
.to_bytes(u16);
329+
}

src/util/unicode.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ std::string utf16_native_endian_to_java(const std::wstring &in);
3131

3232
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
3333

34+
/// \param hex: a representation of a UTF-16 character as a four-digit string
35+
/// (e.g.\ "0041" for \u0041)
36+
/// \return the UTF-8 encoding of the character.
37+
std::string utf16_hex_to_utf8(const std::string &hex);
38+
3439
template <typename It>
3540
std::vector<const char *> to_c_str_array(It b, It e)
3641
{

unit/json/json_parser.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,17 +84,21 @@ SCENARIO("Loading JSON files")
8484
}
8585
}
8686
}
87-
GIVEN("A JSON file containing a hexadecimal Unicode character")
87+
GIVEN("A JSON file containing hexadecimal Unicode characters")
8888
{
8989
temporary_filet unicode_json_file("cbmc_unit_json_parser_unicode", ".json");
9090
const std::string unicode_json_path = unicode_json_file();
9191
{
9292
std::ofstream unicode_json_out(unicode_json_path);
9393
unicode_json_out << "{\n"
94-
<< " \"special character\": \"\\u0001\"\n"
94+
<< " \"one\": \"\\u0001\",\n"
95+
<< " \"latin\": \"\\u0042\",\n"
96+
<< " \"grave\": \"\\u00E0\",\n"
97+
<< " \"trema\": \"\\u00FF\",\n"
98+
<< " \"high\": \"\\uFFFF\"\n"
9599
<< "}\n";
96100
}
97-
WHEN("Loading the JSON file with the special character")
101+
WHEN("Loading the JSON file with the special characters")
98102
{
99103
jsont unicode_json;
100104
const auto unicode_parse_error =
@@ -105,9 +109,22 @@ SCENARIO("Loading JSON files")
105109
REQUIRE(unicode_json.is_object());
106110

107111
const json_objectt &json_object = to_json_object(unicode_json);
108-
REQUIRE(json_object.find("special character") != json_object.end());
109-
REQUIRE(json_object["special character"].value.size() == 1);
110-
REQUIRE(json_object["special character"].value == "\u0001");
112+
113+
REQUIRE(json_object.find("one") != json_object.end());
114+
REQUIRE(json_object["one"].value.size() == 1);
115+
REQUIRE(json_object["one"].value == "\u0001");
116+
117+
REQUIRE(json_object.find("latin") != json_object.end());
118+
REQUIRE(json_object["latin"].value == "B");
119+
120+
REQUIRE(json_object.find("grave") != json_object.end());
121+
REQUIRE(json_object["grave"].value == "à");
122+
123+
REQUIRE(json_object.find("trema") != json_object.end());
124+
REQUIRE(json_object["trema"].value == "ÿ");
125+
126+
REQUIRE(json_object.find("high") != json_object.end());
127+
REQUIRE(json_object["high"].value == "\uFFFF");
111128
}
112129
}
113130
}

0 commit comments

Comments
 (0)