diff --git a/src/json/parser.y b/src/json/parser.y index e0998fa5448..85df4c0ddfe 100644 --- a/src/json/parser.y +++ b/src/json/parser.y @@ -18,6 +18,8 @@ %{ #include "json_parser.h" +#include + int yyjsonlex(); extern char *yyjsontext; extern int yyjsonleng; // really an int, not a size_t @@ -49,10 +51,11 @@ static std::string convert_TOK_STRING() { // Character in hexadecimal Unicode representation, in the format // \uABCD, i.e. the following four digits are part of this character. - assert(p + 4 < yyjsontext + len - 1); + char *last_hex_digit = p + 4; + assert(last_hex_digit < yyjsontext + len - 1); std::string hex(++p, 4); - result += std::stoi(hex, nullptr, 16); - p += 3; + result += codepoint_hex_to_utf8(hex); + p = last_hex_digit; break; } default:; /* an error */ diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp index 111e95bb581..b879cae7369 100644 --- a/src/util/unicode.cpp +++ b/src/util/unicode.cpp @@ -8,11 +8,14 @@ Author: Daniel Kroening, kroening@kroening.com #include "unicode.h" +#include +#include #include -#include #include +#include #include -#include + +#include "invariant.h" #ifdef _WIN32 #include @@ -315,3 +318,33 @@ std::string utf16_native_endian_to_java(const std::wstring &in) utf16_native_endian_to_java(ch, result, loc); return result.str(); } + +std::string utf16_native_endian_to_utf8(const char16_t utf16_char) +{ + return utf16_native_endian_to_utf8(std::u16string(1, utf16_char)); +} + +std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str) +{ +#ifdef _MSC_VER + // Workaround for Visual Studio bug, see + // https://stackoverflow.com/questions/32055357 + std::wstring wide_string(utf16_str.begin(), utf16_str.end()); + return std::wstring_convert, wchar_t>{} + .to_bytes(wide_string); +#else + return std::wstring_convert, char16_t>{} + .to_bytes(utf16_str); +#endif +} + +char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex) +{ + PRECONDITION(hex.length() == 4); + return std::strtol(hex.c_str(), nullptr, 16); +} + +std::string codepoint_hex_to_utf8(const std::string &hex) +{ + return utf16_native_endian_to_utf8(codepoint_hex_to_utf16_native_endian(hex)); +} diff --git a/src/util/unicode.h b/src/util/unicode.h index 9a0cc116ecb..e317aca8564 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -31,6 +31,26 @@ std::string utf16_native_endian_to_java(const std::wstring &in); std::vector narrow_argv(int argc, const wchar_t **argv_wide); +/// \param utf16_char: UTF-16 character in architecture-native endianness +/// encoding +/// \return UTF-8 encoding of the same codepoint +std::string utf16_native_endian_to_utf8(char16_t utf16_char); + +/// \param utf16_str: UTF-16 string in architecture-native endianness encoding +/// \return UTF-8 encoding of the string +std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str); + +/// \param hex: representation of a BMP codepoint as a four-digit string +/// (e.g.\ "0041" for \\u0041) +/// \return encoding of the codepoint as a single UTF-16 character in +/// architecture-native endianness encoding +char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex); + +/// \param hex: representation of a BMP codepoint as a four-digit string +/// (e.g.\ "0041" for \\u0041) +/// \return UTF-8 encoding of the codepoint +std::string codepoint_hex_to_utf8(const std::string &hex); + template std::vector to_c_str_array(It b, It e) { diff --git a/unit/json/json_parser.cpp b/unit/json/json_parser.cpp index 1da52d01b69..5a78390d326 100644 --- a/unit/json/json_parser.cpp +++ b/unit/json/json_parser.cpp @@ -84,17 +84,22 @@ SCENARIO("Loading JSON files") } } } - GIVEN("A JSON file containing a hexadecimal Unicode character") + GIVEN("A JSON file containing hexadecimal Unicode symbols") { temporary_filet unicode_json_file("cbmc_unit_json_parser_unicode", ".json"); const std::string unicode_json_path = unicode_json_file(); { std::ofstream unicode_json_out(unicode_json_path); unicode_json_out << "{\n" - << " \"special character\": \"\\u0001\"\n" + << " \"one\": \"\\u0001\",\n" + << " \"latin\": \"\\u0042\",\n" + << " \"grave\": \"\\u00E0\",\n" + << " \"trema\": \"\\u00FF\",\n" + << " \"high\": \"\\uFFFF\",\n" + << " \"several\": \"a\\u0041b\\u2FC3\\uFFFF\"\n" << "}\n"; } - WHEN("Loading the JSON file with the special character") + WHEN("Loading the JSON file with the Unicode symbols") { jsont unicode_json; const auto unicode_parse_error = @@ -105,9 +110,25 @@ SCENARIO("Loading JSON files") REQUIRE(unicode_json.is_object()); const json_objectt &json_object = to_json_object(unicode_json); - REQUIRE(json_object.find("special character") != json_object.end()); - REQUIRE(json_object["special character"].value.size() == 1); - REQUIRE(json_object["special character"].value == "\u0001"); + + REQUIRE(json_object.find("one") != json_object.end()); + REQUIRE(json_object["one"].value.size() == 1); + REQUIRE(json_object["one"].value == u8"\u0001"); + + REQUIRE(json_object.find("latin") != json_object.end()); + REQUIRE(json_object["latin"].value == "B"); + + REQUIRE(json_object.find("grave") != json_object.end()); + REQUIRE(json_object["grave"].value == "à"); + + REQUIRE(json_object.find("trema") != json_object.end()); + REQUIRE(json_object["trema"].value == "ÿ"); + + REQUIRE(json_object.find("high") != json_object.end()); + REQUIRE(json_object["high"].value == u8"\uFFFF"); + + REQUIRE(json_object.find("several") != json_object.end()); + REQUIRE(json_object["several"].value == u8"aAb\u2FC3\uFFFF"); } } }