diffblue · tautschnig · May 2, 2019 · Apr 30, 2019 · May 1, 2019 · tautschnig
@@ -18,6 +18,8 @@
 %{
 #include "json_parser.h"
 
+#include <util/unicode.h>
+
 int yyjsonlex();
 extern char *yyjsontext;
 extern int yyjsonleng; // really an int, not a size_t
@@ -49,10 +51,11 @@ static std::string convert_TOK_STRING()
       {
         // Character in hexadecimal Unicode representation, in the format
         // \uABCD, i.e. the following four digits are part of this character.
-        assert(p + 4 < yyjsontext + len - 1);
+        char *last_hex_digit = p + 4;
+        assert(last_hex_digit < yyjsontext + len - 1);
         std::string hex(++p, 4);
-        result += std::stoi(hex, nullptr, 16);
-        p += 3;
+        result += codepoint_hex_to_utf8(hex);
+        p = last_hex_digit;
         break;
       }
       default:; /* an error */

@@ -8,11 +8,14 @@ Author: Daniel Kroening, [email protected]
 
 #include "unicode.h"
 
+#include <codecvt>
+#include <cstdint>
 #include <cstring>
-#include <locale>
 #include <iomanip>
+#include <locale>
 #include <sstream>
-#include <cstdint>
+
+#include "invariant.h"
 
 #ifdef _WIN32
 #include <util/pragma_push.def>
@@ -315,3 +318,33 @@ std::string utf16_native_endian_to_java(const std::wstring &in)
     utf16_native_endian_to_java(ch, result, loc);
   return result.str();
 }
+
+std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
+{
+  return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
+}
+
+std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
+{
+#ifdef _MSC_VER
+  // Workaround for Visual Studio bug, see
+  // https://stackoverflow.com/questions/32055357
+  std::wstring wide_string(utf16_str.begin(), utf16_str.end());
+  return std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>{}
+    .to_bytes(wide_string);
+#else
+  return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}
+    .to_bytes(utf16_str);
+#endif
+}
+
+char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
+{
+  PRECONDITION(hex.length() == 4);
+  return std::strtol(hex.c_str(), nullptr, 16);
+}
+
+std::string codepoint_hex_to_utf8(const std::string &hex)
+{
+  return utf16_native_endian_to_utf8(codepoint_hex_to_utf16_native_endian(hex));
+}
@@ -31,6 +31,26 @@ std::string utf16_native_endian_to_java(const std::wstring &in);
 
 std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
 
+/// \param utf16_char: UTF-16 character in architecture-native endianness
+///   encoding
+/// \return UTF-8 encoding of the same codepoint
+std::string utf16_native_endian_to_utf8(char16_t utf16_char);
+
+/// \param utf16_str: UTF-16 string in architecture-native endianness encoding
+/// \return UTF-8 encoding of the string
+std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str);
+
+/// \param hex: representation of a BMP codepoint as a four-digit string
+///   (e.g.\ "0041" for \\u0041)
+/// \return encoding of the codepoint as a single UTF-16 character in
+///   architecture-native endianness encoding
+char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex);
+
+/// \param hex: representation of a BMP codepoint as a four-digit string
+///   (e.g.\ "0041" for \\u0041)
+/// \return UTF-8 encoding of the codepoint
+std::string codepoint_hex_to_utf8(const std::string &hex);
+
 template <typename It>
 std::vector<const char *> to_c_str_array(It b, It e)
 {

diff --git a/unit/json/json_parser.cpp b/unit/json/json_parser.cpp
@@ -84,17 +84,22 @@ SCENARIO("Loading JSON files")
       }
     }
   }
-  GIVEN("A JSON file containing a hexadecimal Unicode character")
+  GIVEN("A JSON file containing hexadecimal Unicode symbols")
   {
     temporary_filet unicode_json_file("cbmc_unit_json_parser_unicode", ".json");
     const std::string unicode_json_path = unicode_json_file();
     {
       std::ofstream unicode_json_out(unicode_json_path);
       unicode_json_out << "{\n"
-                       << "  \"special character\": \"\\u0001\"\n"
+                       << "  \"one\": \"\\u0001\",\n"
+                       << "  \"latin\": \"\\u0042\",\n"
+                       << "  \"grave\": \"\\u00E0\",\n"
+                       << "  \"trema\": \"\\u00FF\",\n"
+                       << "  \"high\": \"\\uFFFF\",\n"
+                       << "  \"several\": \"a\\u0041b\\u2FC3\\uFFFF\"\n"
                        << "}\n";
     }
-    WHEN("Loading the JSON file with the special character")
+    WHEN("Loading the JSON file with the Unicode symbols")
     {
       jsont unicode_json;
       const auto unicode_parse_error =
@@ -105,9 +110,25 @@ SCENARIO("Loading JSON files")
         REQUIRE(unicode_json.is_object());
 
         const json_objectt &json_object = to_json_object(unicode_json);
-        REQUIRE(json_object.find("special character") != json_object.end());
-        REQUIRE(json_object["special character"].value.size() == 1);
-        REQUIRE(json_object["special character"].value == "\u0001");
+
+        REQUIRE(json_object.find("one") != json_object.end());
+        REQUIRE(json_object["one"].value.size() == 1);
+        REQUIRE(json_object["one"].value == u8"\u0001");
+
+        REQUIRE(json_object.find("latin") != json_object.end());
+        REQUIRE(json_object["latin"].value == "B");
+
+        REQUIRE(json_object.find("grave") != json_object.end());
+        REQUIRE(json_object["grave"].value == "à");
+
+        REQUIRE(json_object.find("trema") != json_object.end());
+        REQUIRE(json_object["trema"].value == "ÿ");
+
+        REQUIRE(json_object.find("high") != json_object.end());
+        REQUIRE(json_object["high"].value == u8"\uFFFF");
+
+        REQUIRE(json_object.find("several") != json_object.end());
+        REQUIRE(json_object["several"].value == u8"aAb\u2FC3\uFFFF");
       }
     }
   }