Don't write 0 to the end of strings in embind (#10844)

jgravelle-google · web-flow · commit cfd03f305525 · 2020-05-13T18:48:19.000-07:00
Writing the terminator is unsafe because that byte is one past the end of the
string, and in threaded contexts the write can cause data races.
diff --git a/src/embind/embind.js b/src/embind/embind.js
@@ -635,20 +635,13 @@ var LibraryEmbind = {
 
             var str;
             if (stdStringIsUTF8) {
-                //ensure null termination at one-past-end byte if not present yet
-                var endChar = HEAPU8[value + 4 + length];
-                var endCharSwap = 0;
-                if (endChar != 0) {
-                    endCharSwap = endChar;
-                    HEAPU8[value + 4 + length] = 0;
-                }
-
                 var decodeStartPtr = value + 4;
                 // Looping here to support possible embedded '0' bytes
                 for (var i = 0; i <= length; ++i) {
                     var currentBytePtr = value + 4 + i;
-                    if (HEAPU8[currentBytePtr] == 0) {
-                        var stringSegment = UTF8ToString(decodeStartPtr);
+                    if (HEAPU8[currentBytePtr] == 0 || i == length) {
+                        var maxRead = currentBytePtr - decodeStartPtr;
+                        var stringSegment = UTF8ToString(decodeStartPtr, maxRead);
                         if (str === undefined) {
                             str = stringSegment;
                         } else {
@@ -658,10 +651,6 @@ var LibraryEmbind = {
                         decodeStartPtr = currentBytePtr + 1;
                     }
                 }
-
-                if (endCharSwap != 0) {
-                    HEAPU8[value + 4 + length] = endCharSwap;
-                }
             } else {
                 var a = new Array(length);
                 for (var i = 0; i < length; ++i) {
@@ -754,20 +743,14 @@ var LibraryEmbind = {
             var length = HEAPU32[value >> 2];
             var HEAP = getHeap();
             var str;
-            // Ensure null termination at one-past-end byte if not present yet
-            var endChar = HEAP[(value + 4 + length * charSize) >> shift];
-            var endCharSwap = 0;
-            if (endChar != 0) {
-                endCharSwap = endChar;
-                HEAP[(value + 4 + length * charSize) >> shift] = 0;
-            }
 
             var decodeStartPtr = value + 4;
             // Looping here to support possible embedded '0' bytes
             for (var i = 0; i <= length; ++i) {
                 var currentBytePtr = value + 4 + i * charSize;
-                if (HEAP[currentBytePtr >> shift] == 0) {
-                    var stringSegment = decodeString(decodeStartPtr);
+                if (HEAP[currentBytePtr >> shift] == 0 || i == length) {
+                    var maxReadBytes = currentBytePtr - decodeStartPtr;
+                    var stringSegment = decodeString(decodeStartPtr, maxReadBytes);
                     if (str === undefined) {
                         str = stringSegment;
                     } else {
@@ -778,10 +761,6 @@ var LibraryEmbind = {
                 }
             }
 
-            if (endCharSwap != 0) {
-                HEAP[(value + 4 + length * charSize) >> shift] = endCharSwap;
-            }
-
             _free(value);
 
             return str;
diff --git a/src/runtime_strings_extra.js b/src/runtime_strings_extra.js
@@ -39,7 +39,7 @@ var UTF16Decoder = typeof TextDecoder !== 'undefined' ? new TextDecoder('utf-16l
 #endif // TEXTDECODER
 #endif // TEXTDECODER == 2
 
-function UTF16ToString(ptr) {
+function UTF16ToString(ptr, maxBytesToRead) {
 #if ASSERTIONS
   assert(ptr % 2 == 0, 'Pointer passed to UTF16ToString must be aligned to two bytes!');
 #endif
@@ -48,7 +48,10 @@ function UTF16ToString(ptr) {
   // TextDecoder needs to know the byte length in advance, it doesn't stop on null terminator by itself.
   // Also, use the length info to avoid running tiny strings through TextDecoder, since .subarray() allocates garbage.
   var idx = endPtr >> 1;
-  while (HEAP16[idx]) ++idx;
+  var maxIdx = idx + maxBytesToRead / 2;
+  // If maxBytesToRead is not passed explicitly, it will be undefined, so maxIdx is NaN and
+  // the bound check !(idx >= maxIdx) always evaluates to true. This saves on code size.
+  while (!(idx >= maxIdx) && HEAPU16[idx]) ++idx;
   endPtr = idx << 1;
 
 #if TEXTDECODER != 2
@@ -64,7 +67,7 @@ function UTF16ToString(ptr) {
     var str = '';
     while (1) {
       var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
-      if (codeUnit == 0) return str;
+      if (codeUnit == 0 || i == maxBytesToRead / 2) return str;
       ++i;
       // fromCharCode constructs a character from a UTF-16 code unit, so we can pass the UTF16 string right through.
       str += String.fromCharCode(codeUnit);
@@ -117,16 +120,18 @@ function lengthBytesUTF16(str) {
   return str.length*2;
 }
 
-function UTF32ToString(ptr) {
+function UTF32ToString(ptr, maxBytesToRead) {
 #if ASSERTIONS
   assert(ptr % 4 == 0, 'Pointer passed to UTF32ToString must be aligned to four bytes!');
 #endif
   var i = 0;
 
   var str = '';
-  while (1) {
+  // If maxBytesToRead is not passed explicitly, it will be undefined, so the limit is NaN and
+  // the loop condition !(i >= NaN) always evaluates to true. This saves on code size.
+  while (!(i >= maxBytesToRead / 4)) {
     var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
-    if (utf32 == 0) return str;
+    if (utf32 == 0) break;
     ++i;
     // Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
     // See http://unicode.org/faq/utf_bom.html#utf16-3
@@ -137,6 +142,7 @@ function UTF32ToString(ptr) {
       str += String.fromCharCode(utf32);
     }
   }
+  return str;
 }
 
 // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',