protobufjs · adamfaulkner · Apr 17, 2025 · Apr 20, 2025 · Apr 20, 2025
diff --git a/bench/utf8_bench.js b/bench/utf8_bench.js
@@ -0,0 +1,76 @@
+"use strict";
+
+var newSuite = require("./suite");
+var utf8 = require("../lib/utf8");
+
+const textDecoder = new TextDecoder("utf8");
+const textEncoder = new TextEncoder("utf8");
+
+const sizes = {
+    "very small": 7,
+    small: 20,
+    medium: 100,
+    large: 1000,
+};
+
+// Generates a random unicode string in the Basic Multilingual Plane, as a Uint8Array.
+function generateUnicodeBuffer(length) {
+    let unicodeString = "";
+    const minUnicode = 0x0020; // Space
+    const maxUnicode = 0xFFFF; // Last code point in Basic Multilingual Plane.
+    for (let i = 0; i < length; i++) {
+        const randomCodePoint = Math.floor(Math.random() * (maxUnicode - minUnicode + 1)) + minUnicode;
+
+        // Convert the code point to a character and append it
+        unicodeString += String.fromCharCode(randomCodePoint);
+    }
+
+    // Multi-byte characters make the encoding longer than `length` bytes, so truncate it to `length` bytes (possibly mid-character).
+    return textEncoder.encode(unicodeString).subarray(0, length);
+}
+
+// Generates a random ascii string, as a Uint8Array.
+function generateAsciiBuffer(length) {
+    let asciiString = "";
+    const minAscii = 32;
+    const maxAscii = 126;
+    for (let i = 0; i < length; i++) {
+        const randomCodePoint = Math.floor(Math.random() * (maxAscii - minAscii + 1)) + minAscii;
+        asciiString += String.fromCharCode(randomCodePoint);
+    }
+    return textEncoder.encode(asciiString);
+}
+
+const bufferGeneratorFunctions = {
+    ascii: generateAsciiBuffer,
+    nonAscii: generateUnicodeBuffer,
+};
+
+
+// Define Suites
+
+for (const [size, length] of Object.entries(sizes)) {
+    for (const [stringType, generatorFunction] of Object.entries(bufferGeneratorFunctions)) {
+        const buffer = generatorFunction(length);
+
+        newSuite(`${stringType} decoding - ${size} strings (${length} bytes)`)
+            .add("Fallback implementation", function () {
+                utf8._utf8_decode_fallback(buffer, 0, buffer.length);
+            })
+            .add("Ascii optimized implementation", function () {
+                utf8._ascii_decode_unrolled(buffer, 0, buffer.length);
+            })
+            .add("Optimized implementation", function () {
+                utf8.read(buffer, 0, buffer.length);
+            })
+            .add("Node Buffer.toString", function () {
+                const nodeBuffer = Buffer.from(buffer);
+                nodeBuffer.toString("utf8", 0, buffer.length);
+            })
+            .add("TextDecoder", function () {
+                textDecoder.decode(buffer);
+            })
+            .run();
+    }
+}
+
diff --git a/index.d.ts b/index.d.ts
@@ -2498,7 +2498,16 @@ export namespace util {
      */
     function pool(alloc: PoolAllocator, slice: PoolSlicer, size?: number): PoolAllocator;
 
-    /** A minimal UTF8 implementation for number arrays. */
+    /**
+     * A minimal UTF8 implementation for Uint8Arrays.
+     *
+     * This implementation uses a combination of techniques for optimal performance:
+     * - TextDecoder (when available) for strings longer than 24 bytes
+     * - 4-byte unrolled decoding for shorter ASCII-only content, with a manual fallback for non-ASCII
+     * - Inspired by the approach taken in avsc:
+     * https://github.com/mtth/avsc/blob/91d653f72906102448a059cb81692177bb678f52/lib/utils.js#L796
+     *
+     */
     namespace utf8 {
 
         /**
@@ -2509,7 +2518,16 @@ export namespace util {
         function length(string: string): number;
 
         /**
-         * Reads UTF8 bytes as a string.
+         * Reads UTF8 bytes as a string, choosing the fastest available
+         * decoding strategy for the input:
+         *
+         * - Special case the empty string
+         * - If the string is long and TextDecoder is available, use TextDecoder
+         * - If the string is ASCII only, use ascii_decode_unrolled
+         * - Otherwise, use utf8_decode_fallback
+         *
+         * See the code in `bench/utf8_bench.js` if attempting to tune this code.
+         *
          * @param buffer Source buffer
          * @param start Source start
          * @param end Source end

diff --git a/lib/utf8/index.js b/lib/utf8/index.js
@@ -1,12 +1,23 @@
 "use strict";
 
 /**
- * A minimal UTF8 implementation for number arrays.
+ * A minimal UTF8 implementation for Uint8Arrays.
+ *
+ * This implementation uses a combination of techniques for optimal performance:
+ * - TextDecoder (when available) for strings longer than 24 bytes
+ * - 4-byte unrolled decoding for shorter ASCII-only content, with a manual fallback for non-ASCII
+ * - Inspired by the approach taken in avsc:
+ *   https://github.com/mtth/avsc/blob/91d653f72906102448a059cb81692177bb678f52/lib/utils.js#L796
+ *
  * @memberof util
  * @namespace
  */
 var utf8 = exports;
 
+// TextDecoder is unavailable in IE and pre-2017 browsers. ignoreBOM: true keeps a leading U+FEFF instead of stripping it.
+var TEXT_DECODER_AVAILABLE = typeof TextDecoder !== "undefined";
+var textDecoder = TEXT_DECODER_AVAILABLE ? new TextDecoder("utf-8", { ignoreBOM: true }) : null;
+
 /**
  * Calculates the UTF8 byte length of a string.
  * @param {string} string String
@@ -30,20 +41,13 @@ utf8.length = function utf8_length(string) {
     return len;
 };
 
-/**
- * Reads UTF8 bytes as a string.
- * @param {Uint8Array} buffer Source buffer
- * @param {number} start Source start
- * @param {number} end Source end
- * @returns {string} String read
- */
-utf8.read = function utf8_read(buffer, start, end) {
-    if (end - start < 1) {
-        return "";
-    }
+// Manually decodes UTF8 bytes to string. This function supports old browsers
+// without TextDecoder.
+function utf8_decode_fallback(buffer, start, end, initialStr, initialPos) {
+    var str = initialStr || "";
+    var i = initialPos || start;
 
-    var str = "";
-    for (var i = start; i < end;) {
+    for (; i < end;) {
         var t = buffer[i++];
         if (t <= 0x7F) {
             str += String.fromCharCode(t);
@@ -59,6 +63,92 @@ utf8.read = function utf8_read(buffer, start, end) {
     }
 
     return str;
+}
+
+// Export fallback function for direct benchmarking.
+utf8._utf8_decode_fallback = utf8_decode_fallback;
+
+// Fast path for ASCII-only input: decodes 4 bytes per iteration and hands off to
+// utf8_decode_fallback (with the text decoded so far) at the first non-ASCII byte.
+// Works in all browsers, but only beats TextDecoder on short ASCII strings.
+function ascii_decode_unrolled(buffer, start, end, initialStr, initialPos) {
+    var str = initialStr || "";
+    var i = initialPos || start;
+
+    // Process 4 bytes at a time when possible
+    for (; i + 3 < end; i += 4) {
+        var a = buffer[i], b = buffer[i + 1], c = buffer[i + 2], d = buffer[i + 3];
+
+        // Check all 4 bytes at once for non-ASCII
+        if ((a | b | c | d) & 0x80) {
+            // Non-ASCII character detected, fall back to the generic utf8 implementation
+            return utf8_decode_fallback(buffer, start, end, str, i);
+        }
+        // Process 4 ASCII bytes at once
+        str += String.fromCharCode(
+            a, b, c, d
+        );
+    }
+
+    // Handle remaining ASCII bytes one by one
+    for (; i < end; i++) {
+        var t = buffer[i];
+        if (t > 0x7F) {
+            // Non-ASCII character detected, fall back to the generic utf8 implementation
+            return utf8_decode_fallback(buffer, start, end, str, i);
+        }
+        str += String.fromCharCode(t);
+    }
+
+    return str;
+}
+
+// Export ascii function for direct benchmarking.
+utf8._ascii_decode_unrolled = ascii_decode_unrolled;
+
+
+// Slices bytes according to a start and an end. This avoids creating a new
+// Uint8Array if start and end already correspond to the start and end of the
+// input.
+//
+// This is an important optimization because `src/reader:Reader` will
+// often create a subarray immediately before passing it to utf8_read. Creating
+// an additional subarray object is expensive and not useful.
+function subarray(buffer, start, end) {
+    if (start === 0 && end === buffer.length) {
+        return buffer;
+    }
+    return buffer.subarray(start, end);
+}
+
+/**
+ * Reads UTF8 bytes as a string. This picks the fastest applicable
+ * approach of the above implementations:
+ *
+ * - Special case the empty string
+ * - If the string is long and TextDecoder is available, use TextDecoder
+ * - If the string is ASCII only, use ascii_decode_unrolled
+ * - Otherwise, use utf8_decode_fallback
+ *
+ * See the code in `bench/utf8_bench.js` if attempting to tune this code.
+ *
+ * @param {Uint8Array} buffer Source buffer
+ * @param {number} start Source start
+ * @param {number} end Source end
+ * @returns {string} String read
+ */
+utf8.read = function utf8_read(buffer, start, end) {
+    if (end - start < 1) {
+        return "";
+    }
+
+    // Use TextDecoder for inputs longer than 24 bytes (end - start counts bytes, not characters)
+    if (end - start > 24 && TEXT_DECODER_AVAILABLE) {
+        return textDecoder.decode(subarray(buffer, start, end));
+    }
+
+    return ascii_decode_unrolled(buffer, start, end);
+
 };
 
 /**

diff --git a/src/reader.js b/src/reader.js
@@ -327,8 +327,21 @@ Reader.prototype.bytes = function read_bytes() {
  * @returns {string} Value read
  */
 Reader.prototype.string = function read_string() {
-    var bytes = this.bytes();
-    return utf8.read(bytes, 0, bytes.length);
+    // Note that we could simply use the `.bytes()` function. However, creating
+    // slices of a Uint8Array tends to be pretty expensive. If we instead just
+    // call utf8.read with appropriate start and end indices, we can often
+    // avoid creating one of these slices and save some time.
+    var length = this.uint32(),
+        start  = this.pos,
+        end    = this.pos + length;
+
+    /* istanbul ignore if */
+    if (end > this.len)
+        throw indexOutOfRange(this, length);
+
+    this.pos += length;
+
+    return utf8.read(this.buf, start, end);
 };
 
 /**