diff --git a/bench/utf8_bench.js b/bench/utf8_bench.js new file mode 100644 index 000000000..265e515dc --- /dev/null +++ b/bench/utf8_bench.js @@ -0,0 +1,76 @@ +"use strict"; + +var newSuite = require("./suite"); +var utf8 = require("../lib/utf8"); + +const textDecoder = new TextDecoder("utf8"); +const textEncoder = new TextEncoder("utf8"); + +const sizes = { + "very small": 7, + small: 20, + medium: 100, + large: 1000, +}; + +// Generates a random unicode string in the Basic Multilingual Plane, as a Uint8Array. +function generateUnicodeBuffer(length) { + let unicodeString = ""; + const minUnicode = 0x0020; // Space + const maxUnicode = 0xD7FF; // Last code point below the surrogate range; lone surrogates would all encode as U+FFFD. + for (let i = 0; i < length; i++) { + const randomCodePoint = Math.floor(Math.random() * (maxUnicode - minUnicode + 1)) + minUnicode; + + // Convert the code point to a character and append it + unicodeString += String.fromCharCode(randomCodePoint); + } + + // Slice it so we end up with exactly `length` bytes. NOTE: this can cut the final multi-byte sequence short. + return textEncoder.encode(unicodeString).subarray(0, length); +} + +// Generates a random ascii string, as a Uint8Array. 
+function generateAsciiBuffer(length) { + let asciiString = ""; + const minAscii = 32; + const maxAscii = 126; + for (let i = 0; i < length; i++) { + const randomCodePoint = Math.floor(Math.random() * (maxAscii - minAscii + 1)) + minAscii; + asciiString += String.fromCharCode(randomCodePoint); + } + return textEncoder.encode(asciiString); +} + +const bufferGeneratorFunctions = { + ascii: generateAsciiBuffer, + nonAscii: generateUnicodeBuffer, +}; + + +// Define Suites + +for (const [size, length] of Object.entries(sizes)) { + for (const [stringType, generatorFunction] of Object.entries(bufferGeneratorFunctions)) { + const buffer = generatorFunction(length); + + newSuite(`${stringType} decoding - ${size} strings (${length} bytes)`) + .add("Fallback implementation", function () { + utf8._utf8_decode_fallback(buffer, 0, buffer.length); + }) + .add("Ascii optimized implementation", function () { + utf8._ascii_decode_unrolled(buffer, 0, buffer.length); + }) + .add("Optimized implementation", function () { + utf8.read(buffer, 0, buffer.length); + }) + .add("Node Buffer.toString", function () { + const nodeBuffer = Buffer.from(buffer.buffer, buffer.byteOffset, buffer.byteLength); // zero-copy view, so only decoding is timed + nodeBuffer.toString("utf8", 0, buffer.length); + }) + .add("TextDecoder", function () { + textDecoder.decode(buffer); + }) + .run(); + } +} + diff --git a/index.d.ts b/index.d.ts index 6e211631d..de500888d 100644 --- a/index.d.ts +++ b/index.d.ts @@ -2498,7 +2498,16 @@ export namespace util { */ function pool(alloc: PoolAllocator, slice: PoolSlicer, size?: number): PoolAllocator; - /** A minimal UTF8 implementation for number arrays. */ + /** + * A minimal UTF8 implementation for Uint8Arrays. 
+ * + * This implementation uses a combination of techniques for optimal performance: + * - TextDecoder for longer strings (more than 24 bytes), when available + * - 4-byte unrolling for short ASCII-only content + * - Inspired by the approach taken in avsc: + * https://github.com/mtth/avsc/blob/91d653f72906102448a059cb81692177bb678f52/lib/utils.js#L796 + * + */ namespace utf8 { /** @@ -2509,7 +2518,16 @@ export namespace util { function length(string: string): number; /** - * Reads UTF8 bytes as a string. + * Reads UTF8 bytes as a string. This attempts to take the most optimal + * approach of the above implementations: + * + * - Special case the empty string + * - If the string is long and TextDecoder is available, use TextDecoder + * - If the string is ASCII only, use ascii_decode_unrolled + * - Otherwise, use utf8_decode_fallback + * + * See the code in `bench/utf8_bench.js` if attempting to tune this code. + * * @param buffer Source buffer * @param start Source start * @param end Source end diff --git a/lib/utf8/index.js b/lib/utf8/index.js index bb4a0e86b..db45a364d 100644 --- a/lib/utf8/index.js +++ b/lib/utf8/index.js @@ -1,12 +1,23 @@ "use strict"; /** - * A minimal UTF8 implementation for number arrays. + * A minimal UTF8 implementation for Uint8Arrays. + * + * This implementation uses a combination of techniques for optimal performance: + * - TextDecoder for longer strings (more than 24 bytes), when available + * - 4-byte unrolling for short ASCII-only content + * - Inspired by the approach taken in avsc: + * https://github.com/mtth/avsc/blob/91d653f72906102448a059cb81692177bb678f52/lib/utils.js#L796 + * * @memberof util * @namespace */ var utf8 = exports; +// TextDecoder is not available in IE or browsers from before 2017. ignoreBOM keeps a leading U+FEFF in the output, as the manual decoders do. +var TEXT_DECODER_AVAILABLE = typeof TextDecoder !== "undefined"; +var textDecoder = TEXT_DECODER_AVAILABLE ? new TextDecoder("utf-8", { ignoreBOM: true }) : null; + /** * Calculates the UTF8 byte length of a string. 
* @param {string} string String @@ -30,20 +41,13 @@ utf8.length = function utf8_length(string) { return len; }; -/** - * Reads UTF8 bytes as a string. - * @param {Uint8Array} buffer Source buffer - * @param {number} start Source start - * @param {number} end Source end - * @returns {string} String read - */ -utf8.read = function utf8_read(buffer, start, end) { - if (end - start < 1) { - return ""; - } +// Manually decodes the UTF8 bytes in [start, end) to a string; works in old browsers +// without TextDecoder. initialStr/initialPos resume decoding after an already decoded prefix. +function utf8_decode_fallback(buffer, start, end, initialStr, initialPos) { + var str = initialStr || ""; + var i = initialPos || start; - var str = ""; - for (var i = start; i < end;) { + for (; i < end;) { var t = buffer[i++]; if (t <= 0x7F) { str += String.fromCharCode(t); @@ -59,6 +63,92 @@ utf8.read = function utf8_read(buffer, start, end) { } return str; +} + +// Export fallback function for direct benchmarking. +utf8._utf8_decode_fallback = utf8_decode_fallback; + +// Fast path for ASCII-only strings. This falls back to utf8_decode_fallback if +// it encounters a non-ASCII character. This function works in all browsers, +// but will only be faster than TextDecoder on short ASCII strings. 
+function ascii_decode_unrolled(buffer, start, end, initialStr, initialPos) { + var str = initialStr || ""; + var i = initialPos || start; + + // process 4 bytes at a time when possible + for (; i + 3 < end; i += 4) { + var a = buffer[i], b = buffer[i + 1], c = buffer[i + 2], d = buffer[i + 3]; + + // Check all 4 bytes at once for non-ASCII + if ((a | b | c | d) & 0x80) { + // Non-ASCII character detected, fall back to the generic utf8 implementation + return utf8_decode_fallback(buffer, start, end, str, i); + } + // Process 4 ASCII bytes at once + str += String.fromCharCode( + a, b, c, d + ); + } + + // Handle remaining ASCII bytes one by one + for (; i < end; i++) { + var t = buffer[i]; + if (t > 0x7F) { + // Non-ASCII character detected, fall back to the generic utf8 implementation + return utf8_decode_fallback(buffer, start, end, str, i); + } + str += String.fromCharCode(t); + } + + return str; +} + +// Export ascii function for direct benchmarking. +utf8._ascii_decode_unrolled = ascii_decode_unrolled; + + +// Slices bytes according to a start and an end. This avoids creating a new +// Uint8Array if start and end already correspond to the start and end of the +// input. +// +// This is an important optimization because `src/reader:Reader` will +// often create a subarray immediately before passing it to utf8_read. Creating +// an additional subarray object is expensive and not useful. +function subarray(buffer, start, end) { + if (start === 0 && end === buffer.length) { + return buffer; + } + return buffer.subarray(start, end); +} + +/** + * Reads UTF8 bytes as a string. This attempts to take the most optimal + * approach of the above implementations: + * + * - Special case the empty string + * - If the string is long and TextDecoder is available, use TextDecoder + * - If the string is ASCII only, use ascii_decode_unrolled + * - Otherwise, use utf8_decode_fallback + * + * See the code in `bench/utf8_bench.js` if attempting to tune this code. 
+ * + * @param {Uint8Array} buffer Source buffer + * @param {number} start Source start + * @param {number} end Source end + * @returns {string} String read + */ +utf8.read = function utf8_read(buffer, start, end) { + if (end - start < 1) { + return ""; + } + + // Use TextDecoder for typed arrays longer than 24 bytes; plain number arrays have no subarray() and must be decoded manually + if (end - start > 24 && TEXT_DECODER_AVAILABLE && buffer instanceof Uint8Array) { + return textDecoder.decode(subarray(buffer, start, end)); + } + + return ascii_decode_unrolled(buffer, start, end); + }; /** diff --git a/src/reader.js b/src/reader.js index b4fbf291e..045869717 100644 --- a/src/reader.js +++ b/src/reader.js @@ -327,8 +327,21 @@ Reader.prototype.bytes = function read_bytes() { * @returns {string} Value read */ Reader.prototype.string = function read_string() { - var bytes = this.bytes(); - return utf8.read(bytes, 0, bytes.length); + // Note that we could simply use the `.bytes()` function. However, creating + // slices of a Uint8Array tends to be pretty expensive. If we instead just + // call utf8.read with appropriate start and end indices, we can often + // avoid creating one of these slices and save some time. + var length = this.uint32(), + start = this.pos, + end = this.pos + length; + + /* istanbul ignore if */ + if (end > this.len) + throw indexOutOfRange(this, length); + + this.pos += length; + + return utf8.read(this.buf, start, end); }; /**