Skip to content

Optimize decoding utf8 buffers to strings. #2062

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions bench/utf8_bench.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"use strict";

var newSuite = require("./suite");
var utf8 = require("../lib/utf8");

const textDecoder = new TextDecoder("utf8");
const textEncoder = new TextEncoder("utf8");

const sizes = {
"very small": 7,
small: 20,
medium: 100,
large: 1000,
};

// Generates random, well-formed UTF-8 for characters in the Basic Multilingual
// Plane and returns it as a Uint8Array of exactly `length` bytes.
//
// Two rules keep the output valid UTF-8, so every decoder being benchmarked is
// fed the same well-defined input:
// - UTF-16 surrogates (0xD800-0xDFFF) are never picked. A lone surrogate is
//   encoded by TextEncoder as U+FFFD rather than as the requested character.
// - The byte budget is filled with whole characters only. Once fewer than three
//   bytes remain, the character is drawn from a range whose encoded size is
//   exactly the number of bytes left, so no multi-byte sequence is truncated.
function generateUnicodeBuffer(length) {
    const minUnicode = 0x0020; // Space
    const maxUnicode = 0xFFFF; // Last code point in Basic Multilingual Plane.
    const surrogateStart = 0xD800;
    const surrogateCount = 0x0800; // 0xD800-0xDFFF inclusive

    // Returns a random integer in [low, high].
    const randomBetween = (low, high) => Math.floor(Math.random() * (high - low + 1)) + low;

    let unicodeString = "";
    let remaining = length;
    while (remaining > 0) {
        let codePoint;
        if (remaining >= 3) {
            // Any BMP character fits. Sample from a range that is shorter by the
            // size of the surrogate block, then shift values past the block.
            codePoint = randomBetween(minUnicode, maxUnicode - surrogateCount);
            if (codePoint >= surrogateStart) {
                codePoint += surrogateCount;
            }
        } else if (remaining === 2) {
            // Exactly two bytes left: pick a character that encodes to two bytes.
            codePoint = randomBetween(0x0080, 0x07FF);
        } else {
            // One byte left: pick a single-byte (ASCII) character.
            codePoint = randomBetween(minUnicode, 0x007F);
        }

        unicodeString += String.fromCharCode(codePoint);

        // UTF-8 size of a BMP code point: 1, 2 or 3 bytes.
        if (codePoint < 0x80) {
            remaining -= 1;
        } else if (codePoint < 0x800) {
            remaining -= 2;
        } else {
            remaining -= 3;
        }
    }

    return textEncoder.encode(unicodeString);
}

// Builds a Uint8Array holding `length` random printable ASCII characters
// (codes 32 through 126), one byte per character.
function generateAsciiBuffer(length) {
    const firstPrintable = 32;
    const lastPrintable = 126;
    const span = lastPrintable - firstPrintable + 1;

    const characters = [];
    for (let count = 0; count < length; count += 1) {
        const code = Math.floor(Math.random() * span) + firstPrintable;
        characters.push(String.fromCharCode(code));
    }

    return textEncoder.encode(characters.join(""));
}

// Maps the content label used in suite names to the generator for that input.
const bufferGeneratorFunctions = {
    ascii: generateAsciiBuffer,
    nonAscii: generateUnicodeBuffer,
};


// Define Suites
//
// One suite is created and run for every (size, content kind) combination.
// Each suite decodes the same pre-generated buffer with every implementation.

Object.entries(sizes).forEach(([sizeLabel, byteCount]) => {
    Object.entries(bufferGeneratorFunctions).forEach(([contentKind, generate]) => {
        const input = generate(byteCount);
        const suiteName = `${contentKind} decoding - ${sizeLabel} strings (${byteCount} bytes)`;

        newSuite(suiteName)
            .add("Fallback implementation", function () {
                utf8._utf8_decode_fallback(input, 0, input.length);
            })
            .add("Ascii optimized implementation", function () {
                utf8._ascii_decode_unrolled(input, 0, input.length);
            })
            .add("Optimized implementation", function () {
                utf8.read(input, 0, input.length);
            })
            .add("Node Buffer.toString", function () {
                // The Buffer is created inside the timed function, as in the
                // original, so its construction is part of the measurement.
                const nodeBuffer = Buffer.from(input);
                nodeBuffer.toString("utf8", 0, input.length);
            })
            .add("TextDecoder", function () {
                textDecoder.decode(input);
            })
            .run();
    });
});

22 changes: 20 additions & 2 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2498,7 +2498,16 @@ export namespace util {
*/
function pool(alloc: PoolAllocator, slice: PoolSlicer, size?: number): PoolAllocator;

/** A minimal UTF8 implementation for number arrays. */
/**
* A minimal UTF8 implementation for Uint8Arrays.
*
* This implementation uses a combination of techniques for optimal performance:
* - TextDecoder, when available, for strings longer than 24 bytes
* - 4-byte unrolling for shorter ASCII-only content
* - Manual byte-by-byte decoding for shorter non-ASCII content
* - Inspired by the approach taken in avsc:
* https://github.com/mtth/avsc/blob/91d653f72906102448a059cb81692177bb678f52/lib/utils.js#L796
*
*/
namespace utf8 {

/**
Expand All @@ -2509,7 +2518,16 @@ export namespace util {
function length(string: string): number;

/**
* Reads UTF8 bytes as a string.
* Reads UTF8 bytes as a string. This attempts to take the most optimal
* approach of the above implementations:
*
* - Special case the empty string
* - If the string is long and TextDecoder is available, use TextDecoder
* - If the string is ASCII only, use ascii_decode_unrolled
* - Otherwise, use utf8_decode_fallback
*
* See the code in `bench/utf8_bench.js` if attempting to tune this code.
*
* @param buffer Source buffer
* @param start Source start
* @param end Source end
Expand Down
118 changes: 104 additions & 14 deletions lib/utf8/index.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,23 @@
"use strict";

/**
* A minimal UTF8 implementation for number arrays.
* A minimal UTF8 implementation for Uint8Arrays.
*
* This implementation uses a combination of techniques for optimal performance:
* - TextDecoder, when available, for strings longer than 24 bytes
* - 4-byte unrolling for shorter ASCII-only content
* - Manual byte-by-byte decoding for shorter non-ASCII content
* - Inspired by the approach taken in avsc:
* https://github.com/mtth/avsc/blob/91d653f72906102448a059cb81692177bb678f52/lib/utils.js#L796
*
* @memberof util
* @namespace
*/
var utf8 = exports;

// TextDecoder is not available in IE or browsers from before 2017.
var TEXT_DECODER_AVAILABLE = typeof TextDecoder !== "undefined";

// Shared decoder instance, or null when TextDecoder is missing.
//
// `ignoreBOM: true` keeps a leading U+FEFF in the decoded string. By default
// TextDecoder removes it, but here a leading U+FEFF is string data rather than
// an encoding marker, so dropping it would silently change the decoded value.
var textDecoder = TEXT_DECODER_AVAILABLE ? new TextDecoder("utf-8", { ignoreBOM: true }) : null;

/**
* Calculates the UTF8 byte length of a string.
* @param {string} string String
Expand All @@ -30,20 +41,13 @@ utf8.length = function utf8_length(string) {
return len;
};

/**
* Reads UTF8 bytes as a string.
* @param {Uint8Array} buffer Source buffer
* @param {number} start Source start
* @param {number} end Source end
* @returns {string} String read
*/
utf8.read = function utf8_read(buffer, start, end) {
if (end - start < 1) {
return "";
}
// Manually decodes UTF8 bytes to string. This function supports old browsers
// without TextDecoder.
function utf8_decode_fallback(buffer, start, end, initialStr, initialPos) {
var str = initialStr || "";
var i = initialPos || start;

var str = "";
for (var i = start; i < end;) {
for (; i < end;) {
var t = buffer[i++];
if (t <= 0x7F) {
str += String.fromCharCode(t);
Expand All @@ -59,6 +63,92 @@ utf8.read = function utf8_read(buffer, start, end) {
}

return str;
}

// Export fallback function for direct benchmarking.
utf8._utf8_decode_fallback = utf8_decode_fallback;

// Fast path for ASCII-only input.
//
// Decodes buffer[start, end) four bytes at a time for as long as every byte is
// ASCII (high bit clear). On the first non-ASCII byte it hands over to
// utf8_decode_fallback, passing the text decoded so far and the index of the
// group (or byte) where the non-ASCII byte was found, so nothing is decoded
// twice.
//
// `initialStr` and `initialPos` are optional and allow resuming a decode that
// has already produced some text; they default to "" and `start`.
//
// Only `var` declarations are used, like the rest of this file, so the function
// also runs in the old browsers that the manual decoders exist for.
function ascii_decode_unrolled(buffer, start, end, initialStr, initialPos) {
    var str = initialStr || "";
    var i = initialPos || start;
    var a, b, c, d;

    // Process 4 bytes at a time while at least 4 bytes remain.
    for (; i + 3 < end; i += 4) {
        a = buffer[i];
        b = buffer[i + 1];
        c = buffer[i + 2];
        d = buffer[i + 3];

        // OR-ing the four bytes lets a single test detect a set high bit in any of them.
        if ((a | b | c | d) & 0x80) {
            // Non-ASCII byte in this group: the generic decoder resumes at the group start.
            return utf8_decode_fallback(buffer, start, end, str, i);
        }

        str += String.fromCharCode(a, b, c, d);
    }

    // Handle the remaining (at most 3) bytes one by one.
    for (; i < end; i++) {
        a = buffer[i];
        if (a > 0x7F) {
            // Non-ASCII byte: the generic decoder resumes at this byte.
            return utf8_decode_fallback(buffer, start, end, str, i);
        }
        str += String.fromCharCode(a);
    }

    return str;
}

// Export ascii function for direct benchmarking.
utf8._ascii_decode_unrolled = ascii_decode_unrolled;


// Returns the view of `buffer` covering [start, end).
//
// When the requested range is the whole buffer, the buffer itself is returned
// and no new Uint8Array view is allocated. This matters because callers such as
// `src/reader:Reader` often hand over a buffer that is already exactly the
// range to decode, and wrapping it in another view would be wasted work.
function subarray(buffer, start, end) {
    var coversWholeBuffer = start === 0 && end === buffer.length;
    return coversWholeBuffer ? buffer : buffer.subarray(start, end);
}

/**
 * Decodes the UTF8 bytes in `buffer` between `start` (inclusive) and `end`
 * (exclusive) into a string, choosing a strategy by input size:
 *
 * - A range of fewer than one byte yields the empty string
 * - More than 24 bytes, with TextDecoder available: decode with TextDecoder
 * - Anything else: ascii_decode_unrolled, which itself hands over to
 *   utf8_decode_fallback when it meets a non-ASCII byte
 *
 * See the code in `bench/utf8_bench.js` if attempting to tune this code.
 *
 * @param {Uint8Array} buffer Source buffer
 * @param {number} start Source start
 * @param {number} end Source end
 * @returns {string} String read
 */
utf8.read = function utf8_read(buffer, start, end) {
    var byteCount = end - start;

    if (byteCount < 1) {
        return "";
    }

    // TextDecoder is only used above 24 bytes; shorter input takes the manual path.
    return byteCount > 24 && TEXT_DECODER_AVAILABLE
        ? textDecoder.decode(subarray(buffer, start, end))
        : ascii_decode_unrolled(buffer, start, end);
};

/**
Expand Down
17 changes: 15 additions & 2 deletions src/reader.js
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,21 @@ Reader.prototype.bytes = function read_bytes() {
* @returns {string} Value read
*/
Reader.prototype.string = function read_string() {
// NOTE(review): the next two statements look like the pre-change implementation
// that the diff view shows next to its replacement - confirm which one is live.
var bytes = this.bytes();
return utf8.read(bytes, 0, bytes.length);
// Note that we could simply use the `.bytes()` function. However, creating
// slices of a Uint8Array tends to be pretty expensive. If we instead just
// call utf8.read with appropriate start and end indices, we can often
// avoid creating one of these slices and save some time.
//
// The length is read with uint32() first; `start` and `end` are then taken
// from the reader position as it stands after that read.
var length = this.uint32(),
start = this.pos,
end = this.pos + length;

// Refuse to decode a range that ends beyond this.len.
/* istanbul ignore if */
if (end > this.len)
throw indexOutOfRange(this, length);

// Move the reader position past the string bytes before decoding them.
this.pos += length;

// Decode straight from the reader's buffer using the computed range.
return utf8.read(this.buf, start, end);
};

/**
Expand Down