From 5667e5c3c6c7deea07166a2e6cdf3ac86af09e0c Mon Sep 17 00:00:00 2001 From: Denver Coneybeare Date: Mon, 7 Jul 2025 14:22:32 -0400 Subject: [PATCH 1/3] fix: Improve performance of the UTF-8 string comparison logic. The semantics of this logic were originally fixed by #2275, but this fix caused a material performance degradation, which was then improved by #2299. The performance was, however, still suboptimal, and this PR further improves the speed back to close to its original speed and, serendipitously, simplifies the algorithm too. This commit is a port of https://github.com/firebase/firebase-js-sdk/pull/9143 --- dev/src/order.ts | 81 ++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/dev/src/order.ts b/dev/src/order.ts index 04c93bbc1..eba6ddf7f 100644 --- a/dev/src/order.ts +++ b/dev/src/order.ts @@ -254,56 +254,49 @@ function compareVectors(left: ApiMapValue, right: ApiMapValue): number { * @internal */ export function compareUtf8Strings(left: string, right: string): number { - let i = 0; - while (i < left.length && i < right.length) { - const leftCodePoint = left.codePointAt(i)!; - const rightCodePoint = right.codePointAt(i)!; - - if (leftCodePoint !== rightCodePoint) { - if (leftCodePoint < 128 && rightCodePoint < 128) { - // ASCII comparison - return primitiveComparator(leftCodePoint, rightCodePoint); - } else { - // Lazy instantiate TextEncoder - const encoder = new TextEncoder(); - - // UTF-8 encode the character at index i for byte comparison. - const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i)); - const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i)); - const comp = compareBlobs( - Buffer.from(leftBytes), - Buffer.from(rightBytes) - ); - if (comp !== 0) { - return comp; - } else { - // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte - // representations are identical. This can happen with malformed input - // (invalid surrogate pairs).
The backend also actively prevents invalid - // surrogates as INVALID_ARGUMENT errors, so we almost never receive - // invalid strings from backend. - // Fallback to code point comparison for graceful handling. - return primitiveComparator(leftCodePoint, rightCodePoint); - } - } + // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and, + // if found, use that character to determine the relative ordering of the two strings as a + // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by + // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8 + // and UTF-16 happen to represent Unicode code points. + // + // After finding the first pair of differing characters, there are two cases: + // + // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or + // both are surrogates from a surrogate pair (that collectively represent code points greater + // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the + // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is + // sufficient. + // + // Case 2: One character is a surrogate and the other is not. In this case the surrogate- + // containing string is always ordered after the non-surrogate. This is because surrogates are + // used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations + // and are lexicographically greater than the 1, 2, or 3-byte representations of code points + // less than or equal to 0xFFFF. + const length = Math.min(left.length, right.length); + for (let i = 0; i < length; i++) { + const leftChar = left.charAt(i); + const rightChar = right.charAt(i); + if (leftChar !== rightChar) { + return isSurrogate(leftChar) === isSurrogate(rightChar) + ? primitiveComparator(leftChar, rightChar) + : isSurrogate(leftChar) + ? 
1 + : -1; } - // Increment by 2 for surrogate pairs, 1 otherwise - i += leftCodePoint > 0xffff ? 2 : 1; } - // Compare lengths if all characters are equal + // Use the lengths of the strings to determine the overall comparison result since either the + // strings were equal or one is a prefix of the other. return primitiveComparator(left.length, right.length); } -function getUtf8SafeSubstring(str: string, index: number): string { - const firstCodePoint = str.codePointAt(index)!; - if (firstCodePoint > 0xffff) { - // It's a surrogate pair, return the whole pair - return str.substring(index, index + 2); - } else { - // It's a single code point, return it - return str.substring(index, index + 1); - } +const MIN_SURROGATE = 0xd800; +const MAX_SURROGATE = 0xdfff; + +export function isSurrogate(s: string): boolean { + const c = s.charCodeAt(0); + return c >= MIN_SURROGATE && c <= MAX_SURROGATE; } /*! From 772c6257a04ab9ec33e72ff7fcb4827a4082d126 Mon Sep 17 00:00:00 2001 From: Denver Coneybeare Date: Mon, 7 Jul 2025 14:30:40 -0400 Subject: [PATCH 2/3] npm run fix --- dev/src/order.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/src/order.ts b/dev/src/order.ts index eba6ddf7f..7397c8d62 100644 --- a/dev/src/order.ts +++ b/dev/src/order.ts @@ -281,8 +281,8 @@ export function compareUtf8Strings(left: string, right: string): number { return isSurrogate(leftChar) === isSurrogate(rightChar) ? primitiveComparator(leftChar, rightChar) : isSurrogate(leftChar) - ? 1 - : -1; + ? 1 + : -1; } } From 7db26d7a384be47a2bec48f299d5db49e33031e5 Mon Sep 17 00:00:00 2001 From: Denver Coneybeare Date: Mon, 7 Jul 2025 14:40:13 -0400 Subject: [PATCH 3/3] order.ts: remove `export` keyword from `isSurrogate` function since it's not used outside of the file. 
--- dev/src/order.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/src/order.ts b/dev/src/order.ts index 7397c8d62..9588ef675 100644 --- a/dev/src/order.ts +++ b/dev/src/order.ts @@ -294,7 +294,7 @@ export function compareUtf8Strings(left: string, right: string): number { const MIN_SURROGATE = 0xd800; const MAX_SURROGATE = 0xdfff; -export function isSurrogate(s: string): boolean { +function isSurrogate(s: string): boolean { const c = s.charCodeAt(0); return c >= MIN_SURROGATE && c <= MAX_SURROGATE; }