2 changes: 0 additions & 2 deletions core/common/src/-CommonPlatform.kt
@@ -21,8 +21,6 @@

package kotlinx.io

internal expect fun String.asUtf8ToByteArray(): ByteArray

/**
* Signals about a general issue occurred during I/O operation.
*/
6 changes: 4 additions & 2 deletions core/common/src/Sinks.kt
@@ -5,7 +5,9 @@

package kotlinx.io

internal val HEX_DIGIT_BYTES = "0123456789abcdef".asUtf8ToByteArray()
private val HEX_DIGIT_BYTES = ByteArray(16) {
((if (it < 10) '0'.code else ('a'.code - 10)) + it).toByte()
}

/**
* Writes two bytes containing [short], in the little-endian order, to this sink.
@@ -365,4 +367,4 @@ public fun Sink.writeDoubleLe(double: Double) {
public inline fun Sink.writeToInternalBuffer(lambda: (Buffer) -> Unit) {
lambda(this.buffer)
this.hintEmit()
}
}
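
The Sinks.kt hunk above swaps the string-derived hex table for an arithmetically computed one. A minimal standalone sketch (not part of this PR; main and encodeToByteArray are only here for the check) showing the two forms produce the same 16 bytes:

// Standalone check: the arithmetic initializer yields the same bytes
// as UTF-8-encoding the literal "0123456789abcdef".
private val HEX_DIGIT_BYTES = ByteArray(16) {
    // 0..9 -> '0'..'9', 10..15 -> 'a'..'f'
    ((if (it < 10) '0'.code else ('a'.code - 10)) + it).toByte()
}

fun main() {
    val expected = "0123456789abcdef".encodeToByteArray()
    check(HEX_DIGIT_BYTES.contentEquals(expected)) { "hex tables differ" }
    println(HEX_DIGIT_BYTES.map { it.toInt().toChar() }.joinToString("")) // 0123456789abcdef
}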
91 changes: 0 additions & 91 deletions core/common/src/internal/-Utf8.kt
@@ -38,26 +38,6 @@ internal fun ByteArray.commonToUtf8String(beginIndex: Int = 0, endIndex: Int = s
return chars.concatToString(0, length)
}

internal fun String.commonAsUtf8ToByteArray(): ByteArray {
val bytes = ByteArray(4 * length)

// Assume ASCII until a UTF-8 code point is observed. This is ugly but yields
// about a 2x performance increase for pure ASCII.
for (index in indices) {
val b0 = this[index]
if (b0 >= '\u0080') {
var size = index
processUtf8Bytes(index, length) { c ->
bytes[size++] = c
}
return bytes.copyOf(size)
}
bytes[index] = b0.code.toByte()
}

return bytes.copyOf(length)
}

internal const val REPLACEMENT_BYTE: Byte = '?'.code.toByte()
internal const val REPLACEMENT_CHARACTER: Char = '\ufffd'
internal const val REPLACEMENT_CODE_POINT: Int = REPLACEMENT_CHARACTER.code
@@ -72,77 +52,6 @@ internal inline fun isUtf8Continuation(byte: Byte): Boolean {
return byte and 0xc0 == 0x80
}

internal inline fun String.processUtf8Bytes(
beginIndex: Int,
endIndex: Int,
yield: (Byte) -> Unit
) {
// Transcode a UTF-16 String to UTF-8 bytes.
var index = beginIndex
while (index < endIndex) {
val c = this[index]

when {
c < '\u0080' -> {
// Emit a 7-bit character with 1 byte.
yield(c.code.toByte()) // 0xxxxxxx
index++

// Assume there is going to be more ASCII
while (index < endIndex && this[index] < '\u0080') {
yield(this[index++].code.toByte())
}
}

c < '\u0800' -> {
// Emit a 11-bit character with 2 bytes.
/* ktlint-disable no-multi-spaces */
yield((c.code shr 6 or 0xc0).toByte()) // 110xxxxx
yield((c.code and 0x3f or 0x80).toByte()) // 10xxxxxx
/* ktlint-enable no-multi-spaces */
index++
}

c !in '\ud800'..'\udfff' -> {
// Emit a 16-bit character with 3 bytes.
/* ktlint-disable no-multi-spaces */
yield((c.code shr 12 or 0xe0).toByte()) // 1110xxxx
yield((c.code shr 6 and 0x3f or 0x80).toByte()) // 10xxxxxx
yield((c.code and 0x3f or 0x80).toByte()) // 10xxxxxx
/* ktlint-enable no-multi-spaces */
index++
}

else -> {
// c is a surrogate. Make sure it is a high surrogate & that its successor is a low
// surrogate. If not, the UTF-16 is invalid, in which case we emit a replacement
// byte.
if (c > '\udbff' ||
endIndex <= index + 1 ||
this[index + 1] !in '\udc00'..'\udfff'
) {
yield(REPLACEMENT_BYTE)
index++
} else {
// UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
// UTF-16 low surrogate: 110111yyyyyyyyyy (10 bits)
// Unicode code point: 00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
val codePoint = (((c.code shl 10) + this[index + 1].code) + (0x010000 - (0xd800 shl 10) - 0xdc00))

// Emit a 21-bit character with 4 bytes.
/* ktlint-disable no-multi-spaces */
yield((codePoint shr 18 or 0xf0).toByte()) // 11110xxx
yield((codePoint shr 12 and 0x3f or 0x80).toByte()) // 10xxxxxx
yield((codePoint shr 6 and 0x3f or 0x80).toByte()) // 10xxyyyy
yield((codePoint and 0x3f or 0x80).toByte()) // 10yyyyyy
/* ktlint-enable no-multi-spaces */
index += 2
}
}
}
}
}

internal inline fun ByteArray.processUtf8CodePoints(
beginIndex: Int,
endIndex: Int,
2 changes: 1 addition & 1 deletion core/common/test/AbstractSourceTest.kt
@@ -615,7 +615,7 @@ abstract class AbstractBufferedSourceTest internal constructor(
val string = "abcd" + "e".repeat(Segment.SIZE)
sink.writeString(string)
sink.emit()
assertArrayEquals(string.asUtf8ToByteArray(), source.readByteArray())
assertArrayEquals(string.commonAsUtf8ToByteArray(), source.readByteArray())
}

@Test
180 changes: 149 additions & 31 deletions core/common/test/Utf8Test.kt
@@ -21,8 +21,8 @@

package kotlinx.io

import kotlinx.io.internal.REPLACEMENT_CHARACTER
import kotlinx.io.internal.REPLACEMENT_CODE_POINT
import kotlinx.io.internal.commonAsUtf8ToByteArray
Member:
Also, I wonder if we should delegate to the JVM string constructor immediately.

Because right now, we process bytes one by one (expecting non-ASCII) and then invoke concatToString(), which does the same work again, and may even repackage these bytes into ASCII-compressed strings. Bonus points for the JVM -- everything is intensified, i.e. StringCoding.hasNegatives and StringUTF16.putChar, which is really nice.
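
(Illustrative only: roughly what delegating to the platform constructor could look like for the JVM decoding path; jvmToUtf8String is a hypothetical name, not code from this PR.)

// Hypothetical JVM-side helper: let java.lang.String perform the UTF-8
// decoding (including replacement of malformed input) instead of decoding manually.
internal fun ByteArray.jvmToUtf8String(beginIndex: Int = 0, endIndex: Int = size): String =
    String(this, beginIndex, endIndex - beginIndex, Charsets.UTF_8)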

Collaborator Author:
Calling String's constructor seems to be slower (by about 10-15% for UTF-8 strings) compared to what we do now.

Member:
This is really unexpected though 🥲

import kotlinx.io.internal.processUtf8CodePoints
import kotlin.test.*
Member (@qwwdfsad, Apr 16, 2024):
While reviewing and testing these, I found a subtle difference: in the Java decoding process, when the decoder stumbles across an invalid multi-byte sequence, it replaces every byte of the sequence with \ufffd, while our decoder replaces the whole group with a single code point.

E.g. consider the 4-byte sequences:

  • 0xf0, 0x89, 0x89
  • 0xf0, 0x89, 0x89, 0x89

With our decoding, their replacement is a single code point; for Java, it is 3 and 4 replacement code points respectively.

This behaviour leaks surprisingly for the following sequence: 0xf0 0xf0 0xf0 -- then three characters are produced. The same applies to 3-byte (0xE0) and probably 2-byte (haven't checked) sequences.
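
(For illustration, a small JVM check of the per-byte replacement described above -- consistent with the jshell example further down in this thread; the snippet itself is not from the PR:)

fun main() {
    // The JVM's UTF-8 decoder replaces each byte of these invalid sequences with U+FFFD.
    val threeBytes = byteArrayOf(0xf0.toByte(), 0x89.toByte(), 0x89.toByte())
    val fourBytes = byteArrayOf(0xf0.toByte(), 0x89.toByte(), 0x89.toByte(), 0x89.toByte())
    println(String(threeBytes, Charsets.UTF_8).count { it == '\uFFFD' }) // 3
    println(String(fourBytes, Charsets.UTF_8).count { it == '\uFFFD' })  // 4
}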

Member (@qwwdfsad, Apr 16, 2024):
My take on that: the replacement behaviour is undocumented (the readString family), and thus it's hard both to justify and to figure out this behaviour, as well as to take it into account when implementing.

Collaborator Author:
> replacement behaviour is undocumented

Although a UTF-8 conversion process is required to never consume well-formed subsequences as part of its error handling for ill-formed subsequences, such a process is not otherwise constrained in how it deals with any ill-formed subsequence itself. An ill-formed subsequence consisting of more than one code unit could be treated as a single error or as multiple errors.

😢

Member (@qwwdfsad, Apr 22, 2024):
I mean undocumented in kotlinx-io.
I think we can pick a single strategy, document it and stick with it.
Right now, depending on the "ill-formedness", we can replace N bytes with [1..4] replacement characters, and it's a bit surprising. Always having a single replacement, or as many characters as there are in the ill-formed sequence, seems a bit more reasonable.

Collaborator Author:
It's not documented in the standard either. :)

And, IIRC, we have the same discrepancy in the stdlib, where the native and JVM implementations of byte-array-to-string conversion differ.

Collaborator Author:
Speaking of the examples you initially gave, it all seems more or less consistent:

  • 0xf0 0x89 0x89 <EOF> - the sequence has a valid prefix but terminates abruptly -> we replaced the whole sequence with \ufffd;
  • 0xf0 0x89 0x89 0x89 <EOF> - the sequence is complete, but the encoded value lies outside the valid range -> we replaced the whole sequence with \ufffd;
  • 0xf0 0xf0 0xf0 <EOF> - we don't consider 0xf0 0xf0 a sequence, as the second 0xf0 is not a continuation byte; thus the first invalid sequence consists of a single byte, which we replaced, and the same happened for the other two single-byte sequences.

However, looking at UTF-8's definitions of well- and ill-formed code unit sequences (D84-D86) gives the impression that 0xf0 0xf0 0xf0 should be treated as a single ill-formed sequence, as none of its bytes overlaps with a minimal well-formed subsequence. 🤔

Collaborator Author:
But the current behavior is a conforming one:

An ill-formed subsequence consisting of more than one code unit could be treated as a single error or as multiple errors.

Collaborator Author:
Ok, I'll cut this stream of consciousness off: we should probably follow what others (Java, Python) do and consider only ill-formed subsequences consisting of a single code unit:

>>> b'\xf0\x89\x89\x89'.decode("utf-8", errors='replace')
'����'
jshell> new String(new byte[]{(byte)0xf0,(byte)0x89,(byte)0x89,(byte)0x89})
$5 ==> "����"

https://go.dev/play/p/Yc9wxok2aV2

fmt.Println(string([]byte{0xf0, 0x89, 0x89, 0x89}))
...

����

Collaborator Author:
@qwwdfsad, I filed #301 to track it


@@ -144,52 +144,109 @@ class Utf8Test {

@Test
fun bufferWriteCodePoints() {
bufferWriteCodePointsCheck(0)
}

@Test
fun bufferWriteCodePointsCrossSegments() {
bufferWriteCodePointsCheck(Segment.SIZE - 1)
}

private fun bufferWriteCodePointsCheck(prefixLength: Int) {
val buffer = Buffer()
buffer.assertCodePointEncoded("40", '@'.code)
buffer.assertCodePointEncoded("7f", '\u007f'.code)
buffer.assertCodePointEncoded("c280", '\u0080'.code)
buffer.assertCodePointEncoded("c2a9", '\u00a9'.code)
buffer.assertCodePointEncoded("c3bf", '\u00ff'.code)
buffer.assertCodePointEncoded("dfbf", '\u07ff'.code)
buffer.assertCodePointEncoded("e0a080", '\u0800'.code)
buffer.assertCodePointEncoded("e1839a", '\u10da'.code)
buffer.assertCodePointEncoded("efbfbf", '\uffff'.code)
buffer.assertCodePointEncoded("f0908080", 0x10000)
buffer.assertCodePointEncoded("f48087bf", 0x1001FF)
buffer.assertCodePointEncoded("40", '@'.code, prefixLength)
buffer.assertCodePointEncoded("7f", '\u007f'.code, prefixLength)
buffer.assertCodePointEncoded("c280", '\u0080'.code, prefixLength)
buffer.assertCodePointEncoded("c2a9", '\u00a9'.code, prefixLength)
buffer.assertCodePointEncoded("c3bf", '\u00ff'.code, prefixLength)
buffer.assertCodePointEncoded("dfbf", '\u07ff'.code, prefixLength)
buffer.assertCodePointEncoded("e0a080", '\u0800'.code, prefixLength)
buffer.assertCodePointEncoded("e1839a", '\u10da'.code, prefixLength)
buffer.assertCodePointEncoded("efbfbf", '\uffff'.code, prefixLength)
buffer.assertCodePointEncoded("f0908080", 0x10000, prefixLength)
buffer.assertCodePointEncoded("f48087bf", 0x1001FF, prefixLength)
}

@Test
fun bufferReadCodePoints() {
bufferReadCodePointsCheck(0)
}

@Test
fun bufferReadCodePointsCrossSegments() {
bufferReadCodePointsCheck(Segment.SIZE - 1)
}

private fun bufferReadCodePointsCheck(prefixLength: Int) {
val buffer = Buffer()
buffer.assertCodePointDecoded('@'.code, "40")
buffer.assertCodePointDecoded('\u007f'.code, "7f")
buffer.assertCodePointDecoded('\u0080'.code, "c280")
buffer.assertCodePointDecoded('\u00a9'.code, "c2a9")
buffer.assertCodePointDecoded('\u00ff'.code, "c3bf")
buffer.assertCodePointDecoded('\u07ff'.code, "dfbf")
buffer.assertCodePointDecoded('\u0800'.code, "e0a080")
buffer.assertCodePointDecoded('\u10da'.code, "e1839a")
buffer.assertCodePointDecoded('\uffff'.code, "efbfbf")
buffer.assertCodePointDecoded(0x10000, "f0908080")
buffer.assertCodePointDecoded(0x1001FF, "f48087bf")
buffer.assertCodePointDecoded('@'.code, "40", prefixLength)
buffer.assertCodePointDecoded('\u007f'.code, "7f", prefixLength)
buffer.assertCodePointDecoded('\u0080'.code, "c280", prefixLength)
buffer.assertCodePointDecoded('\u00a9'.code, "c2a9", prefixLength)
buffer.assertCodePointDecoded('\u00ff'.code, "c3bf", prefixLength)
buffer.assertCodePointDecoded('\u07ff'.code, "dfbf", prefixLength)
buffer.assertCodePointDecoded('\u0800'.code, "e0a080", prefixLength)
buffer.assertCodePointDecoded('\u10da'.code, "e1839a", prefixLength)
buffer.assertCodePointDecoded('\uffff'.code, "efbfbf", prefixLength)
buffer.assertCodePointDecoded(0x10000, "f0908080", prefixLength)
buffer.assertCodePointDecoded(0x1001FF, "f48087bf", prefixLength)
}

@Test
fun bufferWriteUtf8String() {
bufferWriteUtf8StringCheck(0)
}

@Test
fun bufferWriteUtf8StringCrossSegments() {
bufferWriteUtf8StringCheck(Segment.SIZE - 1)
}

private fun bufferWriteUtf8StringCheck(prefixLength: Int) {
val buffer = Buffer()
buffer.assertUtf8StringEncoded("68656c6c6f", "hello")
buffer.assertUtf8StringEncoded("cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82", "χερετισμός")
buffer.assertUtf8StringEncoded("68656c6c6f", "hello", prefixLength)
buffer.assertUtf8StringEncoded("cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82", "χερετισμός",
prefixLength)
buffer.assertUtf8StringEncoded(
"e18392e18390e1839be18390e183a0e183afe1839de18391e18390",
"გამარჯობა"
"გამარჯობა",
prefixLength
)
buffer.assertUtf8StringEncoded(
"f093878bf0938bb4f09380a5",
"\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25" /* 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b */
"\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25",/* 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b */
prefixLength
)

// two consecutive high surrogates, replace with '?'
buffer.assertUtf8StringEncoded("3f3f", "\ud801\uD801")
buffer.assertUtf8StringEncoded("3f3f", "\ud801\uD801", prefixLength)
}

@Test
fun bufferReadUtf8String() {
bufferReadUtf8StringCheck(0)
}

@Test
fun bufferReadUtf8StringCrossSegments() {
bufferReadUtf8StringCheck(Segment.SIZE - 1)
}

private fun bufferReadUtf8StringCheck(prefixLength: Int) {
val buffer = Buffer()
buffer.assertUtf8StringDecoded("hello","68656c6c6f", prefixLength)
buffer.assertUtf8StringDecoded("χερετισμός", "cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82",
prefixLength)
buffer.assertUtf8StringDecoded(
"გამარჯობა",
"e18392e18390e1839be18390e183a0e183afe1839de18391e18390",
prefixLength
)
buffer.assertUtf8StringDecoded(
"\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25",/* 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b */
"f093878bf0938bb4f09380a5",
prefixLength
)
}

@Test
@@ -258,6 +315,16 @@ class Utf8Test {
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertTrue(buffer.exhausted())

buffer.write(ByteArray(Segment.SIZE - 2))
buffer.write("f888808080".decodeHex())
buffer.skip(Segment.SIZE - 2L)
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertTrue(buffer.exhausted())
}

@Test
@@ -307,6 +374,44 @@ class Utf8Test {
}
}

@Test
fun readStringWithUnderflow() {
val buffer = Buffer()
// 3 byte-encoded, last byte missing
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "e183")
// 3 byte-encoded, last two bytes missing
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "e1")
// 2 byte-encoded, last byte missing
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "cf")
// 4 byte encoded, various underflows
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "f09383")
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "f093")
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "f0")
}

@Test
fun readStringWithoutContinuationByte() {
val buffer = Buffer()
// 2 byte-encoded, last byte corrupted
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}a", "cf61")
// 3 byte-encoded, last byte corrupted
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}a", "e18361")
// 3 byte-encoded, last two bytes corrupted
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}aa", "e16161")
// 4 byte-encoded, various bytes corrupted
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}a", "f0938361")
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}aa", "f0936161")
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}aaa", "f0616161")
}

@OptIn(ExperimentalStdlibApi::class)
@Test
fun encodeUtf16SurrogatePair() {
val buffer = Buffer()
buffer.writeString("\uD852\uDF62")
println(buffer.readByteArray().toHexString())
}

private fun assertEncoded(hex: String, vararg codePoints: Int) {
assertCodePointDecoded(hex, *codePoints)
}
@@ -321,21 +426,34 @@
assertEquals(i, codePoints.size) // Checked them all
}

private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int) {
private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int, prefixLength: Int = 0) {
write(ByteArray(prefixLength))
writeUtf8CodePoint(codePoint)
skip(prefixLength.toLong())
assertArrayEquals(expectedHex.decodeHex(), readByteArray())
}

private fun Buffer.assertCodePointDecoded(expectedCodePoint: Int, hex: String) {
private fun Buffer.assertCodePointDecoded(expectedCodePoint: Int, hex: String, prefixLength: Int = 0) {
write(ByteArray(prefixLength))
write(hex.decodeHex())
skip(prefixLength.toLong())
assertEquals(expectedCodePoint, readUtf8CodePoint())
}

private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String) {
private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String, prefixLength: Int = 0) {
write(ByteArray(prefixLength))
writeString(string)
skip(prefixLength.toLong())
assertArrayEquals(expectedHex.decodeHex(), readByteArray())
}

private fun Buffer.assertUtf8StringDecoded(expectedString: String, hex: String, prefixLength: Int = 0) {
write(ByteArray(prefixLength))
write(hex.decodeHex())
skip(prefixLength.toLong())
assertEquals(expectedString, readString())
}

private fun assertStringEncoded(hex: String, string: String) {
val expectedUtf8 = hex.decodeHex()
