2121
2222package kotlinx.io
2323
24+ import kotlinx.io.internal.REPLACEMENT_CHARACTER
2425import kotlinx.io.internal.REPLACEMENT_CODE_POINT
25- import kotlinx.io.internal.commonAsUtf8ToByteArray
2626import kotlinx.io.internal.processUtf8CodePoints
2727import kotlin.test.*
2828
@@ -144,52 +144,109 @@ class Utf8Test {
144144
/** Checks code point encoding with no prefix bytes written ahead of the data. */
@Test
fun bufferWriteCodePoints() = bufferWriteCodePointsCheck(0)
149+
/**
 * Checks code point encoding after a prefix of `Segment.SIZE - 1` bytes; as the test name says,
 * this is meant to make the encoded bytes cross a segment boundary.
 */
@Test
fun bufferWriteCodePointsCrossSegments() = bufferWriteCodePointsCheck(Segment.SIZE - 1)
154+
/**
 * Encodes a fixed table of code points one by one, each preceded by [prefixLength] filler bytes,
 * and verifies the bytes produced for every entry.
 */
private fun bufferWriteCodePointsCheck(prefixLength: Int) {
    val buffer = Buffer()
    // Expected UTF-8 bytes (hex) paired with the code point to encode, in increasing order.
    val cases = listOf(
        "40" to '@'.code,
        "7f" to '\u007f'.code,
        "c280" to '\u0080'.code,
        "c2a9" to '\u00a9'.code,
        "c3bf" to '\u00ff'.code,
        "dfbf" to '\u07ff'.code,
        "e0a080" to '\u0800'.code,
        "e1839a" to '\u10da'.code,
        "efbfbf" to '\uffff'.code,
        "f0908080" to 0x10000,
        "f48087bf" to 0x1001FF,
    )
    for ((expectedHex, codePoint) in cases) {
        buffer.assertCodePointEncoded(expectedHex, codePoint, prefixLength)
    }
}
160169
/** Checks code point decoding with no prefix bytes written ahead of the data. */
@Test
fun bufferReadCodePoints() = bufferReadCodePointsCheck(0)
174+
/**
 * Checks code point decoding after a prefix of `Segment.SIZE - 1` bytes; as the test name says,
 * this is meant to make the encoded bytes cross a segment boundary.
 */
@Test
fun bufferReadCodePointsCrossSegments() = bufferReadCodePointsCheck(Segment.SIZE - 1)
179+
/**
 * Decodes a fixed table of UTF-8 byte sequences one by one, each preceded by [prefixLength]
 * filler bytes, and verifies the code point read for every entry.
 */
private fun bufferReadCodePointsCheck(prefixLength: Int) {
    val buffer = Buffer()
    // Expected code point paired with the UTF-8 bytes (hex) it is decoded from.
    val cases = listOf(
        '@'.code to "40",
        '\u007f'.code to "7f",
        '\u0080'.code to "c280",
        '\u00a9'.code to "c2a9",
        '\u00ff'.code to "c3bf",
        '\u07ff'.code to "dfbf",
        '\u0800'.code to "e0a080",
        '\u10da'.code to "e1839a",
        '\uffff'.code to "efbfbf",
        0x10000 to "f0908080",
        0x1001FF to "f48087bf",
    )
    for ((expectedCodePoint, hex) in cases) {
        buffer.assertCodePointDecoded(expectedCodePoint, hex, prefixLength)
    }
}
176194
/** Checks string encoding with no prefix bytes written ahead of the data. */
@Test
fun bufferWriteUtf8String() = bufferWriteUtf8StringCheck(0)
199+
/**
 * Checks string encoding after a prefix of `Segment.SIZE - 1` bytes; as the test name says,
 * this is meant to make the encoded bytes cross a segment boundary.
 */
@Test
fun bufferWriteUtf8StringCrossSegments() = bufferWriteUtf8StringCheck(Segment.SIZE - 1)
204+
/**
 * Encodes a fixed table of strings one by one, each preceded by [prefixLength] filler bytes,
 * and verifies the UTF-8 bytes produced for every entry.
 */
private fun bufferWriteUtf8StringCheck(prefixLength: Int) {
    val buffer = Buffer()
    // Expected UTF-8 bytes (hex) paired with the string to encode.
    val cases = listOf(
        "68656c6c6f" to "hello",
        "cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82" to "χερετισμός",
        "e18392e18390e1839be18390e183a0e183afe1839de18391e18390" to "გამარჯობა",
        // 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b
        "f093878bf0938bb4f09380a5" to "\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25",
        // Two consecutive high surrogates: each one is replaced with '?'.
        "3f3f" to "\ud801\uD801",
    )
    for ((expectedHex, string) in cases) {
        buffer.assertUtf8StringEncoded(expectedHex, string, prefixLength)
    }
}
224+
/** Checks string decoding with no prefix bytes written ahead of the data. */
@Test
fun bufferReadUtf8String() = bufferReadUtf8StringCheck(0)
229+
/**
 * Checks string decoding after a prefix of `Segment.SIZE - 1` bytes; as the test name says,
 * this is meant to make the encoded bytes cross a segment boundary.
 */
@Test
fun bufferReadUtf8StringCrossSegments() = bufferReadUtf8StringCheck(Segment.SIZE - 1)
234+
/**
 * Decodes a fixed table of UTF-8 byte sequences one by one, each preceded by [prefixLength]
 * filler bytes, and verifies the string read for every entry.
 */
private fun bufferReadUtf8StringCheck(prefixLength: Int) {
    val buffer = Buffer()
    // Expected string paired with the UTF-8 bytes (hex) it is decoded from.
    val cases = listOf(
        "hello" to "68656c6c6f",
        "χερετισμός" to "cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82",
        "გამარჯობა" to "e18392e18390e1839be18390e183a0e183afe1839de18391e18390",
        // 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b
        "\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25" to "f093878bf0938bb4f09380a5",
    )
    for ((expectedString, hex) in cases) {
        buffer.assertUtf8StringDecoded(expectedString, hex, prefixLength)
    }
}
194251
195252 @Test
@@ -258,6 +315,16 @@ class Utf8Test {
258315 assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
259316 assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
260317 assertTrue(buffer.exhausted())
318+
319+ buffer.write(ByteArray (Segment .SIZE - 2 ))
320+ buffer.write(" f888808080" .decodeHex())
321+ buffer.skip(Segment .SIZE - 2L )
322+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
323+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
324+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
325+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
326+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
327+ assertTrue(buffer.exhausted())
261328 }
262329
263330 @Test
@@ -307,6 +374,44 @@ class Utf8Test {
307374 }
308375 }
309376
/**
 * A multi-byte sequence that is cut off before its final byte must decode to a single
 * replacement character.
 */
@Test
fun readStringWithUnderflow() {
    val buffer = Buffer()
    val expected = REPLACEMENT_CHARACTER.toString()
    val truncatedInputs = listOf(
        "e183", // 3-byte sequence, last byte missing
        "e1", // 3-byte sequence, last two bytes missing
        "cf", // 2-byte sequence, last byte missing
        // 4-byte sequence cut off after three, two and one byte(s)
        "f09383",
        "f093",
        "f0",
    )
    truncatedInputs.forEach { hex -> buffer.assertUtf8StringDecoded(expected, hex) }
}
391+
/**
 * A multi-byte sequence whose continuation bytes are replaced by ASCII 'a' must decode to a
 * replacement character followed by the 'a' characters themselves.
 */
@Test
fun readStringWithoutContinuationByte() {
    val buffer = Buffer()
    // Expected string paired with the corrupted UTF-8 bytes (hex).
    val cases = listOf(
        // 2-byte sequence, last byte corrupted
        "${REPLACEMENT_CHARACTER}a" to "cf61",
        // 3-byte sequence, last byte corrupted
        "${REPLACEMENT_CHARACTER}a" to "e18361",
        // 3-byte sequence, last two bytes corrupted
        "${REPLACEMENT_CHARACTER}aa" to "e16161",
        // 4-byte sequence, last one, two and three bytes corrupted
        "${REPLACEMENT_CHARACTER}a" to "f0938361",
        "${REPLACEMENT_CHARACTER}aa" to "f0936161",
        "${REPLACEMENT_CHARACTER}aaa" to "f0616161",
    )
    for ((expectedString, hex) in cases) {
        buffer.assertUtf8StringDecoded(expectedString, hex)
    }
}
406+
/**
 * Verifies that a valid UTF-16 surrogate pair is encoded as one 4-byte UTF-8 sequence.
 *
 * "\uD852\uDF62" is the surrogate pair for U+24B62, whose UTF-8 encoding is f0 a4 ad a2.
 * The test previously only printed the encoded bytes, so it could never fail.
 */
@Test
fun encodeUtf16SurrogatePair() {
    val buffer = Buffer()
    buffer.assertUtf8StringEncoded("f0a4ada2", "\uD852\uDF62")
    // The helper reads everything that was written, so nothing may be left behind.
    assertTrue(buffer.exhausted())
}
414+
/** Forwards to [assertCodePointDecoded] with the same [hex] input and expected [codePoints]. */
private fun assertEncoded(hex: String, vararg codePoints: Int) =
    assertCodePointDecoded(hex, *codePoints)
@@ -321,21 +426,34 @@ class Utf8Test {
321426 assertEquals(i, codePoints.size) // Checked them all
322427 }
323428
/**
 * Writes [prefixLength] zero bytes followed by the UTF-8 encoding of [codePoint], skips the
 * prefix again, and asserts that the remaining bytes equal [expectedHex].
 */
private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int, prefixLength: Int = 0) {
    val prefix = ByteArray(prefixLength)
    write(prefix)
    writeUtf8CodePoint(codePoint)
    skip(prefix.size.toLong())

    val expectedBytes = expectedHex.decodeHex()
    val actualBytes = readByteArray()
    assertArrayEquals(expectedBytes, actualBytes)
}
328435
/**
 * Writes [prefixLength] zero bytes followed by the bytes given as [hex], skips the prefix
 * again, and asserts that the next code point read equals [expectedCodePoint].
 */
private fun Buffer.assertCodePointDecoded(expectedCodePoint: Int, hex: String, prefixLength: Int = 0) {
    val prefix = ByteArray(prefixLength)
    val encoded = hex.decodeHex()
    write(prefix)
    write(encoded)
    skip(prefix.size.toLong())

    val actualCodePoint = readUtf8CodePoint()
    assertEquals(expectedCodePoint, actualCodePoint)
}
333442
/**
 * Writes [prefixLength] zero bytes followed by [string], skips the prefix again, and asserts
 * that the remaining bytes equal [expectedHex].
 */
private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String, prefixLength: Int = 0) {
    val prefix = ByteArray(prefixLength)
    write(prefix)
    writeString(string)
    skip(prefix.size.toLong())

    val expectedBytes = expectedHex.decodeHex()
    val actualBytes = readByteArray()
    assertArrayEquals(expectedBytes, actualBytes)
}
338449
/**
 * Writes [prefixLength] zero bytes followed by the bytes given as [hex], skips the prefix
 * again, and asserts that reading the rest as a string yields [expectedString].
 */
private fun Buffer.assertUtf8StringDecoded(expectedString: String, hex: String, prefixLength: Int = 0) {
    val prefix = ByteArray(prefixLength)
    val encoded = hex.decodeHex()
    write(prefix)
    write(encoded)
    skip(prefix.size.toLong())

    val actualString = readString()
    assertEquals(expectedString, actualString)
}
456+
339457 private fun assertStringEncoded (hex : String , string : String ) {
340458 val expectedUtf8 = hex.decodeHex()
341459
0 commit comments