@@ -210,3 +210,170 @@ end
210210whos (IOBuffer (), Tmp14173) # warm up
211211@test @allocated (whos (IOBuffer (), Tmp14173)) < 10000
212212
## test conversion between UTF-8 and UTF-16, in both directions (for Windows APIs)
214+ import Base: utf8to16, utf16to8
215+
# empty arrays: converting an empty input must compare equal to an empty
# array of the target code-unit type, in either direction
@test utf8to16(UInt8[]) == UInt16[]
@test utf16to8(UInt16[]) == UInt8[]
219+
# UTF-8-like sequences
# Each entry is a pair (UTF-8 bytes, UTF-16 code units). The test loops in
# this file assert `utf8to16(bytes) == units` for every entry, alone and
# concatenated with other entries. As the table shows, overlong encodings are
# expected to decode to the code point they spell out, and 3-byte encodings of
# surrogate code points are expected to yield that single surrogate unit.
V8 = [
    # 1-byte (ASCII)
    ([0x00], [0x0000]),
    ([0x0a], [0x000a]),
    ([0x7f], [0x007f]),
    # 2-byte
    ([0xc0,0x80], [0x0000]), # overlong encoding
    ([0xc1,0xbf], [0x007f]), # overlong encoding
    ([0xc2,0x80], [0x0080]),
    ([0xc3,0xbf], [0x00ff]),
    ([0xc4,0x80], [0x0100]),
    ([0xc4,0xa3], [0x0123]),
    ([0xdf,0xbf], [0x07ff]),
    # 3-byte
    ([0xe0,0x80,0x80], [0x0000]), # overlong encoding
    ([0xe0,0x81,0xbf], [0x007f]), # overlong encoding
    ([0xe0,0x82,0x80], [0x0080]), # overlong encoding
    ([0xe0,0x9f,0xbf], [0x07ff]), # overlong encoding
    ([0xe0,0xa0,0x80], [0x0800]),
    ([0xe0,0xa2,0x9a], [0x089a]),
    ([0xe1,0x88,0xb4], [0x1234]),
    ([0xea,0xaf,0x8d], [0xabcd]),
    ([0xed,0x9f,0xbf], [0xd7ff]),
    ([0xed,0xa0,0x80], [0xd800]), # invalid code point – high surrogate
    ([0xed,0xaf,0xbf], [0xdbff]), # invalid code point – high surrogate
    ([0xed,0xb0,0x80], [0xdc00]), # invalid code point – low surrogate
    ([0xed,0xbf,0xbf], [0xdfff]), # invalid code point – low surrogate
    ([0xee,0x80,0x80], [0xe000]),
    ([0xef,0xbf,0xbf], [0xffff]),
    # 4-byte
    ([0xf0,0x80,0x80,0x80], [0x0000]), # overlong encoding
    ([0xf0,0x80,0x81,0xbf], [0x007f]), # overlong encoding
    ([0xf0,0x80,0x82,0x80], [0x0080]), # overlong encoding
    ([0xf0,0x80,0x9f,0xbf], [0x07ff]), # overlong encoding
    ([0xf0,0x80,0xa0,0x80], [0x0800]), # overlong encoding
    ([0xf0,0x8f,0xbf,0xbf], [0xffff]), # overlong encoding
    ([0xf0,0x90,0x80,0x80], [0xd800,0xdc00]), # U+10000
    ([0xf0,0x90,0x8d,0x88], [0xd800,0xdf48]), # U+10348
    ([0xf0,0x90,0x90,0xb7], [0xd801,0xdc37]), # U+10437
    ([0xf0,0xa4,0xad,0xa2], [0xd852,0xdf62]), # U+24b62
    ([0xf2,0xab,0xb3,0x9e], [0xda6f,0xdcde]), # U+abcde
    ([0xf3,0xbf,0xbf,0xbf], [0xdbbf,0xdfff]), # U+fffff
    ([0xf4,0x80,0x80,0x80], [0xdbc0,0xdc00]), # U+100000
    ([0xf4,0x8a,0xaf,0x8d], [0xdbea,0xdfcd]), # U+10abcd
    ([0xf4,0x8f,0xbf,0xbf], [0xdbff,0xdfff]), # U+10ffff
]
267+
# non UTF-8-like sequences
# Seed list of byte sequences that are not UTF-8-like. More entries are
# appended to this vector further down in the file, so it must stay a mutable
# `Vector{Vector{UInt8}}`.
X8 = Vector{UInt8}[
    # invalid 1-byte sequences
    [0x80], # 1 leading ones
    [0xbf],
    [0xc0], # 2 leading ones
    [0xdf],
    [0xe0], # 3 leading ones
    [0xef],
    [0xf0], # 4 leading ones
    [0xf7],
    [0xf8], # 5 leading ones
    [0xfb],
    [0xfc], # 6 leading ones
    [0xfd],
    [0xfe], # 7 leading ones
    [0xff], # 8 leading ones
    # other invalid sequences
    [0xf4,0x90,0xbf,0xbf],
    [0xf4,0x91,0x80,0x80],
    [0xf7,0x80,0x80,0x80],
    [0xf7,0xbf,0xbf,0xbf],
    [0xf8,0x80,0x80,0x80],
    [0xf8,0xbf,0xbf,0xbf],
    [0xff,0x80,0x80,0x80],
    [0xff,0xbf,0xbf,0xbf],
]
295+
# Grow X8 with contiguous slices of the V8 byte sequences and of the X8 seeds.
# For every source sequence `s` this takes each slice s[i:j] of length >= 2
# except the full sequence itself (when i == 1 the upper bound stops one short
# of the end). Slices already present in X8 are not added again. The iterable
# `[map(first,V8); X8]` is a fresh array, so pushing to X8 inside the loop
# does not affect the iteration.
for s in [map(first,V8); X8],
    i = 1:length(s)-1,
    j = i+1:length(s)-(i==1)
    ss = s[i:j]
    ss in X8 || push!(X8, ss)
end
# Order lexicographically, then by length.
# NOTE(review): keeping the lexicographic order within each length presumably
# relies on the second sort being stable — confirm.
sort!(X8, lt=lexless)
sort!(X8, by=length)

# Pair each X8 sequence with the same values widened element-wise to UInt16;
# the test loop further down uses this as the expected `utf8to16` output.
I8 = [(s,map(UInt16,s)) for s in X8]
306+
# Check utf8to16 on single sequences and on concatenations of two and three
# sequences. Each (X,Y,Z) triple names the table the first, second and third
# segment is drawn from; none of the listed triples puts two I8 segments next
# to each other.
# NOTE(review): presumably adjacent I8 fragments are avoided because two
# fragments could join into a different sequence — confirm.
for (X,Y,Z) in ((V8,V8,V8), (I8,V8,I8), (V8,I8,V8), (V8,V8,I8), (I8,V8,V8))
    for (a8, a16) in X
        @test utf8to16(a8) == a16
        for (b8, b16) in Y
            ab8 = [a8; b8]
            ab16 = [a16; b16]
            @test utf8to16(ab8) == ab16
            for (c8, c16) in Z
                abc8 = [ab8; c8]
                abc16 = [ab16; c16]
                @test utf8to16(abc8) == abc16
            end
        end
    end
end
322+
# UTF-16-like sequences
# Each entry is a pair (UTF-16 code units, UTF-8 bytes). The test loop that
# consumes this table checks both directions: `utf16to8(units) == bytes` and
# `utf8to16(bytes) == units`.
V16 = [
    # 1-unit UTF-16, 1-byte UTF-8 (ASCII)
    ([0x0000], [0x00]),
    ([0x000a], [0x0a]),
    ([0x007f], [0x7f]),
    # 1-unit UTF-16, 2-byte UTF-8
    ([0x0080], [0xc2,0x80]),
    ([0x00ff], [0xc3,0xbf]),
    ([0x0100], [0xc4,0x80]),
    ([0x0123], [0xc4,0xa3]),
    ([0x07ff], [0xdf,0xbf]),
    # 1-unit UTF-16, 3-byte UTF-8
    ([0x0800], [0xe0,0xa0,0x80]),
    ([0x089a], [0xe0,0xa2,0x9a]),
    ([0x1234], [0xe1,0x88,0xb4]),
    ([0xabcd], [0xea,0xaf,0x8d]),
    ([0xd7ff], [0xed,0x9f,0xbf]),
    ([0xe000], [0xee,0x80,0x80]),
    ([0xffff], [0xef,0xbf,0xbf]),
    # 2-unit UTF-16, 4-byte UTF-8
    ([0xd800,0xdc00], [0xf0,0x90,0x80,0x80]), # U+10000
    ([0xd800,0xdf48], [0xf0,0x90,0x8d,0x88]), # U+10348
    ([0xd801,0xdc37], [0xf0,0x90,0x90,0xb7]), # U+10437
    ([0xd852,0xdf62], [0xf0,0xa4,0xad,0xa2]), # U+24b62
    ([0xda6f,0xdcde], [0xf2,0xab,0xb3,0x9e]), # U+abcde
    ([0xdbbf,0xdfff], [0xf3,0xbf,0xbf,0xbf]), # U+fffff
    ([0xdbc0,0xdc00], [0xf4,0x80,0x80,0x80]), # U+100000
    ([0xdbea,0xdfcd], [0xf4,0x8a,0xaf,0x8d]), # U+10abcd
    ([0xdbff,0xdfff], [0xf4,0x8f,0xbf,0xbf]), # U+10ffff
]
354+
# Unpaired surrogate code units, each paired with the 3-byte sequence the
# tests expect `utf16to8` to produce for it (and `utf8to16` to map back).
I16 = [
    ([0xd800], [0xed,0xa0,0x80]), # high surrogate
    ([0xdbff], [0xed,0xaf,0xbf]), # high surrogate
    ([0xdc00], [0xed,0xb0,0x80]), # low surrogate
    ([0xdfff], [0xed,0xbf,0xbf]), # low surrogate
]
361+
# Check utf16to8 and utf8to16 as inverses on single sequences and on
# concatenations of two and three sequences. Each (X,Y,Z) triple names the
# table the first, second and third segment is drawn from; none of the listed
# triples puts two I16 (lone surrogate) segments next to each other.
# NOTE(review): presumably adjacent lone surrogates are avoided because a high
# surrogate followed by a low one would form a valid pair — confirm.
for (X,Y,Z) in ((V16,V16,V16), (I16,V16,I16), (V16,I16,V16), (V16,V16,I16), (I16,V16,V16))
    for (a16, a8) in X
        @test utf16to8(a16) == a8
        @test utf8to16(a8) == a16
        for (b16, b8) in Y
            ab16 = [a16; b16]
            ab8 = [a8; b8]
            @test utf16to8(ab16) == ab8
            @test utf8to16(ab8) == ab16
            for (c16, c8) in Z
                abc16 = [ab16; c16]
                abc8 = [ab8; c8]
                @test utf16to8(abc16) == abc8
                @test utf8to16(abc8) == abc16
            end
        end
    end
end
0 commit comments