Skip to content

Commit 9395632

Browse files
utf8to16 and utf16to8 helpers for Windows APIs
1 parent 666e325 commit 9395632

File tree

2 files changed

+255
-5
lines changed

2 files changed

+255
-5
lines changed

base/c.jl

Lines changed: 88 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,96 @@ end
9393
# Convert a Symbol to a Cstring by wrapping the Ptr{Cchar} obtained from
# `unsafe_convert` directly in a `Cstring`.
# symbols are guaranteed not to contain embedded NUL
9494
convert(::Type{Cstring}, s::Symbol) = Cstring(unsafe_convert(Ptr{Cchar}, s))
9595

96-
# conversion between UTF-8-like data and UTF-16-like data for Windows APIs
97-
function utf8_to_utf16!(src::Vector{UInt8}, dst::Vector{UInt16})
98-
96+
# conversions between UTF-8 and UTF-16 for Windows APIs
97+
98+
# Convert the UTF-8 bytes in `src` to a newly allocated vector of UTF-16 code units.
#
# Decoding rules, as implemented below:
#   * 1-, 2- and 3-byte sequences become a single UInt16;
#   * a 4-byte sequence starting with 0xf0 whose second byte is below 0x90
#     (an overlong encoding) also becomes a single UInt16;
#   * any other 4-byte sequence becomes a surrogate pair (two UInt16).
# There is no check for overlong 2-/3-byte encodings or for encoded surrogate
# code points, so those are decoded like any other sequence.
# Bytes that cannot be decoded are copied through unchanged, one UInt16 per byte:
# stray continuation bytes, lead bytes followed by a non-continuation byte,
# sequences cut short by the end of `src`, bytes above 0xf4, and 0xf4 followed
# by a byte above 0x8f.
# Returns an empty `UInt16[]` when `src` is empty.
#
# Notation used in the body: `$` is the bitwise-xor operator of the Julia version
# this code targets, and `x % Int8` reinterprets a byte as a signed value
# (so 0x80..0xbf, the continuation bytes, are exactly the values below -64).
function utf8to16(src::Vector{UInt8})
99+
dst = UInt16[]
100+
i, n = 1, length(src)
101+
n > 0 || return dst
102+
# capacity hint only; the output never has more units than src has bytes
sizehint!(dst, 2n)
103+
# invariant at the top of every iteration: `a` is the byte under examination
# and `i` is its index in src
a = src[1]
104+
while true
105+
# -64..-12 as Int8 is 0xc0..0xf4, i.e. a lead byte; `i < n` requires that at
# least one more byte is available
if i < n && -64 <= a % Int8 <= -12 # multi-byte character
106+
b = src[i += 1]
107+
if -64 <= (b % Int8) || a == 0xf4 && 0x8f < b
108+
# invalid UTF-8 (non-continuation or too-high code point)
109+
push!(dst, a)
110+
# re-examine b as the start of the next character (i already points at it)
a = b; continue
111+
elseif a < 0xe0 # 2-byte UTF-8
112+
# 0x3080 cancels the 0xc0 lead tag (0x3000 after the shift) and b's 0x80 tag
push!(dst, 0x3080 $ (UInt16(a) << 6) $ b)
113+
elseif i < n # 3/4-byte character
114+
c = src[i += 1]
115+
if -64 <= (c % Int8) # invalid UTF-8 (non-continuation)
116+
push!(dst, a, b)
117+
a = c; continue
118+
elseif a < 0xf0 # 3-byte UTF-8
119+
# the 0xe lead nibble is shifted out of the 16-bit value; 0x2080 cancels
# the 0x80 tags of b (0x2000 after the shift) and c
push!(dst, 0x2080 $ (UInt16(a) << 12) $ (UInt16(b) << 6) $ c)
120+
elseif i < n
121+
d = src[i += 1]
122+
if -64 <= (d % Int8) # invalid UTF-8 (non-continuation)
123+
push!(dst, a, b, c)
124+
a = d; continue
125+
elseif a == 0xf0 && b < 0x90 # overlong encoding
126+
# the code point fits in 16 bits: same bit layout as the 3-byte case,
# taken from b, c, d instead of a, b, c
push!(dst, 0x2080 $ (UInt16(b) << 12) $ (UInt16(c) << 6) $ d)
127+
else # 4-byte UTF-8
128+
# first unit: high surrogate, computed with wrapping UInt16 addition;
# second unit: 0xdc00 combined with the low 10 bits of the code point
# (low 4 bits of c and low 6 bits of d; 0xdc80 cancels d's 0x80 tag)
push!(dst, 0xe5b8 + (UInt16(a) << 8) + (UInt16(b) << 2) + (c >> 4),
129+
0xdc80 $ (UInt16(c & 0xf) << 6) $ d)
130+
end
131+
else # too short
132+
# src ended after three bytes of a 4-byte sequence: pass them through
push!(dst, a, b, c)
133+
break
134+
end
135+
else # too short
136+
# src ended after two bytes of a 3-/4-byte sequence: pass them through
push!(dst, a, b)
137+
break
138+
end
139+
# also reached for a lead byte that is the last byte of src (i == n)
else # ASCII or invalid UTF-8 (continuation byte or too-high code point)
140+
push!(dst, a)
141+
end
142+
# advance to the next unread byte, or stop when src is exhausted
i < n || break
143+
a = src[i += 1]
144+
end
145+
return dst
99146
end
100147

101-
function utf16_to_utf8!(src::Vector{UInt16}, dst::Vector{UInt8})
102-
148+
# Convert the UTF-16 code units in `src` to a newly allocated vector of UTF-8 bytes.
#
# Encoding rules, as implemented below:
#   * a unit below 0x80 produces 1 byte;
#   * a unit below 0x800 produces 2 bytes;
#   * a high surrogate (0xd800-0xdbff) immediately followed by a low surrogate
#     (0xdc00-0xdfff) produces one 4-byte sequence;
#   * every other unit, including a surrogate that is not part of such a pair,
#     produces a 3-byte sequence.
# Returns an empty `UInt8[]` when `src` is empty.
#
# Notation used in the body: `$` is the bitwise-xor operator of the Julia version
# this code targets, and `x % UInt8` keeps only the low 8 bits of x.
function utf16to8(src::Vector{UInt16})
149+
dst = UInt8[]
150+
i, n = 1, length(src)
151+
n > 0 || return dst
152+
# capacity hint only; n is a lower bound on the number of bytes produced
sizehint!(dst, n)
153+
# invariant at the top of every iteration: `a` is the unit under examination
# and `i` is its index in src
a = src[1]
154+
while true
155+
if a < 0x80 # ASCII
156+
push!(dst, a % UInt8)
157+
elseif a < 0x800 # 2-byte UTF-8
158+
push!(dst, 0xc0 | ((a >> 6) % UInt8),
159+
0x80 | ((a % UInt8) & 0x3f))
160+
# high surrogate with at least one more unit available
elseif a & 0xfc00 == 0xd800 && i < n
161+
b = src[i += 1]
162+
if (b & 0xfc00) == 0xdc00
163+
# 2-unit UTF-16 sequence => 4-byte UTF-8
164+
# wraps around in UInt16: removes the 0xd800 tag and adds 0x40, so `a`
# now holds the code point shifted right by 10 bits
a += 0x2840
165+
# third byte: the low 2 bits of `a` land in bits 4-5 and bits 6-9 of b in
# bits 0-3; the outer xor with 0xf0 cancels the 0x70 left over from b's
# 0xdc00 tag and sets the 0x80 continuation marker
push!(dst, 0xf0 | ((a >> 8) % UInt8),
166+
0x80 | ((a % UInt8) >> 2),
167+
0xf0 $ ((((a % UInt8) << 4) & 0x3f) $ (b >> 6) % UInt8),
168+
0x80 | ((b % UInt8) & 0x3f))
169+
else
170+
# b is not a low surrogate: write the unpaired high surrogate `a` as a
# 3-byte sequence
push!(dst, 0xe0 | ((a >> 12) % UInt8),
171+
0x80 | (((a >> 6) % UInt8) & 0x3f),
172+
0x80 | ((a % UInt8) & 0x3f))
173+
# then re-examine b as the next unit (i already points at it)
a = b; continue
174+
end
175+
else
176+
# 1-unit high UTF-16 or unpaired high surrogate
177+
# either way, encode as 3-byte UTF-8 code point
178+
# (a low surrogate that does not follow a high surrogate also ends up here)
push!(dst, 0xe0 | ((a >> 12) % UInt8),
179+
0x80 | (((a >> 6) % UInt8) & 0x3f),
180+
0x80 | ((a % UInt8) & 0x3f))
181+
end
182+
# advance to the next unread unit, or stop when src is exhausted
i < n || break
183+
a = src[i += 1]
184+
end
185+
return dst
103186
end
104187

105188
# deferring (or un-deferring) ctrl-c handler for external C code that

test/misc.jl

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,3 +210,170 @@ end
210210
whos(IOBuffer(), Tmp14173) # warm up
211211
@test @allocated(whos(IOBuffer(), Tmp14173)) < 10000
212212

213+
## test conversions between UTF-8 and UTF-16 (for Windows APIs)
214+
import Base: utf8to16, utf16to8
215+
216+
# empty arrays
# (both converters are expected to return an empty result for empty input)
217+
@test utf8to16(UInt8[]) == UInt16[]
218+
@test utf16to8(UInt16[]) == UInt8[]
219+
220+
# UTF-8-like sequences
# Each entry is a tuple (UTF-8 bytes, UTF-16 units utf8to16 is expected to return).
# Overlong encodings and encoded surrogate code points are listed here because
# they are expected to be decoded to a value rather than passed through.
221+
V8 = [
222+
# 1-byte (ASCII)
223+
([0x00],[0x0000])
224+
([0x0a],[0x000a])
225+
([0x7f],[0x007f])
226+
# 2-byte
227+
([0xc0,0x80],[0x0000]) # overlong encoding
228+
([0xc1,0xbf],[0x007f]) # overlong encoding
229+
([0xc2,0x80],[0x0080])
230+
([0xc3,0xbf],[0x00ff])
231+
([0xc4,0x80],[0x0100])
232+
([0xc4,0xa3],[0x0123])
233+
([0xdf,0xbf],[0x07ff])
234+
# 3-byte
235+
([0xe0,0x80,0x80],[0x0000]) # overlong encoding
236+
([0xe0,0x81,0xbf],[0x007f]) # overlong encoding
237+
([0xe0,0x82,0x80],[0x0080]) # overlong encoding
238+
([0xe0,0x9f,0xbf],[0x07ff]) # overlong encoding
239+
([0xe0,0xa0,0x80],[0x0800])
240+
([0xe0,0xa2,0x9a],[0x089a])
241+
([0xe1,0x88,0xb4],[0x1234])
242+
([0xea,0xaf,0x8d],[0xabcd])
243+
([0xed,0x9f,0xbf],[0xd7ff])
244+
([0xed,0xa0,0x80],[0xd800]) # invalid code point – high surrogate
245+
([0xed,0xaf,0xbf],[0xdbff]) # invalid code point – high surrogate
246+
([0xed,0xb0,0x80],[0xdc00]) # invalid code point – low surrogate
247+
([0xed,0xbf,0xbf],[0xdfff]) # invalid code point – low surrogate
248+
([0xee,0x80,0x80],[0xe000])
249+
([0xef,0xbf,0xbf],[0xffff])
250+
# 4-byte
251+
([0xf0,0x80,0x80,0x80],[0x0000]) # overlong encoding
252+
([0xf0,0x80,0x81,0xbf],[0x007f]) # overlong encoding
253+
([0xf0,0x80,0x82,0x80],[0x0080]) # overlong encoding
254+
([0xf0,0x80,0x9f,0xbf],[0x07ff]) # overlong encoding
255+
([0xf0,0x80,0xa0,0x80],[0x0800]) # overlong encoding
256+
([0xf0,0x8f,0xbf,0xbf],[0xffff]) # overlong encoding
257+
([0xf0,0x90,0x80,0x80],[0xd800,0xdc00]) # U+10000
258+
([0xf0,0x90,0x8d,0x88],[0xd800,0xdf48]) # U+10348
259+
([0xf0,0x90,0x90,0xb7],[0xd801,0xdc37]) # U+10437
260+
([0xf0,0xa4,0xad,0xa2],[0xd852,0xdf62]) # U+24b62
261+
([0xf2,0xab,0xb3,0x9e],[0xda6f,0xdcde]) # U+abcde
262+
([0xf3,0xbf,0xbf,0xbf],[0xdbbf,0xdfff]) # U+fffff
263+
([0xf4,0x80,0x80,0x80],[0xdbc0,0xdc00]) # U+100000
264+
([0xf4,0x8a,0xaf,0x8d],[0xdbea,0xdfcd]) # U+10abcd
265+
([0xf4,0x8f,0xbf,0xbf],[0xdbff,0xdfff]) # U+10ffff
266+
]
267+
268+
# non UTF-8-like sequences
# Byte sequences that utf8to16 is expected to pass through unchanged, one UInt16
# per byte (the expected outputs are built as I8 at the end of this section).
269+
X8 = Vector{UInt8}[
270+
# invalid 1-byte sequences
271+
[0x80], # 1 leading ones
272+
[0xbf],
273+
[0xc0], # 2 leading ones
274+
[0xdf],
275+
[0xe0], # 3 leading ones
276+
[0xef],
277+
[0xf0], # 4 leading ones
278+
[0xf7],
279+
[0xf8], # 5 leading ones
280+
[0xfb],
281+
[0xfc], # 6 leading ones
282+
[0xfd],
283+
[0xfe], # 7 leading ones
284+
[0xff], # 8 leading ones
285+
# other invalid sequences
286+
[0xf4,0x90,0xbf,0xbf],
287+
[0xf4,0x91,0x80,0x80],
288+
[0xf7,0x80,0x80,0x80],
289+
[0xf7,0xbf,0xbf,0xbf],
290+
[0xf8,0x80,0x80,0x80],
291+
[0xf8,0xbf,0xbf,0xbf],
292+
[0xff,0x80,0x80,0x80],
293+
[0xff,0xbf,0xbf,0xbf],
294+
]
295+
296+
# Extend X8 with every contiguous sub-slice of length >= 2, other than the whole
# sequence, of each sequence in V8 and X8: when i == 1 the upper bound for j
# stops one short of the end. Slices already present in X8 are not added again.
for s in [map(first,V8); X8],
297+
i = 1:length(s)-1,
298+
j = i+1:length(s)-(i==1)
299+
ss = s[i:j]
300+
ss in X8 || push!(X8, ss)
301+
end
302+
# order the sequences lexicographically, then by length
sort!(X8, lt=lexless)
303+
sort!(X8, by=length)
304+
305+
# pair each invalid sequence with its expected output: the same values as UInt16
I8 = [(s,map(UInt16,s)) for s in X8]
306+
307+
# Check utf8to16 on single sequences and on concatenations of two and three
# sequences. (X,Y,Z) selects which table (valid V8 or invalid I8) each piece is
# drawn from; the conversion of a concatenation is expected to equal the
# concatenation of the pieces' individual expected outputs.
for (X,Y,Z) in ((V8,V8,V8), (I8,V8,I8), (V8,I8,V8), (V8,V8,I8), (I8,V8,V8))
308+
for (a8, a16) in X
309+
@test utf8to16(a8) == a16
310+
for (b8, b16) in Y
311+
ab8 = [a8; b8]
312+
ab16 = [a16; b16]
313+
@test utf8to16(ab8) == ab16
314+
for (c8, c16) in Z
315+
abc8 = [ab8; c8]
316+
abc16 = [ab16; c16]
317+
@test utf8to16(abc8) == abc16
318+
end
319+
end
320+
end
321+
end
322+
323+
# UTF-16-like sequences
# Each entry is a tuple (UTF-16 units, UTF-8 bytes utf16to8 is expected to return);
# the loop that follows also checks the reverse direction with utf8to16.
324+
V16 = [
325+
# 1-unit UTF-16, 1-byte UTF-8 (ASCII)
326+
([0x0000],[0x00])
327+
([0x000a],[0x0a])
328+
([0x007f],[0x7f])
329+
# 1-unit UTF-16, 2-byte UTF-8
330+
([0x0080],[0xc2,0x80])
331+
([0x00ff],[0xc3,0xbf])
332+
([0x0100],[0xc4,0x80])
333+
([0x0123],[0xc4,0xa3])
334+
([0x07ff],[0xdf,0xbf])
335+
# 1-unit UTF-16, 3-byte UTF-8
336+
([0x0800],[0xe0,0xa0,0x80])
337+
([0x089a],[0xe0,0xa2,0x9a])
338+
([0x1234],[0xe1,0x88,0xb4])
339+
([0xabcd],[0xea,0xaf,0x8d])
340+
([0xd7ff],[0xed,0x9f,0xbf])
341+
([0xe000],[0xee,0x80,0x80])
342+
([0xffff],[0xef,0xbf,0xbf])
343+
# 2-unit UTF-16, 4-byte UTF-8
344+
([0xd800,0xdc00],[0xf0,0x90,0x80,0x80]) # U+10000
345+
([0xd800,0xdf48],[0xf0,0x90,0x8d,0x88]) # U+10348
346+
([0xd801,0xdc37],[0xf0,0x90,0x90,0xb7]) # U+10437
347+
([0xd852,0xdf62],[0xf0,0xa4,0xad,0xa2]) # U+24b62
348+
([0xda6f,0xdcde],[0xf2,0xab,0xb3,0x9e]) # U+abcde
349+
([0xdbbf,0xdfff],[0xf3,0xbf,0xbf,0xbf]) # U+fffff
350+
([0xdbc0,0xdc00],[0xf4,0x80,0x80,0x80]) # U+100000
351+
([0xdbea,0xdfcd],[0xf4,0x8a,0xaf,0x8d]) # U+10abcd
352+
([0xdbff,0xdfff],[0xf4,0x8f,0xbf,0xbf]) # U+10ffff
353+
]
354+
355+
# lone surrogates: each is expected to be written as a 3-byte sequence
I16 = [
356+
([0xd800],[0xed,0xa0,0x80]) # high surrogate
357+
([0xdbff],[0xed,0xaf,0xbf]) # high surrogate
358+
([0xdc00],[0xed,0xb0,0x80]) # low surrogate
359+
([0xdfff],[0xed,0xbf,0xbf]) # low surrogate
360+
]
361+
362+
# Check both directions (utf16to8, then utf8to16 on the expected bytes) for single
# sequences and for concatenations of two and three sequences. (X,Y,Z) selects
# which table (V16 or the lone-surrogate table I16) each piece is drawn from.
for (X,Y,Z) in ((V16,V16,V16), (I16,V16,I16), (V16,I16,V16), (V16,V16,I16), (I16,V16,V16))
363+
for (a16, a8) in X
364+
@test utf16to8(a16) == a8
365+
@test utf8to16(a8) == a16
366+
for (b16, b8) in Y
367+
ab16 = [a16; b16]
368+
ab8 = [a8; b8]
369+
@test utf16to8(ab16) == ab8
370+
@test utf8to16(ab8) == ab16
371+
for (c16, c8) in Z
372+
abc16 = [ab16; c16]
373+
abc8 = [ab8; c8]
374+
@test utf16to8(abc16) == abc8
375+
@test utf8to16(abc8) == abc16
376+
end
377+
end
378+
end
379+
end

0 commit comments

Comments
 (0)