utf8to16 and utf16to8 helpers for Windows APIs

StefanKarpinski · StefanKarpinski · commit 93956325b959 · 2016-02-08T19:33:51.000-05:00
diff --git a/base/c.jl b/base/c.jl
@@ -93,13 +93,96 @@ end
 # symbols are guaranteed not to contain embedded NUL
 convert(::Type{Cstring}, s::Symbol) = Cstring(unsafe_convert(Ptr{Cchar}, s))
 
-# conversion between UTF-8-like data and UTF-16-like data for Windows APIs
-function utf8_to_utf16!(src::Vector{UInt8}, dst::Vector{UInt16})
-
+# conversions between UTF-8 and UTF-16 for Windows APIs
+
+function utf8to16(src::Vector{UInt8}) # transcode UTF-8-like bytes into a new Vector{UInt16} of UTF-16-like code units; malformed bytes are passed through, never rejected
+    dst = UInt16[]
+    i, n = 1, length(src) # i: index of the last byte read from src; n: number of input bytes
+    n > 0 || return dst # empty input => empty output
+    sizehint!(dst, 2n) # NOTE(review): every path below emits at most one unit per input byte, so 2n looks like an over-reservation -- confirm
+    a = src[1] # a always holds the byte that starts the character being decoded
+    while true
+        if i < n && -64 <= a % Int8 <= -12 # lead byte 0xc0-0xf4 (as Int8: -64..-12) followed by at least one more byte: multi-byte character
+            b = src[i += 1]
+            if -64 <= (b % Int8) || a == 0xf4 && 0x8f < b # b is not in 0x80-0xbf, or lead 0xf4 with b above 0x8f
+                # invalid UTF-8 (non-continuation or too-high code point)
+                push!(dst, a) # emit the lead byte unchanged
+                a = b; continue # re-examine b as the start of the next character
+            elseif a < 0xe0 # 2-byte UTF-8
+                push!(dst, 0x3080 $ (UInt16(a) << 6) $ b) # 0x3080 cancels the 0xc0 lead tag (shifted left by 6) and the 0x80 continuation tag
+            elseif i < n # 3/4-byte character
+                c = src[i += 1]
+                if -64 <= (c % Int8) # invalid UTF-8 (non-continuation)
+                    push!(dst, a, b) # emit both consumed bytes unchanged
+                    a = c; continue # re-examine c as the start of the next character
+                elseif a < 0xf0 # 3-byte UTF-8
+                    push!(dst, 0x2080 $ (UInt16(a) << 12) $ (UInt16(b) << 6) $ c) # the lead tag shifts out of the UInt16; 0x2080 cancels the two continuation tags
+                elseif i < n
+                    d = src[i += 1]
+                    if -64 <= (d % Int8) # invalid UTF-8 (non-continuation)
+                        push!(dst, a, b, c) # emit the three consumed bytes unchanged
+                        a = d; continue # re-examine d as the start of the next character
+                    elseif a == 0xf0 && b < 0x90 # overlong encoding
+                        push!(dst, 0x2080 $ (UInt16(b) << 12) $ (UInt16(c) << 6) $ d) # value fits in one unit: combine b, c, d exactly like a 3-byte sequence
+                    else # 4-byte UTF-8
+                        push!(dst, 0xe5b8 + (UInt16(a) << 8) + (UInt16(b) << 2) + (c >> 4), # high surrogate; the wrapping UInt16 sum with 0xe5b8 removes the tag bits and applies the 0xd800 base minus 0x40
+                                   0xdc80 $ (UInt16(c & 0xf) << 6) $ d) # low surrogate: 0xdc00 base with the low 10 payload bits (0x80 of the constant cancels d's tag)
+                    end
+                else # too short
+                    push!(dst, a, b, c) # input ended mid-sequence: emit the bytes unchanged
+                    break
+                end
+            else # too short
+                push!(dst, a, b) # input ended mid-sequence: emit the bytes unchanged
+                break
+            end
+        else # ASCII, a lead byte that is the last byte of src, or invalid UTF-8 (continuation byte or byte above 0xf4)
+            push!(dst, a) # byte value becomes the code unit unchanged
+        end
+        i < n || break # all input consumed
+        a = src[i += 1] # advance to the first byte of the next character
+    end
+    return dst
 end
 
-function utf16_to_utf8!(src::Vector{UInt16}, dst::Vector{UInt8})
-
+function utf16to8(src::Vector{UInt16}) # transcode UTF-16-like code units into a new Vector{UInt8} of UTF-8-like bytes; unpaired surrogates are encoded as 3 bytes, never rejected
+    dst = UInt8[]
+    i, n = 1, length(src) # i: index of the last unit read from src; n: number of input units
+    n > 0 || return dst # empty input => empty output
+    sizehint!(dst, n) # capacity hint only; units at or above 0x80 emit more than one byte and dst grows via push!
+    a = src[1] # a always holds the code unit currently being encoded
+    while true
+        if a < 0x80 # ASCII
+            push!(dst, a % UInt8)
+        elseif a < 0x800 # 2-byte UTF-8
+            push!(dst, 0xc0 | ((a >> 6) % UInt8), # lead byte: tag 0xc0 plus the bits above the low 6
+                       0x80 | ((a % UInt8) & 0x3f)) # continuation byte: low 6 bits
+        elseif a & 0xfc00 == 0xd800 && i < n # high surrogate (0xd800-0xdbff) with at least one more unit after it
+            b = src[i += 1]
+            if (b & 0xfc00) == 0xdc00 # b is a low surrogate (0xdc00-0xdfff)
+                # 2-unit UTF-16 sequence => 4-byte UTF-8
+                a += 0x2840 # wraps in UInt16: drops the 0xd800 base and adds 0x40, i.e. the 0x10000 offset shifted right by 10
+                push!(dst, 0xf0 | ((a >> 8) % UInt8), # lead byte: tag 0xf0 plus the top 3 bits
+                           0x80 | ((a % UInt8) >> 2), # next 6 bits
+                           0xf0 $ ((((a % UInt8) << 4) & 0x3f) $ (b >> 6) % UInt8), # low 2 bits of a and top 4 payload bits of b; 0xf0 turns the 0x70 left over from b's surrogate tag into the 0x80 continuation tag
+                           0x80 | ((b % UInt8) & 0x3f)) # low 6 bits of b
+            else # a is a high surrogate not followed by a low surrogate
+                push!(dst, 0xe0 | ((a >> 12) % UInt8), # encode the lone surrogate as an ordinary 3-byte sequence
+                           0x80 | (((a >> 6) % UInt8) & 0x3f),
+                           0x80 | ((a % UInt8) & 0x3f))
+                a = b; continue # b was already read, so re-examine it as the start of the next character
+            end
+        else
+            # 1-unit high UTF-16 or unpaired high surrogate
+            # either way, encode as 3-byte UTF-8 code point
+            push!(dst, 0xe0 | ((a >> 12) % UInt8), # lead byte: tag 0xe0 plus the top 4 bits
+                       0x80 | (((a >> 6) % UInt8) & 0x3f), # middle 6 bits
+                       0x80 | ((a % UInt8) & 0x3f)) # low 6 bits
+        end
+        i < n || break # all input consumed
+        a = src[i += 1] # advance to the next code unit
+    end
+    return dst
 end
 
 # deferring (or un-deferring) ctrl-c handler for external C code that
diff --git a/test/misc.jl b/test/misc.jl
@@ -210,3 +210,170 @@ end
 whos(IOBuffer(), Tmp14173) # warm up
 @test @allocated(whos(IOBuffer(), Tmp14173)) < 10000
 
+## test conversions between UTF-8 and UTF-16, in both directions (for Windows APIs)
+import Base: utf8to16, utf16to8
+
+# empty arrays: each direction must return an empty vector of the target element type
+@test utf8to16(UInt8[]) == UInt16[]
+@test utf16to8(UInt16[]) == UInt8[]
+
+# UTF-8-like sequences: (input bytes, expected UTF-16 code units) pairs for utf8to16
+V8 = [
+    # 1-byte (ASCII)
+    ([0x00],[0x0000])
+    ([0x0a],[0x000a])
+    ([0x7f],[0x007f])
+    # 2-byte
+    ([0xc0,0x80],[0x0000]) # overlong encoding
+    ([0xc1,0xbf],[0x007f]) # overlong encoding
+    ([0xc2,0x80],[0x0080])
+    ([0xc3,0xbf],[0x00ff])
+    ([0xc4,0x80],[0x0100])
+    ([0xc4,0xa3],[0x0123])
+    ([0xdf,0xbf],[0x07ff])
+    # 3-byte
+    ([0xe0,0x80,0x80],[0x0000]) # overlong encoding
+    ([0xe0,0x81,0xbf],[0x007f]) # overlong encoding
+    ([0xe0,0x82,0x80],[0x0080]) # overlong encoding
+    ([0xe0,0x9f,0xbf],[0x07ff]) # overlong encoding
+    ([0xe0,0xa0,0x80],[0x0800])
+    ([0xe0,0xa2,0x9a],[0x089a])
+    ([0xe1,0x88,0xb4],[0x1234])
+    ([0xea,0xaf,0x8d],[0xabcd])
+    ([0xed,0x9f,0xbf],[0xd7ff])
+    ([0xed,0xa0,0x80],[0xd800]) # invalid code point – high surrogate
+    ([0xed,0xaf,0xbf],[0xdbff]) # invalid code point – high surrogate
+    ([0xed,0xb0,0x80],[0xdc00]) # invalid code point – low surrogate
+    ([0xed,0xbf,0xbf],[0xdfff]) # invalid code point – low surrogate
+    ([0xee,0x80,0x80],[0xe000])
+    ([0xef,0xbf,0xbf],[0xffff])
+    # 4-byte
+    ([0xf0,0x80,0x80,0x80],[0x0000]) # overlong encoding
+    ([0xf0,0x80,0x81,0xbf],[0x007f]) # overlong encoding
+    ([0xf0,0x80,0x82,0x80],[0x0080]) # overlong encoding
+    ([0xf0,0x80,0x9f,0xbf],[0x07ff]) # overlong encoding
+    ([0xf0,0x80,0xa0,0x80],[0x0800]) # overlong encoding
+    ([0xf0,0x8f,0xbf,0xbf],[0xffff]) # overlong encoding
+    ([0xf0,0x90,0x80,0x80],[0xd800,0xdc00]) # U+10000
+    ([0xf0,0x90,0x8d,0x88],[0xd800,0xdf48]) # U+10348
+    ([0xf0,0x90,0x90,0xb7],[0xd801,0xdc37]) # U+10437
+    ([0xf0,0xa4,0xad,0xa2],[0xd852,0xdf62]) # U+24b62
+    ([0xf2,0xab,0xb3,0x9e],[0xda6f,0xdcde]) # U+abcde
+    ([0xf3,0xbf,0xbf,0xbf],[0xdbbf,0xdfff]) # U+fffff
+    ([0xf4,0x80,0x80,0x80],[0xdbc0,0xdc00]) # U+100000
+    ([0xf4,0x8a,0xaf,0x8d],[0xdbea,0xdfcd]) # U+10abcd
+    ([0xf4,0x8f,0xbf,0xbf],[0xdbff,0xdfff]) # U+10ffff
+]
+
+# non UTF-8-like sequences: byte strings that utf8to16 is expected to pass through byte-for-byte
+X8 = Vector{UInt8}[
+    # invalid 1-byte sequences
+    [0x80], # 1 leading one
+    [0xbf],
+    [0xc0], # 2 leading ones
+    [0xdf],
+    [0xe0], # 3 leading ones
+    [0xef],
+    [0xf0], # 4 leading ones
+    [0xf7],
+    [0xf8], # 5 leading ones
+    [0xfb],
+    [0xfc], # 6 leading ones
+    [0xfd],
+    [0xfe], # 7 leading ones
+    [0xff], # 8 leading ones
+    # other invalid sequences
+    [0xf4,0x90,0xbf,0xbf], # lead 0xf4 with second byte above 0x8f: code point above U+10ffff
+    [0xf4,0x91,0x80,0x80],
+    [0xf7,0x80,0x80,0x80], # lead byte above 0xf4
+    [0xf7,0xbf,0xbf,0xbf],
+    [0xf8,0x80,0x80,0x80],
+    [0xf8,0xbf,0xbf,0xbf],
+    [0xff,0x80,0x80,0x80],
+    [0xff,0xbf,0xbf,0xbf],
+]
+
+for s in [map(first,V8); X8], # every byte sequence from V8 plus a snapshot of the X8 entries listed so far
+    i = 1:length(s)-1, # slice start
+    j = i+1:length(s)-(i==1) # slice end: at least 2 bytes long, and never the whole of s (the end is capped when i == 1)
+    ss = s[i:j]
+    ss in X8 || push!(X8, ss) # append the slice unless it is already present
+end
+sort!(X8, lt=lexless) # order lexicographically first ...
+sort!(X8, by=length) # ... then by length; presumably relies on sort! being stable so equal lengths stay in lexicographic order
+
+I8 = [(s,map(UInt16,s)) for s in X8] # expected output for each invalid sequence: its bytes widened to UInt16 unchanged
+
+for (X,Y,Z) in ((V8,V8,V8), (I8,V8,I8), (V8,I8,V8), (V8,V8,I8), (I8,V8,V8)) # which table supplies the 1st, 2nd and 3rd piece
+    for (a8, a16) in X
+        @test utf8to16(a8) == a16 # single sequence
+        for (b8, b16) in Y
+            ab8 = [a8; b8]
+            ab16 = [a16; b16]
+            @test utf8to16(ab8) == ab16 # two concatenated sequences convert to the concatenation of their expected outputs
+            for (c8, c16) in Z
+                abc8 = [ab8; c8]
+                abc16 = [ab16; c16]
+                @test utf8to16(abc8) == abc16 # same check for three concatenated sequences
+            end
+        end
+    end
+end
+
+# UTF-16-like sequences: (UTF-16 code units, UTF-8 bytes) pairs, checked in both directions
+V16 = [
+    # 1-unit UTF-16, 1-byte UTF-8 (ASCII)
+    ([0x0000],[0x00])
+    ([0x000a],[0x0a])
+    ([0x007f],[0x7f])
+    # 1-unit UTF-16, 2-byte UTF-8
+    ([0x0080],[0xc2,0x80])
+    ([0x00ff],[0xc3,0xbf])
+    ([0x0100],[0xc4,0x80])
+    ([0x0123],[0xc4,0xa3])
+    ([0x07ff],[0xdf,0xbf])
+    # 1-unit UTF-16, 3-byte UTF-8
+    ([0x0800],[0xe0,0xa0,0x80])
+    ([0x089a],[0xe0,0xa2,0x9a])
+    ([0x1234],[0xe1,0x88,0xb4])
+    ([0xabcd],[0xea,0xaf,0x8d])
+    ([0xd7ff],[0xed,0x9f,0xbf])
+    ([0xe000],[0xee,0x80,0x80])
+    ([0xffff],[0xef,0xbf,0xbf])
+    # 2-unit UTF-16, 4-byte UTF-8
+    ([0xd800,0xdc00],[0xf0,0x90,0x80,0x80]) # U+10000
+    ([0xd800,0xdf48],[0xf0,0x90,0x8d,0x88]) # U+10348
+    ([0xd801,0xdc37],[0xf0,0x90,0x90,0xb7]) # U+10437
+    ([0xd852,0xdf62],[0xf0,0xa4,0xad,0xa2]) # U+24b62
+    ([0xda6f,0xdcde],[0xf2,0xab,0xb3,0x9e]) # U+abcde
+    ([0xdbbf,0xdfff],[0xf3,0xbf,0xbf,0xbf]) # U+fffff
+    ([0xdbc0,0xdc00],[0xf4,0x80,0x80,0x80]) # U+100000
+    ([0xdbea,0xdfcd],[0xf4,0x8a,0xaf,0x8d]) # U+10abcd
+    ([0xdbff,0xdfff],[0xf4,0x8f,0xbf,0xbf]) # U+10ffff
+]
+
+I16 = [ # lone surrogate code units paired with their 3-byte encodings
+    ([0xd800],[0xed,0xa0,0x80]) # high surrogate
+    ([0xdbff],[0xed,0xaf,0xbf]) # high surrogate
+    ([0xdc00],[0xed,0xb0,0x80]) # low surrogate
+    ([0xdfff],[0xed,0xbf,0xbf]) # low surrogate
+]
+
+for (X,Y,Z) in ((V16,V16,V16), (I16,V16,I16), (V16,I16,V16), (V16,V16,I16), (I16,V16,V16)) # which table supplies the 1st, 2nd and 3rd piece
+    for (a16, a8) in X
+        @test utf16to8(a16) == a8 # single sequence, UTF-16 => UTF-8
+        @test utf8to16(a8) == a16 # and back again
+        for (b16, b8) in Y
+            ab16 = [a16; b16]
+            ab8 = [a8; b8]
+            @test utf16to8(ab16) == ab8 # two concatenated sequences, both directions
+            @test utf8to16(ab8) == ab16
+            for (c16, c8) in Z
+                abc16 = [ab16; c16]
+                abc8 = [ab8; c8]
+                @test utf16to8(abc16) == abc8 # three concatenated sequences, both directions
+                @test utf8to16(abc8) == abc16
+            end
+        end
+    end
+end