11# This file is a part of Julia. License is MIT: http://julialang.org/license
22
3- utf16_is_lead (c:: UInt16 ) = (c & 0xfc00 ) == 0xd800
4- utf16_is_trail (c:: UInt16 ) = (c & 0xfc00 ) == 0xdc00
5- utf16_is_surrogate (c:: UInt16 ) = (c & 0xf800 ) == 0xd800
6- utf16_get_supplementary (lead:: UInt16 , trail:: UInt16 ) = Char (UInt32 (lead- 0xd7f7 )<< 10 + trail)
# Build an `S` string from `dat`: allocate `len+1` elements of type `T`, fill them
# from `dat`, and force the final element to \0.  When `flag` is true only `len`
# elements are taken from `dat`; otherwise `len+1` elements are copied before the
# terminator is overwritten.
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
                  ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
    total = len + 1
    buf = Vector{T}(total)
    ncopy = flag ? len : total
    copy!(buf, 1, dat, 1, ncopy)
    buf[total] = 0
    return S(buf)
end
8+
# Finish decoding a 3-byte UTF-8 sequence.  `ch` is the lead byte and `pos` is the
# index in `dat` of the last continuation byte (the middle byte is at `pos-1`).
@inline function get_utf8_3byte(dat, pos, ch)
    @inbounds begin
        top = (ch & 0xf) << 12
        mid = UInt32(dat[pos-1] & 0x3f) << 6
        low = dat[pos] & 0x3f
    end
    return top | mid | low
end
# Finish decoding a 4-byte UTF-8 sequence.  `ch` is the lead byte and `pos` is the
# index in `dat` of the last continuation byte (the others are at `pos-2`, `pos-1`).
@inline function get_utf8_4byte(dat, pos, ch)
    @inbounds begin
        b1 = (ch & 0x7) << 18
        b2 = UInt32(dat[pos-2] & 0x3f) << 12
        b3 = UInt32(dat[pos-1] & 0x3f) << 6
        b4 = dat[pos] & 0x3f
    end
    return b1 | b2 | b3 | b4
end
20+
# Write `ch` as a 4-byte UTF-8 sequence into `buf` at positions `out+1` .. `out+4`.
# The caller is responsible for advancing `out` afterwards.
@inline function output_utf8_4byte!(buf, out, ch)
    lead  = 0xf0 | (ch >>> 18)
    cont1 = 0x80 | ((ch >>> 12) & 0x3f)
    cont2 = 0x80 | ((ch >>> 6) & 0x3f)
    cont3 = 0x80 | (ch & 0x3f)
    @inbounds begin
        buf[out + 1] = lead
        buf[out + 2] = cont1
        buf[out + 3] = cont2
        buf[out + 4] = cont3
    end
end
30+
# Shared empty UTF-16 string: its data holds only the terminating 0 code unit.
31+ const empty_utf16 = UTF16String (UInt16[0 ])
732
# Number of characters in `s`: every code unit except the trailing terminator is
# counted, unless it is the trail half of a surrogate pair.
function length(s::UTF16String)
    units = s.data
    nunits = length(units) - 1      # last element is the terminator
    nunits == 0 && return 0
    nchars = 0
    for idx = 1:nunits
        @inbounds if !is_surrogate_trail(units[idx])
            nchars += 1
        end
    end
    return nchars
end
@@ -20,100 +45,240 @@ function endof(s::UTF16String)
2045 d = s. data
2146 i = length (d) - 1
2247 i == 0 && return i
23- utf16_is_surrogate (d[i]) ? i- 1 : i
48+ return is_surrogate_codeunit (d[i]) ? i- 1 : i
2449end
2550
# Combine a lead and a trail surrogate code unit into one UInt32 code point.
# The shift is applied before the addition: `((lead - 0xd7f7) << 10) + trail`.
function get_supplementary(lead::Unsigned, trail::Unsigned)
    high = UInt32(lead - 0xd7f7) << 10
    return high + trail
end
52+
# Decode the character that starts at code-unit index `i` of `s`.
# Returns `(character, index of the next character)`.
# Throws `UnicodeError` when `s.data[i]` is a surrogate code unit that is not the
# lead half of a complete lead/trail pair.
function next(s::UTF16String, i::Int)
    ch = s.data[i]
    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
    # check length, account for terminating \0
    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
    ct = s.data[i+1]
    # raise a UnicodeError like the other failure paths (not a bare tuple)
    !is_surrogate_trail(ct) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, i, ch))
    Char(get_supplementary(ch, ct)), i+2
end
3463
# Map index `i` of the reversed string back to an index into `s`.  If the mirrored
# code unit is a trail surrogate, step back one more unit.
function reverseind(s::UTF16String, i::Integer)
    mirrored = length(s.data) - i
    if is_surrogate_trail(s.data[mirrored])
        return mirrored - 1
    else
        return mirrored
    end
end
3968
# Index of the last code unit of `s`; s.data includes a NULL terminator, which is
# not counted.
function lastidx(s::UTF16String)
    return length(s.data) - 1
end
4170
# Return a new UTF16String with the characters of `s` in reverse order.
# Code units are copied back-to-front; when a lead surrogate is met, it is swapped
# with the unit written just before it so the pair stays in lead/trail order.
function reverse(s::UTF16String)
    d = s.data
    out = similar(d)
    out[end] = 0 # NULL termination
    n = length(d)
    @inbounds for i = 1:n-1
        ch = d[n-i]
        # `i > 1` keeps the swap from touching out[0] (bounds checks are off here)
        # if the data ends with an unpaired lead surrogate.
        if i > 1 && is_surrogate_lead(ch)
            out[i], out[i-1] = out[i-1], ch
        else
            out[i] = ch
        end
    end
    UTF16String(out)
end
86+
# Size of `s` in bytes, not counting the one UInt16 terminator stored in `s.data`.
function sizeof(s::UTF16String)
    return sizeof(s.data) - sizeof(UInt16)
end
88+
# Check that `data` contains no unpaired surrogate code units.
# Returns `true` when every surrogate is part of a lead/trail pair, `false` otherwise.
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    n = length(data) # this may include NULL termination; that's okay
    pos = 1
    @inbounds while pos < n
        unit = data[pos]
        if is_surrogate_lead(unit) && is_surrogate_trail(data[pos+1])
            # well-formed pair: skip both halves
            pos += 2
            continue
        end
        # any other surrogate here is unpaired
        is_surrogate_codeunit(unit) && return false
        pos += 1
    end
    # a final unit that was not consumed by a pair must not be a surrogate
    return pos > n || !is_surrogate_codeunit(data[pos])
end
55103
56- # TODO : optimize this
57- function encode16 (s:: AbstractString )
58- buf = UInt16[]
59- for ch in s
60- c = reinterpret (UInt32, ch)
"""
Converts an `AbstractString` to a `UTF16String`

### Returns:
* `UTF16String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF16String}, str::AbstractString)
    len, flags, num4byte = unsafe_checkstring(str)
    # one code unit per character, one extra per 4-byte character, plus terminator
    buf = Vector{UInt16}(len+num4byte+1)
    out = 0
    @inbounds for ch in str
        c = UInt32(ch)
        if c < 0x10000
            buf[out += 1] = UInt16(c)
        else
            # output surrogate pair (computed from the UInt32 value `c`, not the Char)
            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10))
            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
        end
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    UTF16String(buf)
end
73130
74- utf16 (x) = convert (UTF16String, x)
75- convert (:: Type{UTF16String} , s:: UTF16String ) = s
76- convert (:: Type{UTF16String} , s:: AbstractString ) = encode16 (s)
77- convert (:: Type{Array{UInt16,1}} , s:: UTF16String ) = s. data
78- convert (:: Type{Array{UInt16}} , s:: UTF16String ) = s. data
"""
Converts a `UTF8String` to a `UTF16String`

### Returns:
* `UTF16String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF16String}, str::UTF8String)
    bytes = str.data
    # zero-length input: hand back the shared empty string
    sizeof(bytes) == 0 && return empty_utf16
    # validate the UTF-8 input and learn how many 16-bit units are needed
    nchars, flags, num4byte = unsafe_checkstring(bytes)
    nunits = nchars + num4byte
    buf = Vector{UInt16}(nunits+1)
    @inbounds buf[nunits+1] = 0
    # no flags set: the bytes can be copied across unchanged
    if flags == 0
        @inbounds return UTF16String(copy!(buf, bytes))
    end
    out = 0
    pos = 0
    @inbounds while out < nunits
        pos += 1
        ch::UInt32 = bytes[pos]
        if ch <= 0x7f
            # ASCII
            out += 1
            buf[out] = ch
        elseif ch < 0xe0
            # 2-byte sequence, range 0x80-0x7ff
            pos += 1
            out += 1
            buf[out] = ((ch & 0x1f) << 6) | (bytes[pos] & 0x3f)
        elseif ch < 0xf0
            # 3-byte sequence, range 0x800-0xffff
            pos += 2
            out += 1
            buf[out] = get_utf8_3byte(bytes, pos, ch)
        else
            # 4-byte sequence, range 0x10000-0x10ffff: emit a surrogate pair
            pos += 3
            ch = get_utf8_4byte(bytes, pos, ch)
            buf[out + 1] = UInt16(0xd7c0 + (ch >>> 10))
            buf[out + 2] = UInt16(0xdc00 + (ch & 0x3ff))
            out += 2
        end
    end
    return UTF16String(buf)
end
87176
88- function isvalid (:: Type{UTF16String} , data:: AbstractArray{UInt16} )
89- i = 1
90- n = length (data) # this may include NULL termination; that's okay
91- while i < n # check for unpaired surrogates
92- if utf16_is_lead (data[i]) && utf16_is_trail (data[i+ 1 ])
93- i += 2
94- elseif utf16_is_surrogate (data[i])
95- return false
"""
Converts a UTF-16 encoded vector of `UInt16` to a `UTF8String`

### Returns:
* `UTF8String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, dat::Vector{UInt16})
    len = sizeof(dat)
    # handle zero length string quickly
    len == 0 && return empty_utf8
    # get number of bytes to allocate (len>>>1 turns the byte size into a unit count)
    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>1)
    # no flags set: copy the code units straight into bytes
    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end
195+
"""
Converts a `UTF16String` to a `UTF8String`

### Returns:
* `UTF8String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, str::UTF16String)
    units = str.data
    nunits = sizeof(units) >>> 1
    # nothing but the terminator (or less): return the shared empty string
    nunits <= 1 && return empty_utf8
    # validate everything before the terminator and get the per-width counts
    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(units, 1, nunits-1)
    if flags == 0
        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, units, 1, len))
    end
    nbytes = len + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt16, units, nbytes)
end
215+
"""
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`

### Input Arguments:
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\\0` is not converted
* `len` length of output in bytes

### Returns:
* `UTF8String`
"""
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
    buf = Vector{UInt8}(len)
    out = 0
    pos = 0
    # the loop is driven by the number of bytes written, not by the input length
    @inbounds while out < len
        ch::UInt32 = dat[pos += 1]
        # Handle ASCII characters
        if ch <= 0x7f
            buf[out += 1] = ch
        # Handle 0x80-0x7ff
        elseif ch < 0x800
            buf[out += 1] = 0xc0 | (ch >>> 6)
            buf[out += 1] = 0x80 | (ch & 0x3f)
        # Handle 0x10000-0x10ffff (if input is UInt32)
        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
            output_utf8_4byte!(buf, out, ch)
            out += 4
        # Handle surrogate pairs
        elseif is_surrogate_codeunit(ch)
            output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
            out += 4
        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
        else
            # ch <= 0xffff here, so the lead byte carries only the top 4 bits
            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x0f)
            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
            buf[out += 1] = 0x80 | (ch & 0x3f)
        end
    end
    UTF8String(buf)
end
102256
# Convert an ASCIIString to a UTF16String by copying its bytes into a terminated
# UInt16 buffer via `fast_utf_copy`.
function convert(::Type{UTF16String}, str::ASCIIString)
    bytes = str.data
    nbytes = length(bytes)
    @inbounds return fast_utf_copy(UTF16String, UInt16, nbytes, bytes, true)
end
110261
# Converting a UTF16String to a UInt16 vector/array returns its `data` field itself.
function convert(::Type{Vector{UInt16}}, str::UTF16String)
    return str.data
end
function convert(::Type{Array{UInt16}}, str::UTF16String)
    return str.data
end

# Converting a UTF16String to UTF16String returns the argument unchanged.
function convert(::Type{UTF16String}, str::UTF16String)
    return str
end

# Reinterpret `pointer(s)` as a `Ptr{Int16}` or `Ptr{UInt16}`.
function unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String)
    return convert(Ptr{T}, pointer(s))
end
269+
# Any UInt16 array: reshape to one dimension and convert that.
function convert(T::Type{UTF16String}, data::AbstractArray{UInt16})
    flat = reshape(data, length(data))
    return convert(T, flat)
end

# Int16 arrays are reinterpreted as UInt16 and then converted.
function convert(T::Type{UTF16String}, data::AbstractArray{Int16})
    unsigned_view = reinterpret(UInt16, data)
    return convert(T, unsigned_view)
end
116275
# Validate `data` as UTF-16 and copy it into a new, terminated UTF16String.
# Throws `UnicodeError(UTF_ERR_INVALID_16, 0, 0)` when `isvalid` rejects the input.
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
    if !isvalid(UTF16String, data)
        throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
    end
    len = length(data)
    @inbounds begin
        buf = Vector{UInt16}(len+1)
        copy!(buf, 1, data, 1, len)
        buf[len+1] = 0      # terminator
        return UTF16String(buf)
    end
end
281+
117282function convert (T:: Type{UTF16String} , bytes:: AbstractArray{UInt8} )
118283 isempty (bytes) && return UTF16String (UInt16[0 ])
119284 isodd (length (bytes)) && throw (UnicodeError (UTF_ERR_ODD_BYTES_16, length (bytes), 0 ))
@@ -136,6 +301,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
136301 UTF16String (d)
137302end
138303
# `convert(::Type{UTF16String}, str::UTF16String)` is already defined earlier in this
# file, so the identical method is not repeated here.

# `utf16(x)` is shorthand for `convert(UTF16String, x)`.
utf16(x) = convert(UTF16String, x)
# Pointer plus length: wrap the memory as an array and convert it.
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
# Int16 pointers are converted to Ptr{UInt16} first.
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
141309function utf16 (p:: Union{Ptr{UInt16}, Ptr{Int16}} )
0 commit comments