@@ -5,25 +5,277 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
# Index of the last character: the data vector holds one extra trailing element.
function endof(s::UTF32String)
    return length(s.data) - 1
end

# Number of characters: every element of the data vector except the trailing one.
function length(s::UTF32String)
    return length(s.data) - 1
end
77
# Return a new string whose first `length(s)` elements are reversed;
# the last element of the copied data vector is left where it is.
function reverse(s::UTF32String)
    chars = copy(s.data)
    reverse!(chars, 1, length(s))
    return UTF32String(chars)
end

# Size in bytes of the data vector minus one `Char`.
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end
11+
# Shared empty string, built from a single zero word.
const empty_utf32 = UTF32String(UInt32[0])

# Generic entry point: delegate to `convert`.
function utf32(x)
    return convert(UTF32String, x)
end

# A single character becomes a two-element vector: the character followed by a zero.
function convert(::Type{UTF32String}, c::Char)
    return UTF32String(Char[c, Char(0)])
end

# Converting a UTF32String to itself returns the same object.
function convert(::Type{UTF32String}, s::UTF32String)
    return s
end
1117
12- function convert (:: Type{UTF32String} , s:: AbstractString )
13- a = Array (Char, length (s) + 1 )
14- i = 0
15- for c in s
16- a[i += 1 ] = c
"""
Converts an `AbstractString` to a `UTF32String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::AbstractString`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF32String}, str::AbstractString)
    # `len` sizes the output buffer; `flags` is returned by check_string but not used here
    len, flags = check_string(str)
    buf = Vector{Char}(len+1)
    out = 0
    # copy the characters one by one, counting how many were written
    @inbounds for ch in str ; buf[out += 1] = ch ; end
    @inbounds buf[out + 1] = 0 # NULL termination
    UTF32String(buf)
end
39+
"""
Converts a UTF-32 encoded vector of `UInt32` to a `UTF8String`

### Input Arguments:
* `::Type{UTF8String}`
* `dat::Vector{UInt32}` UTF-32 encoded words

### Returns:
* `::UTF8String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, dat::Vector{UInt32})
    nbytes = sizeof(dat)
    # nothing to convert
    if nbytes == 0
        return empty_utf8
    end
    # check all nbytes>>>2 words and collect the per-width counts
    nchars, flags, num4byte, num3byte, num2byte = check_string(dat, nbytes >>> 2)
    if flags == 0
        # no flags set: copy each word into one output byte
        bytes = Vector{UInt8}(nchars)
        @inbounds copy!(bytes, 1, dat, 1, nchars)
        return UTF8String(bytes)
    end
    # output size: one byte per character plus the extra bytes of the wider sequences
    return encode_to_utf8(UInt32, dat, nchars + num2byte + num3byte*2 + num4byte*3)
end
62+
"""
Converts a `UTF32String` to a `UTF8String`

### Input Arguments:
* `::Type{UTF8String}`
* `str::UTF32String`

### Returns:
* `::UTF8String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, str::UTF32String)
    dat = reinterpret(UInt32, str.data)
    nwords = sizeof(dat) >>> 2
    # at most one word present: treat as the empty string
    if nwords <= 1
        return empty_utf8
    end
    # check every word except the last one and collect the per-width counts
    nchars, flags, num4byte, num3byte, num2byte = check_string(dat, nwords - 1)
    if flags == 0
        # no flags set: copy each word into one output byte
        bytes = Vector{UInt8}(nchars)
        @inbounds copy!(bytes, 1, dat, 1, nchars)
        return UTF8String(bytes)
    end
    # output size: one byte per character plus the extra bytes of the wider sequences
    return encode_to_utf8(UInt32, dat, nchars + num2byte + num3byte*2 + num4byte*3)
end
86+
"""
Converts a `UTF8String` to a `UTF32String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF8String`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF32String}, str::UTF8String)
    bytes = str.data
    # empty input short-circuits to the shared empty string
    if sizeof(bytes) == 0
        return empty_utf32
    end
    # validate the encoding and learn how many characters will be produced
    nchars, flags = check_string(bytes)
    # no flags set: take the straight-copy fast path
    if flags == 0
        @inbounds return fast_utf_copy(UTF32String, Char, nchars, bytes, true)
    end
    # slow path: decode multi-byte sequences one character at a time
    buf = Vector{Char}(nchars+1)
    @inbounds buf[nchars+1] = 0 # NULL termination
    local ch::UInt32, surr::UInt32
    out = 0
    pos = 0
    @inbounds while out < nchars
        ch = bytes[pos += 1]
        if ch <= 0x7f
            # single-byte (ASCII) character
            buf[out += 1] = ch
        elseif ch < 0xe0
            # two-byte sequence, range 0x80-0x7ff
            pos += 1
            buf[out += 1] = ((ch & 0x1f) << 6) | (bytes[pos] & 0x3f)
        elseif ch < 0xf0
            # three-byte sequence, range 0x800-0xffff
            pos += 2
            ch = get_utf8_3byte(bytes, pos, ch)
            # a lead surrogate encoded in three bytes is followed by its trailing
            # surrogate in the next three bytes; merge both into one character
            if is_surrogate_lead(ch)
                pos += 3
                surr = ((UInt32(bytes[pos-2] & 0xf) << 12)
                        | (UInt32(bytes[pos-1] & 0x3f) << 6)
                        | (bytes[pos] & 0x3f))
                ch = get_supplementary(ch, surr)
            end
            buf[out += 1] = ch
        else
            # four-byte sequence, range 0x10000-0x10ffff
            pos += 3
            buf[out += 1] = get_utf8_4byte(bytes, pos, ch)
        end
    end
    UTF32String(buf)
end
144+
"""
Converts a `UTF16String` to `UTF32String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF16String`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF32String}, str::UTF16String)
    words = str.data
    nbytes = sizeof(words)
    # two bytes or fewer means only the trailing \0 is present
    if nbytes <= 2
        return empty_utf32
    end
    # the checked range spans the whole data vector (nbytes>>>1 words)
    nchars, flags, num4byte = check_string(words, nbytes >>> 1)
    # no surrogate pairs flagged: every 16-bit word maps to exactly one character
    if (flags & UTF_UNICODE4) == 0
        @inbounds return UTF32String(copy!(Vector{Char}(nchars), words))
    end
    local ch::UInt32
    buf = Vector{Char}(nchars)
    out = 0
    pos = 0
    @inbounds while out < nchars
        ch = words[pos += 1]
        # a lead surrogate consumes the following word as well
        if is_surrogate_lead(ch)
            ch = get_supplementary(ch, words[pos += 1])
        end
        buf[out += 1] = ch
    end
    UTF32String(buf)
end
179+
"""
Converts a UTF-32 encoded vector of `UInt32` to a `UTF16String`

### Input Arguments:
* `::Type{UTF16String}`
* `dat::Vector{UInt32}` UTF-32 encoded words

### Returns:
* `::UTF16String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF16String}, dat::Vector{UInt32})
    nbytes = sizeof(dat)
    # inputs of at most one word are treated as empty
    if nbytes <= 4
        return empty_utf16
    end
    # check all nbytes>>>2 words; num4byte counts characters needing a surrogate pair
    nwords, flags, num4byte = check_string(dat, nbytes >>> 2)
    # one extra 16-bit word per surrogate pair, plus one more slot
    nwords += num4byte + 1
    # no surrogate pairs needed: take the straight-copy fast path
    if num4byte == 0
        @inbounds return fast_utf_copy(UTF16String, UInt16, nwords, dat)
    end
    return encode_to_utf16(dat, nwords)
end
204+
"""
Converts a `UTF32String` to `UTF16String`

### Input Arguments:
* `::Type{UTF16String}`
* `str::UTF32String`

### Returns:
* `::UTF16String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF16String}, str::UTF32String)
    dat = reinterpret(UInt32, str.data)
    nbytes = sizeof(dat)
    # a single word or less: return the shared empty string
    if nbytes <= 4
        return empty_utf16
    end
    # check all nbytes>>>2 words; num4byte counts characters needing a surrogate pair
    nwords, flags, num4byte = check_string(dat, nbytes >>> 2)
    # no surrogate pairs needed: narrow each word directly to 16 bits
    if num4byte == 0
        @inbounds return UTF16String(copy!(Vector{UInt16}(nwords), dat))
    end
    # each surrogate pair needs one additional 16-bit word
    return encode_to_utf16(dat, nwords + num4byte)
end
229+
"""
Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String`

### Input Arguments:
* `dat` UTF-32 encoded data (each element is widened to `UInt32`)
* `len` length of output in 16-bit words

### Returns:
* `::UTF16String`
"""
function encode_to_utf16(dat, len)
    buf = Vector{UInt16}(len)
    @inbounds buf[len] = 0 # NULL termination
    out = 0
    pos = 0
    nin = length(dat)
    # Stop when the output is full OR the input is exhausted. Without the second
    # condition, a `len` that reserves an extra terminator slot beyond what `dat`
    # produces makes the loop read past the end of `dat` (bounds checks are off)
    # and overwrite the terminator written above.
    @inbounds while out < len && pos < nin
        ch = UInt32(dat[pos += 1])
        if ch > 0xffff
            # Output surrogate pair for 0x10000-0x10ffff
            buf[out += 1] = 0xd7c0 + (ch >>> 10)
            ch = 0xdc00 + (ch & 0x3ff)
        end
        buf[out += 1] = ch
    end
    UTF16String(buf)
end
256+
# A `Vector{Char}` is reinterpreted as `UInt32` words and handed to the UInt32 method.
function convert(::Type{UTF8String}, dat::Vector{Char})
    return convert(UTF8String, reinterpret(UInt32, dat))
end

# Same approach for UTF-16 output.
function convert(::Type{UTF16String}, dat::Vector{Char})
    return convert(UTF16String, reinterpret(UInt32, dat))
end

# A single character becomes a two-element vector: the character followed by a zero.
function convert(::Type{UTF32String}, c::Char)
    return UTF32String(Char[c, Char(0)])
end
261+
# Convert an ASCIIString by handing its data vector and length to fast_utf_copy.
function convert(::Type{UTF32String}, str::ASCIIString)
    bytes = str.data
    nchars = length(bytes)
    @inbounds return fast_utf_copy(UTF32String, Char, nchars, bytes, true)
end
266+
# Build a UTF32String from a vector of characters.
#
# This signature was previously defined three times in a row; only the last
# definition took effect, so that behavior is kept here as the single definition:
# copy the characters into a buffer one element longer and zero the final slot.
function convert(::Type{UTF32String}, data::AbstractVector{Char})
    len = length(data)
    buf = Vector{Char}(len+1)
    @inbounds copy!(buf, 1, data, 1, len)
    @inbounds buf[len+1] = 0 # NULL termination
    return UTF32String(buf)
end
28280
29281convert {T<:Union{Int32,UInt32}} (:: Type{UTF32String} , data:: AbstractVector{T} ) =
@@ -46,12 +298,11 @@ convert(::Type{Array{Char}}, str::UTF32String) = str.data
46298
# Return a new string whose first `length(s)` elements are reversed;
# the last element of the copied data vector is left where it is.
function reverse(s::UTF32String)
    chars = copy(s.data)
    reverse!(chars, 1, length(s))
    return UTF32String(chars)
end
48300
49- sizeof (s:: UTF32String ) = sizeof (s. data) - sizeof (Char)
# Obtain a raw pointer to the string, typed as any of the 32-bit element types.
function unsafe_convert{T<:Union{Int32,UInt32,Char}}(::Type{Ptr{T}}, s::UTF32String)
    return convert(Ptr{T}, pointer(s))
end
52303
53304function convert (T:: Type{UTF32String} , bytes:: AbstractArray{UInt8} )
54- isempty (bytes) && return UTF32String (Char[ 0 ])
305+ isempty (bytes) && return empty_utf32
55306 length (bytes) & 3 != 0 && throw (UnicodeError (UTF_ERR_ODD_BYTES_32,0 ,0 ))
56307 data = reinterpret (Char, bytes)
57308 # check for byte-order mark (BOM):
@@ -79,6 +330,8 @@ function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
79330end
# A bare `Vector{Char}` is validated with the UTF32String method.
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end
81332
# Generic entry point: delegate to `convert`.
function utf32(x)
    return convert(UTF32String, x)
end

# Pointer plus explicit length: wrap the memory as an array and convert that.
function utf32(p::Ptr{Char}, len::Integer)
    return utf32(pointer_to_array(p, len))
end

# Integer-typed pointers are first converted to `Ptr{Char}`.
function utf32(p::Union{Ptr{UInt32}, Ptr{Int32}}, len::Integer)
    return utf32(convert(Ptr{Char}, p), len)
end
84337function utf32 (p:: Union{Ptr{Char}, Ptr{UInt32}, Ptr{Int32}} )
0 commit comments