@@ -5,25 +5,169 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
# `data` holds the characters followed by one NUL terminator, so both the last valid
# index and the character count are one less than the length of the buffer.
endof(s::UTF32String) = length(s.data) - 1
length(s::UTF32String) = length(s.data) - 1

# NOTE: `reverse(s::UTF32String)` is defined once, further down in this file. The
# identical definition that also stood here merely overwrote/was overwritten by it.

# Size in bytes of the string contents, not counting the trailing NUL terminator.
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)

# Shared empty string: the buffer contains nothing but the NUL terminator.
const empty_utf32 = UTF32String(UInt32[0])
13+
# Convenience entry point: hand anything to `convert` and ask for a `UTF32String`.
function utf32(x)
    return convert(UTF32String, x)
end

# A single character becomes a two-element buffer: the character, then the NUL terminator.
function convert(::Type{UTF32String}, c::Char)
    return UTF32String(Char[c, Char(0)])
end

# Already the requested type: the very same object is returned, no copy is made.
function convert(::Type{UTF32String}, s::UTF32String)
    return s
end
1117
12- function convert (:: Type{UTF32String} , s:: AbstractString )
13- a = Array (Char, length (s) + 1 )
14- i = 0
15- for c in s
16- a[i += 1 ] = c
"""
    convert(::Type{UTF32String}, str::AbstractString)

Build a `UTF32String` from any `AbstractString`, storing one character per element
followed by a NUL terminator.

### Returns:
*   `UTF32String`

### Throws:
*   `UnicodeError` (presumably raised by `unsafe_checkstring` on invalid input)
"""
function convert(::Type{UTF32String}, str::AbstractString)
    # The check also reports how many characters the result has to hold.
    nchars, flags = unsafe_checkstring(str)
    # One extra slot for the NUL terminator.
    chars = Vector{Char}(nchars + 1)
    filled = 0
    @inbounds for c in str
        filled += 1
        chars[filled] = c
    end
    # The terminator goes directly after the last character written.
    @inbounds chars[filled + 1] = 0
    return UTF32String(chars)
end
35+
"""
    convert(::Type{UTF8String}, str::UTF32String)

Re-encode a `UTF32String` as a `UTF8String`.

### Returns:
*   `UTF8String`

### Throws:
*   `UnicodeError` (presumably raised by `unsafe_checkstring` on invalid input)
"""
function convert(::Type{UTF8String}, str::UTF32String)
    words = reinterpret(UInt32, str.data)
    nwords = sizeof(words) >>> 2        # 4 bytes per word
    # Nothing beyond the NUL terminator: answer with the shared empty string.
    if nwords <= 1
        return empty_utf8
    end
    # Check everything except the trailing NUL; the extra counts say how many
    # characters need 2, 3 and 4 bytes once encoded.
    nchars, flags, num4byte, num3byte, num2byte = unsafe_checkstring(words, 1, nwords - 1)
    if flags == 0
        # No flags set: each word is copied straight into a single byte.
        @inbounds return UTF8String(copy!(Vector{UInt8}(nchars), 1, words, 1, nchars))
    end
    # One byte per character, plus 1, 2 or 3 more for the wider encodings.
    nbytes = nchars + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt32, words, nbytes)
end
55+
"""
    convert(::Type{UTF32String}, str::UTF8String)

Decode a `UTF8String` into a `UTF32String`, one character per element followed by a
NUL terminator. A surrogate pair that was encoded as two 3-byte sequences is combined
into a single supplementary character.

### Returns:
*   `UTF32String`

### Throws:
*   `UnicodeError` (presumably raised by `unsafe_checkstring` on invalid input)
"""
function convert(::Type{UTF32String}, str::UTF8String)
    dat = str.data
    # handle zero length string quickly
    sizeof(dat) == 0 && return empty_utf32
    # Validate UTF-8 encoding, and get number of words to create
    len, flags = unsafe_checkstring(dat)
    # Optimize case where no characters > 0x7f
    flags == 0 && @inbounds return fast_utf_copy(UTF32String, Char, len, dat, true)
    # has multi-byte UTF-8 sequences
    buf = Vector{Char}(len+1)
    @inbounds buf[len+1] = 0 # NULL termination
    local ch::UInt32, surr::UInt32
    out = 0     # number of characters written to `buf`
    pos = 0     # index of the last byte consumed from `dat`
    # The loop is driven by the output count; the validation above is what makes
    # the unchecked reads of `dat` safe.
    @inbounds while out < len
        ch = dat[pos += 1]
        # Handle ASCII characters
        if ch <= 0x7f
            buf[out += 1] = ch
        # Handle range 0x80-0x7ff
        elseif ch < 0xe0
            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
        # Handle range 0x800-0xffff
        elseif ch < 0xf0
            pos += 2
            ch = get_utf8_3byte(dat, pos, ch)
            # Handle surrogate pairs (should have been encoded in 4 bytes)
            if is_surrogate_lead(ch)
                # Build up 32-bit character from ch and trailing surrogate in next 3 bytes
                pos += 3
                surr = ((UInt32(dat[pos-2] & 0xf) << 12)
                        | (UInt32(dat[pos-1] & 0x3f) << 6)
                        | (dat[pos] & 0x3f))
                ch = get_supplementary(ch, surr)
            end
            buf[out += 1] = ch
        # Handle range 0x10000-0x10ffff
        else
            pos += 3
            buf[out += 1] = get_utf8_4byte(dat, pos, ch)
        end
    end
    return UTF32String(buf)
end
21109
22- function convert (:: Type{UTF32String} , data:: AbstractVector{Char} )
23- len = length (data)
24- d = Array (Char, len + 1 )
25- d[end ] = Char (0 ) # NULL terminate
26- UTF32String (copy! (d,1 , data,1 , len))
"""
    convert(::Type{UTF32String}, str::UTF16String)

Widen a `UTF16String` into a `UTF32String`, combining each surrogate pair into a
single character.

### Returns:
*   `UTF32String`

### Throws:
*   `UnicodeError` (presumably raised by `unsafe_checkstring` on invalid input)
"""
function convert(::Type{UTF32String}, str::UTF16String)
    units = str.data
    nbytes = sizeof(units)
    # Two bytes or fewer means nothing but the trailing \0 is present.
    if nbytes <= 2
        return empty_utf32
    end
    # The checked range is the whole of `data`, so the returned count presumably already
    # includes the trailing \0; no separate terminator slot is allocated below.
    nwords, flags, num4byte = unsafe_checkstring(units, 1, nbytes >>> 1)
    if (flags & UTF_UNICODE4) == 0
        # No surrogate pairs: every 16-bit unit widens directly to one character.
        @inbounds return UTF32String(copy!(Vector{Char}(nwords), units))
    end
    local cp::UInt32
    chars = Vector{Char}(nwords)
    src = 0     # last unit consumed from `units`
    dst = 0     # last slot filled in `chars`
    @inbounds while dst < nwords
        src += 1
        cp = units[src]
        if is_surrogate_lead(cp)
            # A lead surrogate is combined with the unit that follows it.
            src += 1
            cp = get_supplementary(cp, units[src])
        end
        dst += 1
        chars[dst] = cp
    end
    return UTF32String(chars)
end
140+
"""
    convert(::Type{UTF16String}, str::UTF32String)

Re-encode a `UTF32String` as a `UTF16String`.

### Returns:
*   `UTF16String`

### Throws:
*   `UnicodeError` (presumably raised by `unsafe_checkstring` on invalid input)
"""
function convert(::Type{UTF16String}, str::UTF32String)
    words = reinterpret(UInt32, str.data)
    nbytes = sizeof(words)
    # Four bytes or fewer means nothing beyond a single word: shared empty string.
    if nbytes <= 4
        return empty_utf16
    end
    # Check the whole buffer; `num4byte` counts the characters that will need two
    # 16-bit units in the output.
    nunits, flags, num4byte = unsafe_checkstring(words, 1, nbytes >>> 2)
    if num4byte == 0
        # No such characters: each word is copied into a single 16-bit unit.
        @inbounds return UTF16String(copy!(Vector{UInt16}(nunits), words))
    end
    # One extra unit for every character counted in `num4byte`.
    return encode_to_utf16(words, nunits + num4byte)
end
161+
# NOTE: `convert(::Type{UTF32String}, c::Char)` is defined once, earlier in this file.
# The byte-identical second definition that stood here only overwrote that method.

# ASCII input skips the `unsafe_checkstring` pass used by the other string
# conversions and goes straight to the bulk copy. The final `true` presumably asks
# `fast_utf_copy` to append the NUL terminator (verify against its definition).
function convert(::Type{UTF32String}, str::ASCIIString)
    dat = str.data
    @inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true)
end

# A vector of characters is copied into a fresh string through the same bulk-copy
# helper; no validation of the characters is performed here.
function convert(::Type{UTF32String}, dat::AbstractVector{Char})
    @inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true)
end
28172
29173convert {T<:Union{Int32,UInt32}} (:: Type{UTF32String} , data:: AbstractVector{T} ) =
@@ -46,12 +190,11 @@ convert(::Type{Array{Char}}, str::UTF32String) = str.data
46190
# Return a new string holding the characters of `s` in reverse order. Only positions
# `1:length(s)` of the copied buffer are reversed, so its final element stays in place.
function reverse(s::UTF32String)
    buf = copy(s.data)
    reverse!(buf, 1, length(s))
    return UTF32String(buf)
end
48192
49- sizeof (s:: UTF32String ) = sizeof (s. data) - sizeof (Char)
# Expose the string's buffer as a raw pointer to any of the three 32-bit element types.
function unsafe_convert{T<:Union{Int32,UInt32,Char}}(::Type{Ptr{T}}, s::UTF32String)
    p = pointer(s)
    return convert(Ptr{T}, p)
end
52195
53196function convert (T:: Type{UTF32String} , bytes:: AbstractArray{UInt8} )
54- isempty (bytes) && return UTF32String (Char[ 0 ])
197+ isempty (bytes) && return empty_utf32
55198 length (bytes) & 3 != 0 && throw (UnicodeError (UTF_ERR_ODD_BYTES_32,0 ,0 ))
56199 data = reinterpret (Char, bytes)
57200 # check for byte-order mark (BOM):
@@ -79,6 +222,8 @@ function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
79222end
# A bare `Vector{Char}` is validated as if it were the contents of a `UTF32String`.
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end
81224
# NOTE: the generic `utf32(x) = convert(UTF32String, x)` is defined once, earlier in this
# file. The identical second definition that stood here only overwrote that method.

# Pointer-plus-length forms: wrap `len` elements at `p` as an array and convert that.
# The `UInt32`/`Int32` pointer form first reinterprets the pointer as `Ptr{Char}`.
utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
utf32(p::Union{Ptr{UInt32}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{Char}, p), len)
84229function utf32 (p:: Union{Ptr{Char}, Ptr{UInt32}, Ptr{Int32}} )
0 commit comments