# This file is a part of Julia. License is MIT: http://julialang.org/license
22
3- # UTF-32 basic functions
# Iteration step: return the character stored at index `i` together with the
# index that follows it.
function next(s::UTF32String, i::Int)
    c = Char(s.data[i])
    return (c, i + 1)
end
# Last valid index of the string: one less than the number of stored words,
# because the final word of `data` is not counted as part of the string.
function endof(s::UTF32String)
    nwords = length(s.data)
    return nwords - 1
end
# Number of characters: every word of `data` except the final one.
function length(s::UTF32String)
    nwords = length(s.data)
    return nwords - 1
end
7-
# Return a new UTF32String with the characters in reverse order. Only positions
# 1:length(s) of the copied data are reversed, so the last word stays in place.
function reverse(s::UTF32String)
    flipped = copy(s.data)
    reverse!(flipped, 1, length(s))
    return UTF32String(flipped)
end
9-
# Size in bytes of the string contents: the byte size of `data` minus one
# UInt32 word.
function sizeof(s::UTF32String)
    total = sizeof(s.data)
    return total - sizeof(UInt32)
end
11-
# Shared empty UTF-32 string: its data holds a single 0 word and nothing else.
const empty_utf32 = UTF32String(zeros(UInt32, 1))
13-
# Build a one-character string: the character's code point followed by a 0 word.
function convert(::Type{UTF32String}, c::Char)
    words = UInt32[c, 0]
    return UTF32String(words)
end
# Converting a UTF32String to its own type returns the same object (no copy).
function convert(::Type{UTF32String}, s::UTF32String)
    return s
end
16-
# Convert any AbstractString to a UTF32String by storing each character as one
# UInt32 word, followed by a terminating 0 word.
#
# `unsafe_checkstring` is defined elsewhere; its first return value is used as
# the number of characters to allocate for (presumably it also validates the
# input -- confirm against its definition).
function convert(::Type{UTF32String}, str::AbstractString)
    nchars, flags = unsafe_checkstring(str)
    words = Vector{UInt32}(nchars + 1)
    idx = 0
    @inbounds for c in str
        idx += 1
        words[idx] = c
    end
    # NULL termination goes right after the last character written
    @inbounds words[idx + 1] = 0
    return UTF32String(words)
end
25-
# Convert a UTF32String to a (UTF-8) String.
#
# Returns `empty_utf8` when the data holds at most one word. Otherwise the words
# before the last one are passed to `unsafe_checkstring` (defined elsewhere).
# When the returned flags are 0 the words are copied one-to-one into bytes; in
# every other case `encode_to_utf8` is called with the computed output size.
function convert(::Type{String}, str::UTF32String)
    words = str.data
    nwords = sizeof(words) >>> 2    # byte size / 4 = number of UInt32 words
    # handle zero length string quickly
    if nwords <= 1
        return empty_utf8
    end
    # get number of bytes to allocate
    nchars, flags, n4, n3, n2 = unsafe_checkstring(words, 1, nwords - 1)
    if flags == 0
        bytes = Vector{UInt8}(nchars)
        @inbounds return String(copy!(bytes, 1, words, 1, nchars))
    end
    # one byte per character, plus the extra bytes of each multi-byte sequence
    nbytes = nchars + n2 + n3*2 + n4*3
    return encode_to_utf8(UInt32, words, nbytes)
end
36-
# Convert a (UTF-8) String to a UTF32String.
#
# Returns `empty_utf32` for zero-size input. `unsafe_checkstring` (defined
# elsewhere) supplies the number of output words and a flags value; when the
# flags are 0 the bytes are widened via `fast_utf_copy`. Otherwise each UTF-8
# sequence is decoded by hand into one UInt32 word, and a 0 word terminates
# the buffer.
function convert(::Type{UTF32String}, str::String)
    bytes = str.data
    # handle zero length string quickly
    if sizeof(bytes) == 0
        return empty_utf32
    end
    # Validate UTF-8 encoding, and get number of words to create
    nchars, flags = unsafe_checkstring(bytes)
    # Optimize case where no characters > 0x7f
    if flags == 0
        @inbounds return fast_utf_copy(UTF32String, UInt32, nchars, bytes, true)
    end
    # has multi-byte UTF-8 sequences
    words = Vector{UInt32}(nchars + 1)
    @inbounds words[nchars + 1] = 0 # NULL termination
    local ch::UInt32, surr::UInt32
    written = 0
    pos = 0
    @inbounds while written < nchars
        pos += 1
        ch = bytes[pos]
        if ch <= 0x7f
            # ASCII character: the lead byte is the code point itself
        elseif ch < 0xe0
            # Range 0x80-0x7ff: two-byte sequence
            pos += 1
            ch = ((ch & 0x1f) << 6) | (bytes[pos] & 0x3f)
        elseif ch < 0xf0
            # Range 0x800-0xffff: three-byte sequence
            pos += 2
            ch = get_utf8_3byte(bytes, pos, ch)
            # Handle surrogate pairs (should have been encoded in 4 bytes)
            if is_surrogate_lead(ch)
                # Build up 32-bit character from ch and the trailing
                # surrogate held in the next 3 bytes
                pos += 3
                hi = UInt32(bytes[pos-2] & 0xf) << 12
                mid = UInt32(bytes[pos-1] & 0x3f) << 6
                lo = bytes[pos] & 0x3f
                surr = hi | mid | lo
                ch = get_supplementary(ch, surr)
            end
        else
            # Range 0x10000-0x10ffff: four-byte sequence
            pos += 3
            ch = get_utf8_4byte(bytes, pos, ch)
        end
        written += 1
        words[written] = ch
    end
    return UTF32String(words)
end
81-
# Convert a UTF16String to a UTF32String.
#
# Returns `empty_utf32` when the data is at most 2 bytes. When the flags from
# `unsafe_checkstring` (defined elsewhere) do not include `UTF_UNICODE4`, the
# 16-bit words are copied straight across; otherwise each surrogate pair is
# merged into one word via `get_supplementary`.
function convert(::Type{UTF32String}, str::UTF16String)
    units = str.data
    nbytes = sizeof(units)
    # handle zero length string quickly (account for trailing \0)
    if nbytes <= 2
        return empty_utf32
    end
    # get number of words to create
    nwords, flags, num4byte = unsafe_checkstring(units, 1, nbytes >>> 1)
    # No surrogate pairs, do optimized copy
    if (flags & UTF_UNICODE4) == 0
        @inbounds return UTF32String(copy!(Vector{Char}(nwords), units))
    end
    local ch::UInt32
    words = Vector{UInt32}(nwords)
    written = 0
    pos = 0
    @inbounds while written < nwords
        pos += 1
        ch = units[pos]
        # check for surrogate pair: consume the following unit as well
        if is_surrogate_lead(ch)
            pos += 1
            ch = get_supplementary(ch, units[pos])
        end
        written += 1
        words[written] = ch
    end
    return UTF32String(words)
end
103-
# Convert a UTF32String to a UTF16String.
#
# Returns `empty_utf16` when the data is at most 4 bytes. When
# `unsafe_checkstring` (defined elsewhere) reports no 4-byte characters the
# words are narrowed by a plain copy; otherwise `encode_to_utf16` is called with
# one extra output unit per 4-byte character.
function convert(::Type{UTF16String}, str::UTF32String)
    words = str.data
    nbytes = sizeof(words)
    # handle zero length string quickly
    if nbytes <= 4
        return empty_utf16
    end
    # get number of words to allocate
    nunits, flags, num4byte = unsafe_checkstring(words, 1, nbytes >>> 2)
    # optimized path, no surrogates
    if num4byte == 0
        @inbounds return UTF16String(copy!(Vector{UInt16}(nunits), words))
    end
    return encode_to_utf16(words, nunits + num4byte)
end
115-
# Build a UTF32String from a vector of UInt32 words by handing all of them to
# `fast_utf_copy` (defined elsewhere).
function convert(::Type{UTF32String}, dat::AbstractVector{UInt32})
    nwords = length(dat)
    @inbounds return fast_utf_copy(UTF32String, UInt32, nwords, dat, true)
end
119-
# Build a UTF32String from a vector of Int32 code units.
#
# The input is first materialized as a dense `Vector{Int32}` so it can be
# reinterpreted as UInt32 words, then forwarded to the UInt32 conversion.
# (The element type is spelled out because this method has no type parameter
# `T`; referring to `Vector{T}` here fails when the method is called.)
function convert(::Type{UTF32String}, data::AbstractVector{Int32})
    dense = convert(Vector{Int32}, data)
    return convert(UTF32String, reinterpret(UInt32, dense))
end
122-
# Build a UTF32String from a vector of Chars by mapping each one to UInt32 and
# forwarding to the UInt32 conversion.
function convert(::Type{UTF32String}, data::AbstractVector{Char})
    codepoints = map(UInt32, data)
    return convert(UTF32String, codepoints)
end
125-
# Convert a vector of UInt32/Char/Int32 code units to any string type `T` by
# first building a UTF-32 string with `utf32` and then converting that to `T`.
function convert{T<:AbstractString, S<:Union{UInt32,Char,Int32}}(::Type{T}, v::AbstractVector{S})
    wide = utf32(v)
    return convert(T, wide)
end
128-
1293# specialize for performance reasons:
1304function convert {T<:Union{UInt32,Char,Int32}} (:: Type{String} , data:: AbstractVector{T} )
1315 s = IOBuffer (Array (UInt8,length (data)), true , true )
@@ -136,98 +10,20 @@ function convert{T<:Union{UInt32,Char,Int32}}(::Type{String}, data::AbstractVect
13610 convert (String, takebuf_string (s))
13711end
13812
# Converting to a UInt32 array returns the string's underlying `data` vector
# itself (no copy is made).
function convert(::Type{Vector{UInt32}}, str::UTF32String)
    return str.data
end

function convert(::Type{Array{UInt32}}, str::UTF32String)
    return str.data
end
141-
# Produce a `Ptr{T}` (T being UInt32, Int32 or Char) by converting the pointer
# returned by `pointer(s)`.
function unsafe_convert{T<:Union{UInt32,Int32,Char}}(::Type{Ptr{T}}, s::UTF32String)
    raw = pointer(s)
    return convert(Ptr{T}, raw)
end
144-
# Build a UTF32String from raw bytes.
#
# Empty input yields `empty_utf32`. A byte count that is not a multiple of 4
# throws `UnicodeError(UTF_ERR_ODD_BYTES_32, 0, 0)`. The bytes are reinterpreted
# as UInt32 words; a leading byte-order mark is dropped (byte-swapping every
# remaining word when the mark is reversed), and a 0 word terminates the result.
function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
    isempty(bytes) && return empty_utf32
    if length(bytes) & 3 != 0
        throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
    end
    data = reinterpret(UInt32, bytes)
    n = length(data)
    # check for byte-order mark (BOM):
    bom = data[1]
    if bom == 0x0000feff # native byte order
        # the BOM slot is reused for the terminator, so the length is unchanged
        d = Array(UInt32, n)
        copy!(d, 1, data, 2, n - 1)
    elseif bom == 0xfffe0000 # byte-swapped
        d = Array(UInt32, n)
        for k = 1:(n - 1)
            @inbounds d[k] = bswap(data[k + 1])
        end
    else
        # no BOM: assume native byte order and leave room for the terminator
        d = Array(UInt32, n + 1)
        copy!(d, 1, data, 1, n)
    end
    d[end] = 0 # NULL terminate
    return UTF32String(d)
end
165-
# Return true when every element of `str`, taken as a UInt32, satisfies
# `isvalid(Char, ...)`; return false at the first element that does not.
function isvalid(::Type{UTF32String}, str::Union{Vector{UInt32}, Vector{Char}})
    for idx = 1:length(str)
        @inbounds word = UInt32(str[idx])
        isvalid(Char, word) || return false
    end
    return true
end
# A bare vector of Chars is validated with the UTF32String check.
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end
173-
# Generic entry point: convert `x` to a UTF32String.
function utf32(x)
    return convert(UTF32String, x)
end
175-
# Build a UTF32String from `len` words starting at pointer `p`.
function utf32(p::Ptr{UInt32}, len::Integer)
    words = pointer_to_array(p, len)
    return utf32(words)
end

# Char and Int32 pointers are converted to Ptr{UInt32} and forwarded.
function utf32(p::Union{Ptr{Char}, Ptr{Int32}}, len::Integer)
    p32 = convert(Ptr{UInt32}, p)
    return utf32(p32, len)
end
# Build a UTF32String from a pointer with no explicit length: count elements
# until the first one equal to 0, then forward to the (pointer, length) method.
function utf32(p::Union{Ptr{UInt32}, Ptr{Char}, Ptr{Int32}})
    n = 0
    while true
        unsafe_load(p, n + 1) != 0 || break
        n += 1
    end
    return utf32(p, n)
end
183-
# Apply `f` to each character of `s` and collect the results in a new
# UTF32String. Throws `UnicodeError(UTF_ERR_MAP_CHAR, 0, 0)` when `f` returns
# anything that is not a Char.
function map(f, s::UTF32String)
    src = s.data
    dest = similar(src)
    dest[end] = 0    # the final word of the result is the 0 terminator

    # every word except the last is passed through `f`
    @inbounds for k = 1:(length(src)-1)
        mapped = f(Char(src[k]))
        isa(mapped, Char) || throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
        dest[k] = (mapped::Char)
    end
    return UTF32String(dest)
end
198-
# Definitions for C-compatible strings, which do not allow embedded
# '\0' characters and which are terminated by a '\0'
20115
# Generic fallback: true when the string contains a '\0' character.
function containsnul(s::AbstractString)
    return '\0' in s
end
# String specialization: forward the string's Cchar pointer and byte size to
# the two-argument `containsnul` method (defined elsewhere).
function containsnul(s::String)
    p = unsafe_convert(Ptr{Cchar}, s)
    return containsnul(p, sizeof(s))
end
# Wide strings: true when the first 0 found in `data` is not at the last
# position of `data`.
function containsnul(s::Union{UTF16String,UTF32String})
    units = s.data
    return findfirst(units, 0) != length(units)
end
205-
# Choose the wide-string type and constructor that match the size of Cwchar_t:
# UTF-16 for a 2-byte wchar_t, UTF-32 for a 4-byte one. Neither name is defined
# for any other size.
if sizeof(Cwchar_t) == 2
    const WString = UTF16String
    const wstring = utf16
end
if sizeof(Cwchar_t) == 4
    const WString = UTF32String
    const wstring = utf32
end
# Build a wide string from a Cwstring by converting it to a Cwchar_t pointer.
function wstring(s::Cwstring)
    p = convert(Ptr{Cwchar_t}, s)
    return wstring(p)
end
214-
# Cwstring is defined in c.jl, but this conversion has to live here because it
# needs WString.
#
# Throws ArgumentError when the string contains an embedded NUL; otherwise
# wraps the string's Cwchar_t pointer in a Cwstring.
function unsafe_convert(::Type{Cwstring}, s::WString)
    containsnul(s) &&
        throw(ArgumentError("embedded NUL chars are not allowed in C strings: $(repr(s))"))
    return Cwstring(unsafe_convert(Ptr{Cwchar_t}, s))
end
# UTF-16 strings: true when the first 0 found in `data` is not at the last
# position of `data`.
function containsnul(s::Union{UTF16String})
    units = s.data
    return findfirst(units, 0) != length(units)
end
22319
22420# pointer conversions of ASCII/UTF8/UTF16/UTF32 strings:
# Pointer to the start of a string is the pointer to its `data` array.
function pointer(x::Union{String,UTF16String,UTF32String})
    return pointer(x.data)
end

function pointer(x::Union{String,UTF16String})
    return pointer(x.data)
end
# Pointer to index `i` of a String: start of `data` advanced by i-1.
function pointer(x::String, i::Integer)
    start = pointer(x.data)
    return start + (i - 1)
end
# Pointer to index `i` of a wide string: start pointer advanced by (i-1)
# elements, each `sizeof(eltype(x.data))` bytes wide.
function pointer(x::Union{UTF16String,UTF32String}, i::Integer)
    width = sizeof(eltype(x.data))
    return pointer(x) + (i - 1)*width
end

function pointer(x::Union{UTF16String}, i::Integer)
    width = sizeof(eltype(x.data))
    return pointer(x) + (i - 1)*width
end
22824
22925# pointer conversions of SubString of ASCII/UTF8/UTF16/UTF32:
# Pointer to the start of a SubString{String}: the parent's data pointer
# advanced by the substring's offset.
function pointer(x::SubString{String})
    parent_start = pointer(x.string.data)
    return parent_start + x.offset
end

# Pointer to index `i` of a SubString{String}: the start computed as above,
# advanced by a further i-1.
function pointer(x::SubString{String}, i::Integer)
    start = pointer(x.string.data) + x.offset
    return start + (i - 1)
end
# Pointer to the start of a wide-string SubString: the parent's data pointer
# advanced by `offset` elements of the parent's element size.
function pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T})
    width = sizeof(eltype(x.string.data))
    return pointer(x.string.data) + x.offset*width
end

# Pointer to index `i` of a wide-string SubString: the parent's data pointer
# advanced by offset + (i-1) elements.
function pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}, i::Integer)
    width = sizeof(eltype(x.string.data))
    return pointer(x.string.data) + (x.offset + (i - 1))*width
end
# Pointer to the start of a SubString{UTF16String}: the parent's data pointer
# advanced by `offset` elements of the parent's element size.
function pointer(x::SubString{UTF16String})
    width = sizeof(eltype(x.string.data))
    return pointer(x.string.data) + x.offset*width
end

# Pointer to index `i` of a SubString{UTF16String}: the parent's data pointer
# advanced by offset + (i-1) elements.
function pointer(x::SubString{UTF16String}, i::Integer)
    width = sizeof(eltype(x.string.data))
    return pointer(x.string.data) + (x.offset + (i - 1))*width
end
0 commit comments