Skip to content

Commit 9c1d0c8

Browse files
highlander: excise UTF-32
1 parent 7d1d72d commit 9c1d0c8

File tree

8 files changed

+9
-276
lines changed

8 files changed

+9
-276
lines changed

base/c.jl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ pointer_to_string(p::Cstring, own::Bool=false) = pointer_to_string(convert(Ptr{U
8080

8181
# convert strings to String etc. to pass as pointers
8282
cconvert(::Type{Cstring}, s::AbstractString) = bytestring(s)
83-
cconvert(::Type{Cwstring}, s::AbstractString) = wstring(s)
8483

8584
containsnul(p::Ptr, len) = C_NULL != ccall(:memchr, Ptr{Cchar}, (Ptr{Cchar}, Cint, Csize_t), p, 0, len)
8685
function unsafe_convert(::Type{Cstring}, s::String)
@@ -94,8 +93,6 @@ end
9493
# symbols are guaranteed not to contain embedded NUL
9594
convert(::Type{Cstring}, s::Symbol) = Cstring(unsafe_convert(Ptr{Cchar}, s))
9695

97-
# in string.jl: unsafe_convert(::Type{Cwstring}, s::WString)
98-
9996
# deferring (or un-deferring) ctrl-c handler for external C code that
10097
# is not interrupt safe (see also issue #2622). The sigatomic_begin/end
10198
# functions should always be called in matched pairs, ideally via:

base/deprecated.jl

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -501,14 +501,9 @@ export float32_isvalid, float64_isvalid
501501
@deprecate is_valid_char(ch::Char) isvalid(ch)
502502
@deprecate is_valid_utf8(str::String) isvalid(str)
503503
@deprecate is_valid_utf16(str::UTF16String) isvalid(str)
504-
@deprecate is_valid_utf32(str::UTF32String) isvalid(str)
505504
@deprecate is_valid_char(ch) isvalid(Char, ch)
506505
@deprecate is_valid_utf8(str) isvalid(String, str)
507506
@deprecate is_valid_utf16(str) isvalid(UTF16String, str)
508-
@deprecate is_valid_utf32(str) isvalid(UTF32String, str)
509-
510-
# 11379
511-
@deprecate utf32(c::Integer...) UTF32String(UInt32[c...,0])
512507

513508
# 12087
514509
@deprecate call(P::Base.DFT.Plan, A) P * A

base/docs/helpdb/Base.jl

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -94,32 +94,6 @@ Get the step size of a [`Range`](:obj:`Range`) object.
9494
"""
9595
step
9696

97-
"""
98-
utf32(s)
99-
100-
Create a UTF-32 string from a byte array, array of `Char` or `UInt32`, or any other string
101-
type. (Conversions of byte arrays check for a byte-order marker in the first four bytes, and
102-
do not include it in the resulting string.)
103-
104-
Note that the resulting `UTF32String` data is terminated by the NUL codepoint (32-bit zero),
105-
which is not treated as a character in the string (so that it is mostly invisible in Julia);
106-
this allows the string to be passed directly to external functions requiring NUL-terminated
107-
data. This NUL is appended automatically by the `utf32(s)` conversion function. If you have
108-
a `Char` or `UInt32` array `A` that is already NUL-terminated UTF-32 data, then you can
109-
instead use `UTF32String(A)` to construct the string without making a copy of the data and
110-
treating the NUL as a terminator rather than as part of the string.
111-
"""
112-
utf32(s)
113-
114-
"""
115-
utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}} [, length])
116-
117-
Create a string from the address of a NUL-terminated UTF-32 string. A copy is made; the
118-
pointer can be safely freed. If `length` is specified, the string does not have to be
119-
NUL-terminated.
120-
"""
121-
utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}}, length=?)
122-
12397
"""
12498
takebuf_array(b::IOBuffer)
12599
@@ -2962,15 +2936,6 @@ Equivalent to `writedlm` with `delim` set to comma.
29622936
"""
29632937
writecsv
29642938

2965-
"""
2966-
wstring(s)
2967-
2968-
This is a synonym for either `utf32(s)` or `utf16(s)`, depending on whether `Cwchar_t` is 32
2969-
or 16 bits, respectively. The synonym `WString` for `UTF32String` or `UTF16String` is also
2970-
provided.
2971-
"""
2972-
wstring
2973-
29742939
"""
29752940
withenv(f::Function, kv::Pair...)
29762941

base/exports.jl

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,12 @@ export
118118
UnitRange,
119119
UpperTriangular,
120120
UTF16String,
121-
UTF32String,
122121
Val,
123122
VecOrMat,
124123
Vector,
125124
VersionNumber,
126125
WeakKeyDict,
127126
WorkerConfig,
128-
WString,
129127
Zip,
130128

131129
# Ccall types
@@ -894,9 +892,7 @@ export
894892
uppercase,
895893
utf8,
896894
utf16,
897-
utf32,
898895
warn,
899-
wstring,
900896
xdump,
901897

902898
# random numbers

base/serialize.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ const TAGS = Any[
2121
LineNumberNode, SymbolNode, LabelNode, GotoNode,
2222
QuoteNode, TopNode, TypeVar, Box, LambdaStaticData,
2323
Module, #=UndefRefTag=#Symbol, Task, String,
24-
UTF16String, UTF32String, Float16,
24+
UTF16String, Float16,
2525
SimpleVector, #=BackrefTag=#Symbol, :reserved11, :reserved12,
2626

2727
(), Bool, Any, :Any, Bottom, :reserved21, :reserved22, Type,

base/unicode/types.jl

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,5 @@ immutable UTF16String <: AbstractString
1414
end
1515
end
1616

17-
##\brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order
18-
#
19-
# \throws UnicodeError
20-
21-
immutable UTF32String <: DirectIndexString
22-
data::Vector{UInt32} # includes 32-bit NULL termination after string chars
23-
24-
function UTF32String(data::Vector{UInt32})
25-
if length(data) < 1 || data[end] != 0
26-
throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0))
27-
end
28-
new(data)
29-
end
30-
end
31-
UTF32String(data::Vector{Char}) = UTF32String(reinterpret(UInt32, data))
32-
33-
isvalid{T<:Union{String,UTF16String,UTF32String}}(str::T) = isvalid(T, str.data)
34-
isvalid{T<:Union{String,UTF16String,UTF32String}}(::Type{T}, str::T) = isvalid(T, str.data)
17+
isvalid{T<:Union{String,UTF16String}}(str::T) = isvalid(T, str.data)
18+
isvalid{T<:Union{String,UTF16String}}(::Type{T}, str::T) = isvalid(T, str.data)

base/unicode/utf16.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

33
# Quickly copy and set trailing \0
4-
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, UInt32}}(
4+
@inline function fast_utf_copy{S <: UTF16String, T <: Union{UInt16, UInt32}}(
55
::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
66
S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1))
77
end

base/unicode/utf32.jl

Lines changed: 5 additions & 209 deletions
Original file line numberDiff line numberDiff line change
@@ -1,131 +1,5 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

3-
# UTF-32 basic functions
4-
next(s::UTF32String, i::Int) = (Char(s.data[i]), i+1)
5-
endof(s::UTF32String) = length(s.data) - 1
6-
length(s::UTF32String) = length(s.data) - 1
7-
8-
reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
9-
10-
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(UInt32)
11-
12-
const empty_utf32 = UTF32String(UInt32[0])
13-
14-
convert(::Type{UTF32String}, c::Char) = UTF32String(UInt32[c, 0])
15-
convert(::Type{UTF32String}, s::UTF32String) = s
16-
17-
function convert(::Type{UTF32String}, str::AbstractString)
18-
len, flags = unsafe_checkstring(str)
19-
buf = Vector{UInt32}(len+1)
20-
out = 0
21-
@inbounds for ch in str ; buf[out += 1] = ch ; end
22-
@inbounds buf[out + 1] = 0 # NULL termination
23-
UTF32String(buf)
24-
end
25-
26-
function convert(::Type{String}, str::UTF32String)
27-
dat = str.data
28-
len = sizeof(dat) >>> 2
29-
# handle zero length string quickly
30-
len <= 1 && return empty_utf8
31-
# get number of bytes to allocate
32-
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
33-
flags == 0 && @inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
34-
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
35-
end
36-
37-
function convert(::Type{UTF32String}, str::String)
38-
dat = str.data
39-
# handle zero length string quickly
40-
sizeof(dat) == 0 && return empty_utf32
41-
# Validate UTF-8 encoding, and get number of words to create
42-
len, flags = unsafe_checkstring(dat)
43-
# Optimize case where no characters > 0x7f
44-
flags == 0 && @inbounds return fast_utf_copy(UTF32String, UInt32, len, dat, true)
45-
# has multi-byte UTF-8 sequences
46-
buf = Vector{UInt32}(len+1)
47-
@inbounds buf[len+1] = 0 # NULL termination
48-
local ch::UInt32, surr::UInt32
49-
out = 0
50-
pos = 0
51-
@inbounds while out < len
52-
ch = dat[pos += 1]
53-
# Handle ASCII characters
54-
if ch <= 0x7f
55-
buf[out += 1] = ch
56-
# Handle range 0x80-0x7ff
57-
elseif ch < 0xe0
58-
buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
59-
# Handle range 0x800-0xffff
60-
elseif ch < 0xf0
61-
pos += 2
62-
ch = get_utf8_3byte(dat, pos, ch)
63-
# Handle surrogate pairs (should have been encoded in 4 bytes)
64-
if is_surrogate_lead(ch)
65-
# Build up 32-bit character from ch and trailing surrogate in next 3 bytes
66-
pos += 3
67-
surr = ((UInt32(dat[pos-2] & 0xf) << 12)
68-
| (UInt32(dat[pos-1] & 0x3f) << 6)
69-
| (dat[pos] & 0x3f))
70-
ch = get_supplementary(ch, surr)
71-
end
72-
buf[out += 1] = ch
73-
# Handle range 0x10000-0x10ffff
74-
else
75-
pos += 3
76-
buf[out += 1] = get_utf8_4byte(dat, pos, ch)
77-
end
78-
end
79-
UTF32String(buf)
80-
end
81-
82-
function convert(::Type{UTF32String}, str::UTF16String)
83-
dat = str.data
84-
len = sizeof(dat)
85-
# handle zero length string quickly (account for trailing \0)
86-
len <= 2 && return empty_utf32
87-
# get number of words to create
88-
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>1)
89-
# No surrogate pairs, do optimized copy
90-
(flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
91-
local ch::UInt32
92-
buf = Vector{UInt32}(len)
93-
out = 0
94-
pos = 0
95-
@inbounds while out < len
96-
ch = dat[pos += 1]
97-
# check for surrogate pair
98-
if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end
99-
buf[out += 1] = ch
100-
end
101-
UTF32String(buf)
102-
end
103-
104-
function convert(::Type{UTF16String}, str::UTF32String)
105-
dat = str.data
106-
len = sizeof(dat)
107-
# handle zero length string quickly
108-
len <= 4 && return empty_utf16
109-
# get number of words to allocate
110-
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2)
111-
# optimized path, no surrogates
112-
num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
113-
return encode_to_utf16(dat, len + num4byte)
114-
end
115-
116-
function convert(::Type{UTF32String}, dat::AbstractVector{UInt32})
117-
@inbounds return fast_utf_copy(UTF32String, UInt32, length(dat), dat, true)
118-
end
119-
120-
convert(::Type{UTF32String}, data::AbstractVector{Int32}) =
121-
convert(UTF32String, reinterpret(UInt32, convert(Vector{T}, data)))
122-
123-
convert(::Type{UTF32String}, data::AbstractVector{Char}) =
124-
convert(UTF32String, map(UInt32, data))
125-
126-
convert{T<:AbstractString, S<:Union{UInt32,Char,Int32}}(::Type{T}, v::AbstractVector{S}) =
127-
convert(T, utf32(v))
128-
1293
# specialize for performance reasons:
1304
function convert{T<:Union{UInt32,Char,Int32}}(::Type{String}, data::AbstractVector{T})
1315
s = IOBuffer(Array(UInt8,length(data)), true, true)
@@ -136,98 +10,20 @@ function convert{T<:Union{UInt32,Char,Int32}}(::Type{String}, data::AbstractVect
13610
convert(String, takebuf_string(s))
13711
end
13812

139-
convert(::Type{Vector{UInt32}}, str::UTF32String) = str.data
140-
convert(::Type{Array{UInt32}}, str::UTF32String) = str.data
141-
142-
unsafe_convert{T<:Union{UInt32,Int32,Char}}(::Type{Ptr{T}}, s::UTF32String) =
143-
convert(Ptr{T}, pointer(s))
144-
145-
function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
146-
isempty(bytes) && return empty_utf32
147-
length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
148-
data = reinterpret(UInt32, bytes)
149-
# check for byte-order mark (BOM):
150-
if data[1] == 0x0000feff # native byte order
151-
d = Array(UInt32, length(data))
152-
copy!(d,1, data, 2, length(data)-1)
153-
elseif data[1] == 0xfffe0000 # byte-swapped
154-
d = Array(UInt32, length(data))
155-
for i = 2:length(data)
156-
@inbounds d[i-1] = bswap(data[i])
157-
end
158-
else
159-
d = Array(UInt32, length(data) + 1)
160-
copy!(d, 1, data, 1, length(data)) # assume native byte order
161-
end
162-
d[end] = 0 # NULL terminate
163-
UTF32String(d)
164-
end
165-
166-
function isvalid(::Type{UTF32String}, str::Union{Vector{UInt32}, Vector{Char}})
167-
for i=1:length(str)
168-
@inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
169-
end
170-
return true
171-
end
172-
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
173-
174-
utf32(x) = convert(UTF32String, x)
175-
176-
utf32(p::Ptr{UInt32}, len::Integer) = utf32(pointer_to_array(p, len))
177-
utf32(p::Union{Ptr{Char}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{UInt32}, p), len)
178-
function utf32(p::Union{Ptr{UInt32}, Ptr{Char}, Ptr{Int32}})
179-
len = 0
180-
while unsafe_load(p, len+1) != 0; len += 1; end
181-
utf32(p, len)
182-
end
183-
184-
function map(f, s::UTF32String)
185-
d = s.data
186-
out = similar(d)
187-
out[end] = 0
188-
189-
@inbounds for i = 1:(length(d)-1)
190-
c2 = f(Char(d[i]))
191-
if !isa(c2, Char)
192-
throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
193-
end
194-
out[i] = (c2::Char)
195-
end
196-
UTF32String(out)
197-
end
198-
19913
# Definitions for C compatible strings, that don't allow embedded
20014
# '\0', and which are terminated by a '\0'
20115

20216
containsnul(s::AbstractString) = '\0' in s
20317
containsnul(s::String) = containsnul(unsafe_convert(Ptr{Cchar}, s), sizeof(s))
204-
containsnul(s::Union{UTF16String,UTF32String}) = findfirst(s.data, 0) != length(s.data)
205-
206-
if sizeof(Cwchar_t) == 2
207-
const WString = UTF16String
208-
const wstring = utf16
209-
elseif sizeof(Cwchar_t) == 4
210-
const WString = UTF32String
211-
const wstring = utf32
212-
end
213-
wstring(s::Cwstring) = wstring(convert(Ptr{Cwchar_t}, s))
214-
215-
# Cwstring is defined in c.jl, but conversion needs to be defined here
216-
# to have WString
217-
function unsafe_convert(::Type{Cwstring}, s::WString)
218-
if containsnul(s)
219-
throw(ArgumentError("embedded NUL chars are not allowed in C strings: $(repr(s))"))
220-
end
221-
return Cwstring(unsafe_convert(Ptr{Cwchar_t}, s))
222-
end
18+
containsnul(s::Union{UTF16String}) = findfirst(s.data, 0) != length(s.data)
22319

22420
# pointer conversions of ASCII/UTF8/UTF16 strings:
225-
pointer(x::Union{String,UTF16String,UTF32String}) = pointer(x.data)
21+
pointer(x::Union{String,UTF16String}) = pointer(x.data)
22622
pointer(x::String, i::Integer) = pointer(x.data)+(i-1)
227-
pointer(x::Union{UTF16String,UTF32String}, i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data))
23+
pointer(x::Union{UTF16String}, i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data))
22824

22925
# pointer conversions of SubString of ASCII/UTF8/UTF16:
23026
pointer(x::SubString{String}) = pointer(x.string.data) + x.offset
23127
pointer(x::SubString{String}, i::Integer) = pointer(x.string.data) + x.offset + (i-1)
232-
pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.string.data))
233-
pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.string.data))
28+
pointer(x::SubString{UTF16String}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.string.data))
29+
pointer(x::SubString{UTF16String}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.string.data))

0 commit comments

Comments
 (0)