Skip to content

Commit ff0c385

Browse files
new Char: up to 4 zero-padded UTF-8 bytes; basic change passes tests
1 parent 1a33c93 commit ff0c385

File tree

6 files changed

+52
-46
lines changed

6 files changed

+52
-46
lines changed

base/char.jl

Lines changed: 21 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,25 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

3-
convert(::Type{Char}, x::UInt32) = reinterpret(Char, x)
3+
function convert(::Type{UInt32}, c::Char)
4+
u = reinterpret(UInt32, c)
5+
u $= ifelse(
6+
u <= 0x0000ffff,
7+
ifelse(u <= 0x000000ff, 0x00000000, 0x0000c080),
8+
ifelse(u <= 0x00ffffff, 0x00e08080, 0xf0808080),
9+
)
10+
((u & 0x000000ff) >> 0) $ ((u & 0x0000ff00) >> 2) $
11+
((u & 0x00ff0000) >> 4) $ ((u & 0xff000000) >> 6)
12+
end
13+
14+
function convert(::Type{Char}, u::UInt32)
15+
u < 0x00200000 || throw(InexactError())
16+
c = (u & 0x3f) | ((u << 2) & 0x3f00) | ((u << 4) & 0x3f0000) | ((u << 6) & 0x3f000000)
17+
reinterpret(Char, ifelse(u <= 0x7f, u,
18+
c | ifelse(u <= 0x000007ff, 0x0000c080,
19+
ifelse(u <= 0x0000ffff, 0x00e08080, 0xf0808080))))
20+
end
21+
422
convert(::Type{Char}, x::Number) = Char(UInt32(x))
5-
convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x)
623
convert{T<:Number}(::Type{T}, x::Char) = convert(T, UInt32(x))
724

825
rem{T<:Number}(x::Char, ::Type{T}) = rem(UInt32(x), T)
@@ -29,11 +46,11 @@ done(c::Char, state) = state
2946
isempty(c::Char) = false
3047
in(x::Char, y::Char) = x == y
3148

32-
==(x::Char, y::Char) = UInt32(x) == UInt32(y)
49+
==(x::Char, y::Char) = eq_int(unbox(Char,x), unbox(Char,y))
3350
==(x::Char, y::Integer) = UInt32(x) == y
3451
==(x::Integer, y::Char) = x == UInt32(y)
3552

36-
isless(x::Char, y::Char) = isless(UInt32(x), UInt32(y))
53+
isless(x::Char, y::Char) = ult_int(unbox(Char,x), unbox(Char,y))
3754
isless(x::Char, y::Integer) = isless(UInt32(x), y)
3855
isless(x::Integer, y::Char) = isless(x, UInt32(y))
3956

@@ -42,8 +59,6 @@ isless(x::Integer, y::Char) = isless(x, UInt32(y))
4259
+(x::Char, y::Integer) = Char(Int32(x) + Int32(y))
4360
+(x::Integer, y::Char) = y + x
4461

45-
bswap(x::Char) = Char(bswap(UInt32(x)))
46-
4762
print(io::IO, c::Char) = (write(io, c); nothing)
4863

4964
const hex_chars = UInt8['0':'9';'a':'z']

base/io.jl

Lines changed: 23 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -70,25 +70,15 @@ function write(s::IO, a::AbstractArray)
7070
return nb
7171
end
7272

73-
function write(s::IO, ch::Char)
74-
c = reinterpret(UInt32, ch)
75-
if c < 0x80
76-
return write(s, c%UInt8)
77-
elseif c < 0x800
78-
return (write(s, (( c >> 6 ) | 0xC0)%UInt8)) +
79-
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
80-
elseif c < 0x10000
81-
return (write(s, (( c >> 12 ) | 0xE0)%UInt8)) +
82-
(write(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8)) +
83-
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
84-
elseif c < 0x110000
85-
return (write(s, (( c >> 18 ) | 0xF0)%UInt8)) +
86-
(write(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8)) +
87-
(write(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8)) +
88-
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
89-
else
90-
return write(s, '\ufffd')
73+
function write(io::IO, c::Char)
74+
u = bswap(reinterpret(UInt32, c))
75+
n = 24 & trailing_zeros(u)
76+
u >>= n
77+
while true
78+
write(io, u % UInt8)
79+
0 < (u >>= 8) || break
9180
end
81+
4 - (n >> 3)
9282
end
9383

9484
function write(s::IO, p::Ptr, n::Integer)
@@ -144,22 +134,22 @@ function read!{T}(s::IO, a::Array{T})
144134
end
145135

146136
function read(s::IO, ::Type{Char})
147-
ch = read(s, UInt8)
148-
if ch < 0x80
149-
return Char(ch)
150-
end
151-
152-
# mimic utf8.next function
153-
trailing = Base.utf8_trailing[ch+1]
154-
c::UInt32 = 0
155-
for j = 1:trailing
156-
c += ch
157-
c <<= 6
158-
ch = read(s, UInt8)
137+
b0 = read(s, UInt8)
138+
n = leading_ones(b0)
139+
c = UInt32(b0)
140+
2 <= n <= 4 || return reinterpret(Char, c)
141+
mark(s)
142+
while n > 1
143+
b = read(s, UInt8)
144+
if b & 0xc0 != 0x80
145+
reset(s)
146+
return reinterpret(Char, UInt32(b0))
147+
end
148+
c <<= 8
149+
c |= b
150+
n -= 1
159151
end
160-
c += ch
161-
c -= Base.utf8_offset[trailing+1]
162-
Char(c)
152+
return reinterpret(Char, c)
163153
end
164154

165155
function readuntil(s::IO, delim::Char)

base/unicode/types.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ immutable UTF32String <: DirectIndexString
2828
new(data)
2929
end
3030
end
31-
UTF32String(data::Vector{Char}) = UTF32String(reinterpret(UInt32, data))
31+
UTF32String(data::Vector{Char}) = UTF32String(map(UInt32, data))
3232

3333
isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(str::T) = isvalid(T, str.data)
3434
isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(::Type{T}, str::T) = isvalid(T, str.data)

src/ast.c

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -392,8 +392,12 @@ static jl_value_t *scm_to_julia_(value_t e, int eo)
392392
}
393393
}
394394
if (iscprim(e) && cp_class((cprim_t*)ptr(e))==wchartype) {
395-
jl_value_t *wc =
396-
jl_box32(jl_char_type, *(int32_t*)cp_data((cprim_t*)ptr(e)));
395+
uint32_t u = *(uint32_t*)cp_data((cprim_t*)ptr(e));
396+
if (u > 0x7f) {
397+
u = (u <= 0x000007ff ? 0x0000c080 : u <= 0x0000ffff ? 0x00e08080 : 0xf0808080) |
398+
(u & 0x3f) | ((u << 2) & 0x3f00) | ((u << 4) & 0x3f0000) | ((u << 6) & 0x3f000000);
399+
}
400+
jl_value_t *wc = jl_box32(jl_char_type, u);
397401
return wc;
398402
}
399403
if (iscvalue(e) && cv_class((cvalue_t*)ptr(e)) == jvtype) {

test/char.jl

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,6 @@
99
# This is current behavior, but it seems incorrect
1010
@test getindex('a',1,1,1) == 'a'
1111
@test_throws BoundsError getindex('a',1,1,2)
12-
# bswap of a Char should be removed, only the underlying codeunit (UInt32)
13-
# should be swapped
14-
@test bswap('\U10200') == '\U20100'
1512

1613
let
1714

test/strings/basic.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -268,7 +268,7 @@ c = Char['0','\0']
268268
d = UTF32String(c)
269269
@test d == "0"
270270
c[1] = 'A'
271-
@test d == "A"
271+
@test d == "0"
272272

273273
# iteration
274274
@test [c for c in "ḟøøƀäṙ"] == ['ḟ', 'ø', 'ø', 'ƀ', 'ä', 'ṙ']

0 commit comments

Comments
 (0)