Skip to content

Commit 3e53553

Browse files
committed
throw UnicodeError when accessing an invalid UTF-8 index, rather than attempting to adjust it (in getindex, next, search). fix #7811
1 parent 39f720a commit 3e53553

File tree

5 files changed

+43
-25
lines changed

5 files changed

+43
-25
lines changed

base/ascii.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@ sizeof(s::ASCIIString) = sizeof(s.data)
1919
getindex(s::ASCIIString, r::Vector) = ASCIIString(getindex(s.data,r))
2020
getindex(s::ASCIIString, r::UnitRange{Int}) = ASCIIString(getindex(s.data,r))
2121
getindex(s::ASCIIString, indx::AbstractVector{Int}) = ASCIIString(s.data[indx])
22-
search(s::ASCIIString, c::Char, i::Integer) = c < Char(0x80) ? search(s.data,c%UInt8,i) : 0
22+
function search(s::ASCIIString, c::Char, i::Integer)
23+
i == sizeof(s) + 1 && return 0
24+
(i < 1 || i > sizeof(s)) && throw(BoundsError(s, i))
25+
return c < Char(0x80) ? search(s.data,c%UInt8,i) : 0
26+
end
2327
rsearch(s::ASCIIString, c::Char, i::Integer) = c < Char(0x80) ? rsearch(s.data,c%UInt8,i) : 0
2428

2529
function string(c::ASCIIString...)

base/dates/io.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,12 @@ slotparse(slot::Slot{Millisecond},x) = !ismatch(r"[^0-9\s]",x) ? slot.period(Bas
102102
slotparse(slot::Slot{DayOfWeekSlot},x) = nothing
103103

104104
function getslot(x,slot::DelimitedSlot,df,cursor)
105-
endind = first(search(x,df.trans[slot.i],cursor+1))
105+
endind = first(search(x,df.trans[slot.i],nextind(x,cursor)))
106106
if endind == 0 # we didn't find the next delimiter
107107
s = x[cursor:end]
108108
return (endof(x)+1, isdigit(s) ? slotparse(slot,s) : default(slot.period))
109109
end
110-
return endind+1, slotparse(slot,x[cursor:(endind-1)])
110+
return nextind(x,endind), slotparse(slot,x[cursor:(endind-1)])
111111
end
112112
getslot(x,slot,df,cursor) = (cursor+slot.width, slotparse(slot,x[cursor:(cursor+slot.width-1)]))
113113

base/string.jl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,10 +188,9 @@ function search(s::AbstractString, c::Chars, i::Integer)
188188
return 1 <= i <= nextind(s,endof(s)) ? i :
189189
throw(BoundsError(s, i))
190190
end
191-
if i < 1
191+
if i < 1 || i > nextind(s,endof(s))
192192
throw(BoundsError(s, i))
193193
end
194-
i = nextind(s,i-1)
195194
while !done(s,i)
196195
d, j = next(s,i)
197196
if d in c
@@ -655,7 +654,7 @@ prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset
655654

656655
convert{T<:AbstractString}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s))
657656

658-
bytestring{T <: ByteString}(p::SubString{T}) = bytestring(pointer(p.string.data)+p.offset, nextind(p, p.endof)-1)
657+
bytestring{T <: ByteString}(p::SubString{T}) = bytestring(p.string.data[1+p.offset:p.offset+nextind(p, p.endof)-1])
659658

660659
function getindex(s::AbstractString, r::UnitRange{Int})
661660
if first(r) < 1 || endof(s) < last(r)

base/utf8.jl

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -66,16 +66,7 @@ function next(s::UTF8String, i::Int)
6666
d = s.data
6767
b = d[i]
6868
if !is_utf8_start(b)
69-
j = i-1
70-
while 0 < j && !is_utf8_start(d[j])
71-
j -= 1
72-
end
73-
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
74-
# b is a continuation byte of a valid UTF-8 character
75-
throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
76-
end
77-
# move past 1 byte in case the data is actually Latin-1
78-
return '\ufffd', i+1
69+
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
7970
end
8071
trailing = utf8_trailing[b+1]
8172
if length(d) < i + trailing
@@ -123,8 +114,11 @@ function getindex(s::UTF8String, r::UnitRange{Int})
123114
isempty(r) && return empty_utf8
124115
i, j = first(r), last(r)
125116
d = s.data
117+
if i < 1 || i > length(s.data)
118+
throw(BoundsError(s, i))
119+
end
126120
if !is_utf8_start(d[i])
127-
i = nextind(s,i)
121+
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
128122
end
129123
if j > length(d)
130124
throw(BoundsError())
@@ -134,9 +128,17 @@ function getindex(s::UTF8String, r::UnitRange{Int})
134128
end
135129

136130
function search(s::UTF8String, c::Char, i::Integer)
137-
c < Char(0x80) && return search(s.data, c%UInt8, i)
131+
if i < 1 || i > sizeof(s)
132+
i == sizeof(s) + 1 && return 0
133+
throw(BoundsError(s, i))
134+
end
135+
d = s.data
136+
if !is_utf8_start(d[i])
137+
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
138+
end
139+
c < Char(0x80) && return search(d, c%UInt8, i)
138140
while true
139-
i = search(s.data, first_utf8_byte(c), i)
141+
i = search(d, first_utf8_byte(c), i)
140142
(i==0 || s[i] == c) && return i
141143
i = next(s,i)[2]
142144
end

test/strings.jl

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,8 @@ u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε"
262262

263263
# ascii search
264264
for str in [astr, Base.GenericString(astr)]
265+
@test_throws BoundsError search(str, 'z', 0)
266+
@test_throws BoundsError search(str, '∀', 0)
265267
@test search(str, 'x') == 0
266268
@test search(str, '\0') == 0
267269
@test search(str, '\u80') == 0
@@ -275,6 +277,8 @@ for str in [astr, Base.GenericString(astr)]
275277
@test search(str, ',', 7) == 0
276278
@test search(str, '\n') == 14
277279
@test search(str, '\n', 15) == 0
280+
@test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1)
281+
@test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1)
278282
end
279283

280284
# ascii rsearch
@@ -297,23 +301,32 @@ end
297301

298302
# utf-8 search
299303
for str in (u8str, Base.GenericString(u8str))
304+
@test_throws BoundsError search(str, 'z', 0)
305+
@test_throws BoundsError search(str, '∀', 0)
300306
@test search(str, 'z') == 0
301307
@test search(str, '\0') == 0
302308
@test search(str, '\u80') == 0
303309
@test search(str, '∄') == 0
304310
@test search(str, '∀') == 1
305-
@test search(str, '∀', 2) == 0
311+
@test_throws UnicodeError search(str, '∀', 2)
312+
@test search(str, '∀', 4) == 0
306313
@test search(str, '∃') == 13
307-
@test search(str, '∃', 14) == 0
314+
@test_throws UnicodeError search(str, '∃', 15)
315+
@test search(str, '∃', 16) == 0
308316
@test search(str, 'x') == 26
309317
@test search(str, 'x', 27) == 43
310318
@test search(str, 'x', 44) == 0
311319
@test search(str, 'δ') == 17
312-
@test search(str, 'δ', 18) == 33
313-
@test search(str, 'δ', 34) == 0
320+
@test_throws UnicodeError search(str, 'δ', 18)
321+
@test search(str, 'δ', nextind(str,17)) == 33
322+
@test search(str, 'δ', nextind(str,33)) == 0
314323
@test search(str, 'ε') == 5
315-
@test search(str, 'ε', 6) == 54
316-
@test search(str, 'ε', 55) == 0
324+
@test search(str, 'ε', nextind(str,5)) == 54
325+
@test search(str, 'ε', nextind(str,54)) == 0
326+
@test search(str, 'ε', nextind(str,endof(str))) == 0
327+
@test search(str, 'a', nextind(str,endof(str))) == 0
328+
@test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1)
329+
@test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1)
317330
end
318331

319332
# utf-8 rsearch

0 commit comments

Comments
 (0)