Skip to content

Commit c457bfb

Browse files
committed
Add Unicode validation function
Adds the `check_string` function, which checks a vector of bytes, a vector of 16-bit or 32-bit words, or an AbstractString for validity as UTF-8, UTF-16, or UTF-32 encoded text. By default, `Modified UTF-8 (long \0 encoding)` and `CESU-8 (surrogate pairs encoded as 2 UTF-8 3-byte sequences)` are allowed, while other overlong encoded sequences are rejected; this behavior can be changed with the keyword arguments. Follow-up changes included in this commit: add unit tests of all the errors found by `check_string`; update the documentation to not use doxygen tags; move documentation strings from the line after to the line before; add testing of valid strings; improve/consolidate documentation; add bounds checking; change the name to `unsafe_checkstring` and warn that it doesn't check bounds; add `checkstring`, which does check bounds; add tests of bounds checking; change the order of start/end positions; update bounds checking tests; change 1 to `start(dat)`.
1 parent 99391b3 commit c457bfb

File tree

3 files changed

+389
-0
lines changed

3 files changed

+389
-0
lines changed

base/sysimg.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ include("osutils.jl")
8787
# strings & printing
8888
include("utferror.jl")
8989
include("utftypes.jl")
90+
include("utfcheck.jl")
9091
include("char.jl")
9192
include("ascii.jl")
9293
include("utf8.jl")

base/utfcheck.jl

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
# This file is a part of Julia. License is MIT: http://julialang.org/license
2+
3+
## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
4+
# and also to return information necessary to convert to other encodings
5+
6+
# True when `c` is a UTF-16 lead (high) surrogate code unit, i.e. 0xd800-0xdbff.
# A range comparison is used rather than masking with a 32-bit literal, because
# the mask would discard the high bits of wider unsigned values (UInt64/UInt128)
# and misreport e.g. 0x000000010000d800 as a surrogate.
is_surrogate_lead(c::Unsigned) = (0xd800 <= c <= 0xdbff)
7+
# True when `c` is a UTF-16 trail (low) surrogate code unit, i.e. 0xdc00-0xdfff.
# A range comparison is used rather than masking with a 32-bit literal, because
# the mask would discard the high bits of wider unsigned values (UInt64/UInt128)
# and misreport e.g. 0x000000010000dc00 as a surrogate.
is_surrogate_trail(c::Unsigned) = (0xdc00 <= c <= 0xdfff)
8+
# True when `c` is any UTF-16 surrogate code unit (lead or trail), i.e. 0xd800-0xdfff.
# A range comparison is used rather than masking with a 32-bit literal, because
# the mask would discard the high bits of wider unsigned values (UInt64/UInt128)
# and misreport e.g. 0x000000010000d800 as a surrogate.
is_surrogate_codeunit(c::Unsigned) = (0xd800 <= c <= 0xdfff)
9+
# A UTF-8 continuation byte has the bit pattern 10xxxxxx: isolate the two
# top bits of the byte and compare them with 0x80.
function is_valid_continuation(c)
    top_bits = c & 0xc0
    return top_bits == 0x80
end
10+
11+
## Bit flags OR-ed together into the `flags` value returned by
## unsafe_checkstring / checkstring

const UTF_LONG      = 1 << 0   # overlong encodings are present
const UTF_LATIN1    = 1 << 1   # characters in range 0x80-0xff are present
const UTF_UNICODE2  = 1 << 2   # characters in range 0x100-0x7ff are present
const UTF_UNICODE3  = 1 << 3   # characters in range 0x800-0xd7ff, 0xe000-0xffff are present
const UTF_UNICODE4  = 1 << 4   # non-BMP characters are present
const UTF_SURROGATE = 1 << 5   # surrogate pairs are present
19+
20+
## Fold one UTF-8 continuation byte into the code point being assembled.
## `ch`  is the value accumulated so far, `byt` the byte just read, and `pos`
## the position reported in the error if `byt` is rejected.
## Throws UnicodeError(UTF_ERR_CONT, pos, byt) when `byt` is not a continuation
## byte; otherwise returns `ch` shifted left by 6 bits with the low 6 bits of
## `byt` OR-ed in.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    if !is_valid_continuation(byt)
        throw(UnicodeError(UTF_ERR_CONT, pos, byt))
    end
    payload = byt & 0x3f
    return (ch << 6) | payload
end
25+
"""
    unsafe_checkstring(dat, pos = start(dat), endpos = endof(dat); kwargs...)

Validate a UTF-8, UTF-16 or UTF-32 encoded vector/string and count the characters in it.

Warning: the start and end positions are *not* bounds checked by this function.
Use `checkstring` when the positions have not already been validated.

### Input Arguments:
* `dat` UTF-8 (`Vector{UInt8}`), UTF-16 (`Vector{UInt16}`) or UTF-32 (`Vector{UInt32}`, `AbstractString`) encoded string

### Optional Input Arguments:
* `pos`    start position (defaults to `start(dat)`)
* `endpos` end position (defaults to `endof(dat)`)

### Keyword Arguments:
* `accept_long_null`  = `true`  # Modified UTF-8 (`\\0` represented as `b"\\xc0\\x80"`)
* `accept_surrogates` = `true`  # `CESU-8`
* `accept_long_char`  = `false` # Accept arbitrary long encodings

### Returns:
* (total characters, flags, 4-byte, 3-byte, 2-byte), where `flags` is a
  combination of the `UTF_*` flag constants and the last three values count the
  characters of each encoded length

### Throws:
* `UnicodeError`
"""
function unsafe_checkstring end
51+
52+
# UTF-8 method: validates the bytes dat[pos:endpos] and counts characters.
#
# Returns (totalchar, flags, num4byte, num3byte, num2byte), where `flags` is an
# OR of the UTF_* flag constants. The positions are NOT bounds checked (the loop
# runs under @inbounds). Throws UnicodeError on malformed input.
#
# Keyword arguments:
#   accept_long_null  - accept the 2-byte overlong encoding of NUL (0xc0 0x80)
#   accept_surrogates - accept a surrogate pair encoded as two 3-byte sequences
#   accept_long_char  - accept any overlong encoding
#
# Note: after each `next(dat, pos)`, `pos` refers to the byte *following* the one
# just read; the `pos-2` / `pos-3` offsets in the error calls are relative to that.
function unsafe_checkstring(dat::Vector{UInt8},
                            pos = start(dat),
                            endpos = endof(dat)
                            ;
                            accept_long_null = true,
                            accept_surrogates = true,
                            accept_long_char = false)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    @inbounds while pos <= endpos
        ch, pos = next(dat, pos)
        totalchar += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes and can never start a sequence.
                # Without this check they would be decoded as 2-byte lead bytes,
                # so e.g. [0xbf, 0xbf] would be accepted as a valid character.
                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                byt, pos = next(dat, pos)
                ch = get_continuation(ch & 0x3f, byt, pos)
                if ch > 0x7f
                    num2byte += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif accept_long_char
                    flags |= UTF_LONG
                elseif (ch == 0) && accept_long_null
                    flags |= UTF_LONG
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos, ch))
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                byt, pos = next(dat, pos)
                ch = get_continuation(ch & 0x0f, byt, pos)
                byt, pos = next(dat, pos)
                ch = get_continuation(ch, byt, pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_codeunit(ch)
                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
                    # next character *must* be a trailing surrogate character,
                    # itself encoded as a 3-byte sequence starting with 0xed
                    (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
                    byt, pos = next(dat, pos)
                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
                    byt, pos = next(dat, pos)
                    # 0x0000d is a UInt32 literal: the payload bits of the 0xed lead byte
                    surr = get_continuation(0x0000d, byt, pos)
                    byt, pos = next(dat, pos)
                    surr = get_continuation(surr, byt, pos)
                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
                    flags |= UTF_SURROGATE
                    # the pair represents one non-BMP character
                    num4byte += 1
                elseif ch > 0x07ff
                    num3byte += 1
                elseif accept_long_char
                    # Overlong 3-byte encoding: count it by the length of its
                    # shortest form, as the 4-byte branch below does. A value that
                    # fits in ASCII must not be counted as a 2-byte character.
                    flags |= UTF_LONG
                    ch > 0x7f && (num2byte += 1)
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                byt, pos = next(dat, pos)
                ch = get_continuation(ch & 0x07, byt, pos)
                byt, pos = next(dat, pos)
                ch = get_continuation(ch, byt, pos)
                byt, pos = next(dat, pos)
                ch = get_continuation(ch, byt, pos)
                if ch > 0x10ffff
                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
                elseif ch > 0xffff
                    num4byte += 1
                elseif is_surrogate_codeunit(ch)
                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
                elseif accept_long_char
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        num3byte += 1
                    elseif ch > 0x7f
                        num2byte += 1
                    end
                else
                    # same position offset as the other 4-byte sequence errors above
                    throw(UnicodeError(UTF_ERR_LONG, pos-3, ch))
                end
            else
                # lead bytes 0xf5-0xff never occur in UTF-8
                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
147+
148+
# UTF-16 / UTF-32 / AbstractString method: validates the code units (or
# characters) of `dat` between `pos` and `endpos` and counts how many bytes each
# character would occupy in UTF-8.
#
# Returns (total characters, flags, 4-byte, 3-byte, 2-byte). The positions are
# NOT bounds checked (the loop runs under @inbounds). Throws UnicodeError on
# invalid input. `accept_long_null` and `accept_long_char` are accepted for
# signature compatibility but are not consulted by this method.
function unsafe_checkstring{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)}(
                            dat::T,
                            pos = start(dat),
                            endpos = endof(dat)
                            ;
                            accept_long_null = true,
                            accept_surrogates = true,
                            accept_long_char = false)
    local cu::UInt32
    flags::UInt = 0
    nchars = n2 = n3 = n4 = 0
    @inbounds while pos <= endpos
        cu, pos = next(dat, pos)
        nchars += 1
        # ASCII needs no further classification
        cu <= 0x7f && continue
        if cu < 0x800
            # 0x80-0x7ff: two bytes in UTF-8
            n2 += 1
            flags |= (cu < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
        elseif cu > 0x0ffff
            # beyond the BMP: must still be a valid Unicode code point
            cu > 0x10ffff && throw(UnicodeError(UTF_ERR_INVALID, pos, cu))
            n4 += 1
        elseif !is_surrogate_codeunit(cu)
            n3 += 1
        else
            # A surrogate must be a lead surrogate immediately followed by a trail
            is_surrogate_lead(cu) || throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, cu))
            pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, cu))
            cu, pos = next(dat, pos)
            is_surrogate_trail(cu) || throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, cu))
            n4 += 1
            # Surrogate pairs are only flagged (and optionally rejected) when
            # the input is not a UInt16 vector
            if T != Vector{UInt16}
                accept_surrogates || throw(UnicodeError(UTF_ERR_SURROGATE, pos, cu))
                flags |= UTF_SURROGATE
            end
        end
    end
    n3 == 0 || (flags |= UTF_UNICODE3)
    n4 == 0 || (flags |= UTF_UNICODE4)
    return nchars, flags, n4, n3, n2
end
193+
"""
    checkstring(dat, startpos = start(dat), endpos = endof(dat); kwargs...)

Validate a UTF-8, UTF-16 or UTF-32 encoded vector/string and count the characters in it.

This function checks the bounds of the start and end positions.
Use `unsafe_checkstring` to avoid that overhead if the bounds have already been checked.

### Input Arguments:
* `dat` UTF-8 (`Vector{UInt8}`), UTF-16 (`Vector{UInt16}`) or UTF-32 (`Vector{UInt32}`, `AbstractString`) encoded string

### Optional Input Arguments:
* `startpos` start position (defaults to `start(dat)`)
* `endpos`   end position (defaults to `endof(dat)`)

### Keyword Arguments:
* `accept_long_null`  = `true`  # Modified UTF-8 (`\\0` represented as `b"\\xc0\\x80"`)
* `accept_surrogates` = `true`  # `CESU-8`
* `accept_long_char`  = `false` # Accept arbitrary long encodings

### Returns:
* (total characters, flags, 4-byte, 3-byte, 2-byte), where `flags` is a
  combination of the `UTF_*` flag constants and the last three values count the
  characters of each encoded length

### Throws:
* `UnicodeError`
"""
function checkstring end
219+
220+
# The default positions span exactly the whole of `dat`, so they are forwarded
# to unsafe_checkstring without a bounds check
function checkstring(dat; kwargs...)
    first_pos = start(dat)
    last_pos = endof(dat)
    return unsafe_checkstring(dat, first_pos, last_pos; kwargs...)
end
222+
223+
# Make sure that explicitly supplied beginning and end positions are bounds checked.
#
# `startpos` deliberately has no default value. Giving it one would make this
# definition also generate a one-positional-argument method `checkstring(dat)`,
# which replaces the existing `checkstring(dat; kwargs...)` method that forwards
# the default range unchecked. The default range of an empty input
# (e.g. an empty vector, where `start(dat) > endof(dat)`) would then fail the
# range test below with a BoundsError instead of being reported as valid.
#
# Throws BoundsError when `startpos` precedes `start(dat)`, or when `endpos` is
# before `startpos` or past `endof(dat)`; otherwise returns whatever
# unsafe_checkstring returns for the given range.
function checkstring(dat, startpos, endpos = endof(dat); kwargs...)
    startpos < start(dat) && throw(BoundsError(dat, startpos))
    (startpos <= endpos <= endof(dat)) || throw(BoundsError(dat, endpos))
    return unsafe_checkstring(dat, startpos, endpos; kwargs...)
end

0 commit comments

Comments
 (0)