Skip to content

Commit 3c83c17

Browse files
committed
Updates for keywords
1 parent 27a0187 commit 3c83c17

File tree

1 file changed

+21
-25
lines changed

1 file changed

+21
-25
lines changed

src/CheckStrings.jl

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,7 @@ is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00)
1919
is_surrogate_codeunit(c::Unsigned) = ((c & ~0x007ff) == 0xd800)
2020
is_valid_continuation(c) = ((c & 0xc0) == 0x80)
2121

22-
## Options for check_string_* functions
23-
24-
const UTF_NO_LONG_NULL = 1 ##< don't accept 0xc0 0x80 for '\0'
25-
const UTF_NO_SURROGATES = 2 ##< don't accept surrogate pairs in UTF-8/UTF-32
26-
const UTF_ACCEPT_LONG = 4 ##< accept long encodings (other than long null in UTF-8)
22+
## Return flags for check_string function
2723

2824
const UTF_LONG = 1 ##< Long encodings are present
2925
const UTF_LATIN1 = 2 ##< characters in range 0x80-0xFF present
@@ -39,10 +35,10 @@ const UTF_SURROGATE = 32 ##< surrogate pairs present
3935
end
4036

4137
CodeUnitType = Union(UInt8, UInt16, UInt32, Char)
42-
CodeUnitC = Union(Vector{Char}, AbstractVector{Char}, AbstractArray{Char})
43-
CodeUnit8 = Union(Vector{UInt8}, AbstractVector{UInt8}, AbstractArray{UInt8})
44-
CodeUnit16 = Union(Vector{UInt16}, AbstractVector{UInt16}, AbstractArray{UInt16})
45-
CodeUnit32 = Union(Vector{UInt32}, AbstractVector{UInt32}, AbstractArray{UInt32})
38+
CodeUnitC = AbstractArray{Char}
39+
CodeUnit8 = AbstractArray{UInt8}    # any array whose elements are UInt8 code units
40+
CodeUnit16 = AbstractArray{UInt16}  # any array whose elements are UInt16 code units
41+
CodeUnit32 = AbstractArray{UInt32}
4642
CodeUnit = Union(CodeUnit8, CodeUnit16, CodeUnit32, CodeUnitC, AbstractString)
4743

4844
"
@@ -57,9 +53,9 @@ Validates and calculates number of characters in a UTF-8 encoded vector of `UInt
5753
* `pos` start position (defaults to `start(dat)`)
5854
5955
### Keyword Arguments:
60-
* `accept_long_null` = `true` # Modified UTF-8
61-
* `accept_surrogates` = `true` # CESU-8
62-
* `accept_long` = `false`
56+
* `accept_long_null` = `true` # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`)
57+
* `accept_surrogates` = `true` # `CESU-8`
58+
* `accept_long_char` = `false` # Accept arbitrary long encodings
6359
6460
### Returns:
6561
* (total characters, flags, 4-byte, 3-byte, 2-byte)
@@ -71,11 +67,11 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
7167
::Type{E},
7268
dat::T,
7369
endpos = endof(dat),
74-
pos = (T <: AbstractString) ? start(dat) : 1
70+
pos = start(dat)
7571
;
76-
accept_long_null = true,
72+
accept_long_null = true,
7773
accept_surrogates = true,
78-
accept_long = false)
74+
accept_long_char = false)
7975
local byt::UInt8, ch::UInt32, surr::UInt32
8076
flags::UInt = 0
8177
totalchar = num2byte = num3byte = num4byte = 0
@@ -87,22 +83,22 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
8783
# Check UTF-8 encoding
8884
if ch < 0xe0
8985
# 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
90-
(pos >= endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
86+
(pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
9187
byt, pos = next(dat, pos)
9288
ch = get_continuation(ch & 0x3f, byt, pos)
9389
if ch > 0x7f
9490
num2byte += 1
9591
flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
96-
elseif AcceptAnyLong
92+
elseif accept_long_char
9793
flags |= UTF_LONG
98-
elseif (ch == 0) && AcceptLongNull
94+
elseif (ch == 0) && accept_long_null
9995
flags |= UTF_LONG
10096
else
10197
throw(UnicodeError(UTF_ERR_LONG, pos, ch))
10298
end
10399
elseif ch < 0xf0
104100
# 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
105-
(pos + 2 >= endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
101+
(pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
106102
byt, pos = next(dat, pos)
107103
ch = get_continuation(ch & 0x0f, byt, pos)
108104
byt, pos = next(dat, pos)
@@ -111,28 +107,28 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
111107
if is_surrogate_codeunit(ch)
112108
!is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
113109
# next character *must* be a trailing surrogate character
114-
(pos + 3 >= endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
110+
(pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
115111
byt, pos = next(dat, pos)
116112
(byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
117113
byt, pos = next(dat, pos)
118114
surr = get_continuation(0x0000d, byt, pos)
119115
byt, pos = next(dat, pos)
120116
surr = get_continuation(surr, byt, pos)
121117
!is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
122-
!AcceptSurrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
118+
!accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
123119
flags |= UTF_SURROGATE
124120
num4byte += 1
125121
elseif ch > 0x07ff
126122
num3byte += 1
127-
elseif AcceptAnyLong
123+
elseif accept_long_char
128124
flags |= UTF_LONG
129125
num2byte += 1
130126
else
131127
throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
132128
end
133129
elseif ch < 0xf5
134130
# 4-byte UTF-8 sequence (i.e. characters > 0xffff)
135-
(pos + 3 >= endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
131+
(pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
136132
byt, pos = next(dat, pos)
137133
ch = get_continuation(ch & 0x07, byt, pos)
138134
byt, pos = next(dat, pos)
@@ -145,7 +141,7 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
145141
num4byte += 1
146142
elseif is_surrogate_codeunit(ch)
147143
throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
148-
elseif AcceptAnyLong
144+
elseif accept_long_char
149145
# This is an overly long encoded character
150146
flags |= UTF_LONG
151147
if ch > 0x7ff
@@ -178,7 +174,7 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
178174
!is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
179175
num4byte += 1
180176
if E !<: UTF16
181-
!AcceptSurrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
177+
!accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
182178
flags |= UTF_SURROGATE
183179
end
184180
else

0 commit comments

Comments
 (0)