@@ -19,11 +19,7 @@ is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00)
1919is_surrogate_codeunit (c:: Unsigned ) = ((c & ~ 0x007ff ) == 0xd800 )
2020is_valid_continuation (c) = ((c & 0xc0 ) == 0x80 )
2121
22- # # Options for check_string_* functions
23-
24- const UTF_NO_LONG_NULL = 1 # #< don't accept 0xc0 0x80 for '\0'
25- const UTF_NO_SURROGATES = 2 # #< don't accept surrogate pairs in UTF-8/UTF-32
26- const UTF_ACCEPT_LONG = 4 # #< accept long encodings (other than long null in UTF-8)
22+ # # Return flags for check_string function
2723
2824const UTF_LONG = 1 # #< Long encodings are present
2925const UTF_LATIN1 = 2 # #< characters in range 0x80-0xFF present
@@ -39,10 +35,10 @@ const UTF_SURROGATE = 32 ##< surrogate pairs present
3935end
4036
4137CodeUnitType = Union (UInt8, UInt16, UInt32, Char)
42- CodeUnitC = Union (Vector{Char}, AbstractVector{Char}, AbstractArray{Char})
43- CodeUnit8 = Union (Vector{ UInt8}, AbstractVector{UInt8}, AbstractArray{UInt8})
44- CodeUnit16 = Union (Vector {UInt16}, AbstractVector{UInt16}, AbstractArray{UInt16})
45- CodeUnit32 = Union (Vector{UInt32}, AbstractVector{UInt32}, AbstractArray{UInt32})
38+ CodeUnitC = AbstractArray{Char}
39+ CodeUnit8 = AbstractArray{UInt8}
40+ CodeUnit16 = AbstractArray{UInt16}
41+ CodeUnit32 = AbstractArray{UInt32}
4642CodeUnit = Union (CodeUnit8, CodeUnit16, CodeUnit32, CodeUnitC, AbstractString)
4743
4844"
@@ -57,9 +53,9 @@ Validates and calculates number of characters in a UTF-8 encoded vector of `UInt
5753* `pos` start position (defaults to `start(dat)`)
5854
5955### Keyword Arguments:
60- * `accept_long_null` = `true` # Modified UTF-8
61- * `accept_surrogates` = `true` # CESU-8
62- * `accept_long` = `false`
56+ * `accept_long_null` = `true` # Modified UTF-8 (` \\ 0` represented as `b \"\\ xc0 \\ x80 \" `)
57+ * `accept_surrogates` = `true` # ` CESU-8`
58+ * `accept_long_char` = `false` # Accept arbitrary long encodings
6359
6460### Returns:
6561* (total characters, flags, 4-byte, 3-byte, 2-byte)
@@ -71,11 +67,11 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
7167 :: Type{E} ,
7268 dat:: T ,
7369 endpos = endof (dat),
74- pos = (T <: AbstractString ) ? start (dat) : 1
70+ pos = start (dat)
7571 ;
76- accept_long_null = true ,
72+ accept_long_null = true ,
7773 accept_surrogates = true ,
78- accept_long = false )
74+ accept_long_char = false )
7975 local byt:: UInt8 , ch:: UInt32 , surr:: UInt32
8076 flags:: UInt = 0
8177 totalchar = num2byte = num3byte = num4byte = 0
@@ -87,22 +83,22 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
8783 # Check UTF-8 encoding
8884 if ch < 0xe0
8985 # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
90- (pos >= endpos) && throw (UnicodeError (UTF_ERR_SHORT, pos, ch))
86+ (pos > endpos) && throw (UnicodeError (UTF_ERR_SHORT, pos, ch))
9187 byt, pos = next (dat, pos)
9288 ch = get_continuation (ch & 0x3f , byt, pos)
9389 if ch > 0x7f
9490 num2byte += 1
9591 flags |= (ch > 0xff ) ? UTF_UNICODE2 : UTF_LATIN1
96- elseif AcceptAnyLong
92+ elseif accept_long_char
9793 flags |= UTF_LONG
98- elseif (ch == 0 ) && AcceptLongNull
94+ elseif (ch == 0 ) && accept_long_null
9995 flags |= UTF_LONG
10096 else
10197 throw (UnicodeError (UTF_ERR_LONG, pos, ch))
10298 end
10399 elseif ch < 0xf0
104100 # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
105- (pos + 2 >= endpos) && throw (UnicodeError (UTF_ERR_SHORT, pos, ch))
101+ (pos + 1 > endpos) && throw (UnicodeError (UTF_ERR_SHORT, pos, ch))
106102 byt, pos = next (dat, pos)
107103 ch = get_continuation (ch & 0x0f , byt, pos)
108104 byt, pos = next (dat, pos)
@@ -111,28 +107,28 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
111107 if is_surrogate_codeunit (ch)
112108 ! is_surrogate_lead (ch) && throw (UnicodeError (UTF_ERR_NOT_LEAD, pos- 2 , ch))
113109 # next character *must* be a trailing surrogate character
114- (pos + 3 >= endpos) && throw (UnicodeError (UTF_ERR_MISSING_SURROGATE, pos- 2 , ch))
110+ (pos + 2 > endpos) && throw (UnicodeError (UTF_ERR_MISSING_SURROGATE, pos- 2 , ch))
115111 byt, pos = next (dat, pos)
116112 (byt != 0xed ) && throw (UnicodeError (UTF_ERR_NOT_TRAIL, pos, byt))
117113 byt, pos = next (dat, pos)
118114 surr = get_continuation (0x0000d , byt, pos)
119115 byt, pos = next (dat, pos)
120116 surr = get_continuation (surr, byt, pos)
121117 ! is_surrogate_trail (surr) && throw (UnicodeError (UTF_ERR_NOT_TRAIL, pos- 2 , surr))
122- ! AcceptSurrogates && throw (UnicodeError (UTF_ERR_SURROGATE, pos- 2 , surr))
118+ ! accept_surrogates && throw (UnicodeError (UTF_ERR_SURROGATE, pos- 2 , surr))
123119 flags |= UTF_SURROGATE
124120 num4byte += 1
125121 elseif ch > 0x07ff
126122 num3byte += 1
127- elseif AcceptAnyLong
123+ elseif accept_long_char
128124 flags |= UTF_LONG
129125 num2byte += 1
130126 else
131127 throw (UnicodeError (UTF_ERR_LONG, pos- 2 , ch))
132128 end
133129 elseif ch < 0xf5
134130 # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
135- (pos + 3 >= endpos) && throw (UnicodeError (UTF_ERR_SHORT, pos, ch))
131+ (pos + 2 > endpos) && throw (UnicodeError (UTF_ERR_SHORT, pos, ch))
136132 byt, pos = next (dat, pos)
137133 ch = get_continuation (ch & 0x07 , byt, pos)
138134 byt, pos = next (dat, pos)
@@ -145,7 +141,7 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
145141 num4byte += 1
146142 elseif is_surrogate_codeunit (ch)
147143 throw (UnicodeError (UTF_ERR_SURROGATE, pos- 3 , ch))
148- elseif AcceptAnyLong
144+ elseif accept_long_char
149145 # This is an overly long encoded character
150146 flags |= UTF_LONG
151147 if ch > 0x7ff
@@ -178,7 +174,7 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
178174 ! is_surrogate_trail (ch) && throw (UnicodeError (UTF_ERR_NOT_TRAIL, pos, ch))
179175 num4byte += 1
180176 if E ! <: UTF16
181- ! AcceptSurrogates && throw (UnicodeError (UTF_ERR_SURROGATE, pos, ch))
177+ ! accept_surrogates && throw (UnicodeError (UTF_ERR_SURROGATE, pos, ch))
182178 flags |= UTF_SURROGATE
183179 end
184180 else
0 commit comments