Updates for keywords

ScottPJones · ScottPJones · commit 3c83c177245e · 2015-06-09T04:29:19.000+02:00
diff --git a/src/CheckStrings.jl b/src/CheckStrings.jl
@@ -19,11 +19,7 @@ is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00)
 is_surrogate_codeunit(c::Unsigned) = ((c & ~0x007ff) == 0xd800)
 is_valid_continuation(c) = ((c & 0xc0) == 0x80)
 
-## Options for check_string_* functions
-
-const UTF_NO_LONG_NULL = 1      ##< don't accept 0xc0 0x80 for '\0'
-const UTF_NO_SURROGATES = 2     ##< don't accept surrogate pairs in UTF-8/UTF-32
-const UTF_ACCEPT_LONG = 4       ##< accept long encodings (other than long null in UTF-8)
+## Return flags for check_string function
 
 const UTF_LONG = 1              ##< Long encodings are present
 const UTF_LATIN1 = 2            ##< characters in range 0x80-0xFF present
@@ -39,10 +35,10 @@ const UTF_SURROGATE = 32        ##< surrogate pairs present
 end
 
 CodeUnitType = Union(UInt8, UInt16, UInt32, Char)
-CodeUnitC = Union(Vector{Char}, AbstractVector{Char}, AbstractArray{Char})
-CodeUnit8 = Union(Vector{UInt8}, AbstractVector{UInt8}, AbstractArray{UInt8})
-CodeUnit16 = Union(Vector{UInt16}, AbstractVector{UInt16}, AbstractArray{UInt16})
-CodeUnit32 = Union(Vector{UInt32}, AbstractVector{UInt32}, AbstractArray{UInt32})
+CodeUnitC = AbstractArray{Char}        ##< any array whose elements are `Char`
+CodeUnit8 = AbstractArray{UInt8}       ##< any array whose elements are `UInt8`
+CodeUnit16 = AbstractArray{UInt16}     ##< any array whose elements are `UInt16`
+CodeUnit32 = AbstractArray{UInt32}     ##< any array whose elements are `UInt32`
 CodeUnit = Union(CodeUnit8, CodeUnit16, CodeUnit32, CodeUnitC, AbstractString)
 
 "
@@ -57,9 +53,9 @@ Validates and calculates number of characters in a UTF-8 encoded vector of `UInt
 * `pos`    start position (defaults to `start(dat)`)
 
 ### Keyword Arguments:
-* `accept_long_null` = `true` # Modified UTF-8
-* `accept_surrogates` = `true` # CESU-8
-* `accept_long` = `false`
+* `accept_long_null`  = `true`  # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`)
+* `accept_surrogates` = `true`  # `CESU-8`
+* `accept_long_char`  = `false` # Accept overly long encodings of any character
 
 ### Returns:
 *   (total characters, flags, 4-byte, 3-byte, 2-byte)
@@ -71,11 +67,11 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
                       ::Type{E},
                       dat::T,
                       endpos = endof(dat),
-                      pos = (T <: AbstractString) ? start(dat) : 1
+                      pos = start(dat)
                       ;
-                      accept_long_null = true,
+                      accept_long_null  = true,
                       accept_surrogates = true,
-                      accept_long = false)
+                      accept_long_char  = false)
     local byt::UInt8, ch::UInt32, surr::UInt32
     flags::UInt = 0
     totalchar = num2byte = num3byte = num4byte = 0
@@ -87,22 +83,22 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
                 # Check UTF-8 encoding
                 if ch < 0xe0
                     # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
-                    (pos >= endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                    (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                     byt, pos = next(dat, pos)
                     ch = get_continuation(ch & 0x3f, byt, pos)
                     if ch > 0x7f
                         num2byte += 1
                         flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
-                    elseif AcceptAnyLong
+                    elseif accept_long_char
                         flags |= UTF_LONG
-                    elseif (ch == 0) && AcceptLongNull
+                    elseif (ch == 0) && accept_long_null
                         flags |= UTF_LONG
                     else
                         throw(UnicodeError(UTF_ERR_LONG, pos, ch))
                     end
                 elseif ch < 0xf0
                     # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
-                    (pos + 2 >= endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                    (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                     byt, pos = next(dat, pos)
                     ch = get_continuation(ch & 0x0f, byt, pos)
                     byt, pos = next(dat, pos)
@@ -111,28 +107,28 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
                     if is_surrogate_codeunit(ch)
                         !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
                         # next character *must* be a trailing surrogate character
-                        (pos + 3 >= endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
+                        (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
                         byt, pos = next(dat, pos)
                         (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
                         byt, pos = next(dat, pos)
                         surr = get_continuation(0x0000d, byt, pos)
                         byt, pos = next(dat, pos)
                         surr = get_continuation(surr, byt, pos)
                         !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
-                        !AcceptSurrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
+                        !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
                         flags |= UTF_SURROGATE
                         num4byte += 1
                     elseif ch > 0x07ff
                         num3byte += 1
-                    elseif AcceptAnyLong
+                    elseif accept_long_char
                         flags |= UTF_LONG
                         num2byte += 1
                     else
                         throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                     end
                 elseif ch < 0xf5
                     # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
-                    (pos + 3 >= endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                    (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                     byt, pos = next(dat, pos)
                     ch = get_continuation(ch & 0x07, byt, pos)
                     byt, pos = next(dat, pos)
@@ -145,7 +141,7 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
                         num4byte += 1
                     elseif is_surrogate_codeunit(ch)
                         throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
-                    elseif AcceptAnyLong
+                    elseif accept_long_char
                         # This is an overly long encoded character
                         flags |= UTF_LONG
                         if ch > 0x7ff
@@ -178,7 +174,7 @@ function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)} (
                 !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
                 num4byte += 1
                 if E !<: UTF16
-                    !AcceptSurrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
+                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
                     flags |= UTF_SURROGATE
                 end
             else