1212 * GNU Lesser General Public License for more details.
1313 *)
1414exception UCS_value_out_of_range
15+
1516exception UCS_value_prohibited_in_UTF8
17+
1618exception UCS_value_prohibited_in_XML
19+
1720exception UTF8_character_incomplete
21+
1822exception UTF8_header_byte_invalid
23+
1924exception UTF8_continuation_byte_invalid
25+
2026exception UTF8_encoding_not_canonical
27+
2128exception String_incomplete
2229
2330(* === Utility Functions === *)
2431
2532let ( +++ ) = Int32. add
33+
2634let ( --- ) = Int32. sub
35+
2736let ( &&& ) = Int32. logand
37+
2838let ( ||| ) = Int32. logor
39+
2940let ( <<< ) = Int32. shift_left
41+
3042let ( >>> ) = Int32. shift_right_logical
3143
3244(* === Unicode Functions === *)
3345
3446module UCS = struct
35-
3647 let min_value = 0x000000l
37- let max_value = 0x1fffffl
3848
39- let is_non_character value = false
40- || (0xfdd0l < = value && value < = 0xfdefl ) (* case 1 *)
41- || (Int32. logand 0xfffel value = 0xfffel ) (* case 2 *)
49+ let max_value = 0x1fffffl
4250
43- let is_out_of_range value =
44- value < min_value || value > max_value
51+ let is_non_character value =
52+ false
53+ || (0xfdd0l < = value && value < = 0xfdefl ) (* case 1 *)
54+ || Int32. logand 0xfffel value = 0xfffel
55+ (* case 2 *)
4556
46- let is_surrogate value =
47- (0xd800l < = value && value < = 0xdfffl )
57+ let is_out_of_range value = value < min_value || value > max_value
4858
59+ let is_surrogate value = 0xd800l < = value && value < = 0xdfffl
4960end
5061
5162module XML = struct
52-
53- let is_forbidden_control_character value = value < 0x20l
54- && value <> 0x09l
55- && value <> 0x0al
56- && value <> 0x0dl
57-
63+ let is_forbidden_control_character value =
64+ value < 0x20l && value <> 0x09l && value <> 0x0al && value <> 0x0dl
5865end
5966
6067(* === UCS Validators === *)
6168
6269module type UCS_VALIDATOR = sig
63-
6470 val validate : int32 -> unit
65-
6671end
6772
6873module UTF8_UCS_validator : UCS_VALIDATOR = struct
69-
7074 let validate value =
71- if UCS. is_out_of_range value then raise UCS_value_out_of_range ;
72- if UCS. is_non_character value then raise UCS_value_prohibited_in_UTF8 ;
73- if UCS. is_surrogate value then raise UCS_value_prohibited_in_UTF8
74-
75+ if UCS. is_out_of_range value then raise UCS_value_out_of_range ;
76+ if UCS. is_non_character value then raise UCS_value_prohibited_in_UTF8 ;
77+ if UCS. is_surrogate value then raise UCS_value_prohibited_in_UTF8
7578end
7679
7780module XML_UTF8_UCS_validator : UCS_VALIDATOR = struct
78-
7981 let validate value =
80- UTF8_UCS_validator. validate value;
81- if XML. is_forbidden_control_character value
82- then raise UCS_value_prohibited_in_XML
83-
82+ UTF8_UCS_validator. validate value ;
83+ if XML. is_forbidden_control_character value then
84+ raise UCS_value_prohibited_in_XML
8485end
8586
8687(* ==== Character Codecs ==== *)
@@ -95,107 +96,129 @@ end
9596
9697module UTF8_CODEC (UCS_validator : UCS_VALIDATOR ) = struct
9798 let width_required_for_ucs_value value =
98- if value < 0x000080l (* 1 lsl 7 *) then 1 else
99- if value < 0x000800l (* 1 lsl 11 *) then 2 else
100- if value < 0x010000l (* 1 lsl 16 *) then 3 else 4
99+ if value < 0x000080l (* 1 lsl 7 *) then
100+ 1
101+ else if value < 0x000800l (* 1 lsl 11 *) then
102+ 2
103+ else if value < 0x010000l (* 1 lsl 16 *) then
104+ 3
105+ else
106+ 4
101107
102108 (* === Decoding === *)
103109
104110 let decode_header_byte byte =
105- if byte land 0b10000000 = 0b00000000 then (byte , 1 ) else
106- if byte land 0b11100000 = 0b11000000 then (byte land 0b0011111 , 2 ) else
107- if byte land 0b11110000 = 0b11100000 then (byte land 0b0001111 , 3 ) else
108- if byte land 0b11111000 = 0b11110000 then (byte land 0b0000111 , 4 ) else
111+ if byte land 0b10000000 = 0b00000000 then
112+ (byte, 1 )
113+ else if byte land 0b11100000 = 0b11000000 then
114+ (byte land 0b0011111 , 2 )
115+ else if byte land 0b11110000 = 0b11100000 then
116+ (byte land 0b0001111 , 3 )
117+ else if byte land 0b11111000 = 0b11110000 then
118+ (byte land 0b0000111 , 4 )
119+ else
109120 raise UTF8_header_byte_invalid
110121
111122 let decode_continuation_byte byte =
112- if byte land 0b11000000 = 0b10000000 then byte land 0b00111111 else
123+ if byte land 0b11000000 = 0b10000000 then
124+ byte land 0b00111111
125+ else
113126 raise UTF8_continuation_byte_invalid
114127
115128 let decode_character string index =
116129 let value, width = decode_header_byte (Char. code string .[index]) in
117- let value = if width = 1 then (Int32. of_int value)
118- else begin
130+ let value =
131+ if width = 1 then
132+ Int32. of_int value
133+ else
119134 let value = ref (Int32. of_int value) in
120135 for index = index + 1 to index + width - 1 do
121136 let chunk = decode_continuation_byte (Char. code string .[index]) in
122- value := ( ! value <<< 6 ) ||| ( Int32. of_int chunk)
123- done ;
124- if width > ( width_required_for_ucs_value ! value)
125- then raise UTF8_encoding_not_canonical ;
137+ value := ! value <<< 6 ||| Int32. of_int chunk
138+ done ;
139+ if width > width_required_for_ucs_value ! value then
140+ raise UTF8_encoding_not_canonical ;
126141 ! value
127- end in
128- UCS_validator. validate value;
142+ in
143+ UCS_validator. validate value ;
129144 (value, width)
130145
131146 (* === Encoding === *)
132147
133148 let encode_header_byte width value =
134149 match width with
135- | 1 -> value
136- | 2 -> value ||| 0b11000000l
137- | 3 -> value ||| 0b11100000l
138- | 4 -> value ||| 0b11110000l
139- | _ -> raise UCS_value_out_of_range
150+ | 1 ->
151+ value
152+ | 2 ->
153+ value ||| 0b11000000l
154+ | 3 ->
155+ value ||| 0b11100000l
156+ | 4 ->
157+ value ||| 0b11110000l
158+ | _ ->
159+ raise UCS_value_out_of_range
140160
141161 let encode_continuation_byte value =
142- (( value &&& 0b00111111l ) ||| 0b10000000l , value >>> 6 )
162+ (value &&& 0b00111111l ||| 0b10000000l , value >>> 6 )
143163
144164 let encode_character value =
145- UCS_validator. validate value;
165+ UCS_validator. validate value ;
146166 let width = width_required_for_ucs_value value in
147167 let b = Bytes. make width ' ' in
148168 (* Start by encoding the continuation bytes in reverse order. *)
149169 let rec encode_continuation_bytes remainder index =
150- if index = 0 then remainder else
170+ if index = 0 then
171+ remainder
172+ else
151173 let byte, remainder = encode_continuation_byte remainder in
152- Bytes. set b index @@ Char. chr (Int32. to_int byte);
153- encode_continuation_bytes remainder (index - 1 ) in
174+ Bytes. set b index @@ Char. chr (Int32. to_int byte) ;
175+ encode_continuation_bytes remainder (index - 1 )
176+ in
154177 let remainder = encode_continuation_bytes value (width - 1 ) in
155178 (* Finish by encoding the header byte. *)
156179 let byte = encode_header_byte width remainder in
157- Bytes. set b 0 @@ Char. chr (Int32. to_int byte);
180+ Bytes. set b 0 @@ Char. chr (Int32. to_int byte) ;
158181 Bytes. unsafe_to_string b
159-
160182end
161183
162- module UTF8_codec = UTF8_CODEC ( UTF8_UCS_validator )
184+ module UTF8_codec = UTF8_CODEC (UTF8_UCS_validator )
163185module XML_UTF8_codec = UTF8_CODEC (XML_UTF8_UCS_validator )
164186
165187(* === String Validators === *)
166188
167189module type STRING_VALIDATOR = sig
168-
169190 val is_valid : string -> bool
191+
170192 val validate : string -> unit
171- val longest_valid_prefix : string -> string
172193
194+ val longest_valid_prefix : string -> string
173195end
174196
175197exception Validation_error of int * exn
176198
177- module String_validator (Decoder : CHARACTER_DECODER ) : STRING_VALIDATOR = struct
178-
199+ module String_validator (Decoder : CHARACTER_DECODER ) : STRING_VALIDATOR =
200+ struct
179201 let validate string =
180202 let index = ref 0 and length = String. length string in
181- begin try
203+ ( try
182204 while ! index < length do
183205 let _, width = Decoder. decode_character string ! index in
184206 index := ! index + width
185- done ;
207+ done
186208 with
187- | Invalid_argument _ -> raise String_incomplete
188- | error -> raise (Validation_error (! index, error))
189- end ; assert (! index = length)
209+ | Invalid_argument _ ->
210+ raise String_incomplete
211+ | error ->
212+ raise (Validation_error (! index, error))
213+ ) ;
214+ assert (! index = length)
190215
191- let is_valid string =
192- try validate string ; true with _ -> false
216+ let is_valid string = try validate string ; true with _ -> false
193217
194218 let longest_valid_prefix string =
195- try validate string ; string
219+ try validate string ; string
196220 with Validation_error (index , _ ) -> String. sub string 0 index
197-
198221end
199222
200- module UTF8 = String_validator ( UTF8_codec )
223+ module UTF8 = String_validator (UTF8_codec )
201224module UTF8_XML = String_validator (XML_UTF8_codec )
0 commit comments