11use encoding_rs:: { Decoder , Encoder , Encoding } ;
22use std:: fmt;
3- use std:: io:: { self , BufRead , BufReader } ;
3+ use std:: io;
44
55use crate :: util:: ConsumeBuf ;
66
77const MAX_OUTPUT : usize = 4096 ;
88
99/// Charset transcoder
1010pub ( crate ) struct CharCodec < R > {
11- reader : BufReader < R > ,
11+ reader : R ,
12+ input_buf : ConsumeBuf ,
1213 dec : Option < Decoder > ,
1314 enc : Option < Encoder > ,
14- buf : ConsumeBuf ,
15+ output_buf : ConsumeBuf ,
1516 reached_end : bool ,
1617}
1718
@@ -21,58 +22,51 @@ where
2122{
2223 pub fn new ( reader : R , from : & ' static Encoding , to : & ' static Encoding ) -> Self {
2324 CharCodec {
24- reader : BufReader :: new ( reader) ,
25+ reader,
26+ input_buf : ConsumeBuf :: new ( 8192 ) ,
2527 dec : Some ( from. new_decoder ( ) ) ,
2628 enc : if to == encoding_rs:: UTF_8 {
2729 None
2830 } else {
2931 Some ( to. new_encoder ( ) )
3032 } ,
31- buf : ConsumeBuf :: new ( MAX_OUTPUT ) ,
33+ output_buf : ConsumeBuf :: new ( MAX_OUTPUT ) ,
3234 reached_end : false ,
3335 }
3436 }
3537}
3638
3739impl < R : io:: Read > io:: Read for CharCodec < R > {
3840 fn read ( & mut self , buf : & mut [ u8 ] ) -> io:: Result < usize > {
39- if self . reached_end && self . buf . unconsumed ( ) . is_empty ( ) {
41+ if self . reached_end && self . output_buf . unconsumed ( ) . is_empty ( ) {
4042 return Ok ( 0 ) ;
4143 }
4244
43- let input = ' read: {
44- if self . buf . unconsumed ( ) . len ( ) > MAX_OUTPUT / 4 {
45- // Do not keep filling if we have unused output.
46- break ' read self . reader . buffer ( ) ;
47- }
48-
49- let tmp = self . reader . fill_buf ( ) ?;
50- let tmp_len = tmp. len ( ) ;
51- if tmp_len >= 4 {
52- // We need some minimum input to make progress.
53- break ' read tmp;
54- }
55-
56- let tmp2 = self . reader . fill_buf ( ) ?;
57- if tmp2. len ( ) == tmp_len {
58- // Made no progress. That means we reached the end.
45+ // Ensure we have at least 4 bytes of input to decode, or we've reached EOF
46+ while self . input_buf . unconsumed ( ) . len ( ) < 4 && !self . reached_end {
47+ let free = self . input_buf . free_mut ( ) ;
48+ let n = self . reader . read ( free) ?;
49+ if n == 0 {
50+ // Reached EOF
5951 self . reached_end = true ;
52+ break ;
6053 }
54+ self . input_buf . add_filled ( n) ;
55+ }
6156
62- tmp2
63- } ;
57+ let input = self . input_buf . unconsumed ( ) ;
6458
65- if self . buf . free_mut ( ) . len ( ) < 4 {
66- self . buf . add_space ( 1024 ) ;
59+ if self . output_buf . free_mut ( ) . len ( ) < 4 {
60+ self . output_buf . add_space ( 1024 ) ;
6761 }
68- let output = self . buf . free_mut ( ) ;
62+ let output = self . output_buf . free_mut ( ) ;
6963
7064 if let Some ( dec) = & mut self . dec {
7165 let ( _, input_used, output_used, _had_errors) =
7266 dec. decode_to_utf8 ( input, output, self . reached_end ) ;
7367
74- self . reader . consume ( input_used) ;
75- self . buf . add_filled ( output_used) ;
68+ self . input_buf . consume ( input_used) ;
69+ self . output_buf . add_filled ( output_used) ;
7670
7771 if self . reached_end {
7872 // Can't be used again
@@ -81,13 +75,13 @@ impl<R: io::Read> io::Read for CharCodec<R> {
8175 }
8276
8377 // guaranteed to be on a char boundary by encoding_rs
84- let bytes = self . buf . unconsumed ( ) ;
78+ let bytes = self . output_buf . unconsumed ( ) ;
8579
8680 let amount = if let Some ( enc) = & mut self . enc {
8781 // unwrap is ok because it is on a char boundary, and non-utf8 chars have been replaced
8882 let utf8 = std:: str:: from_utf8 ( bytes) . unwrap ( ) ;
8983 let ( _, input_used, output_used, _) = enc. encode_from_utf8 ( utf8, buf, self . reached_end ) ;
90- self . buf . consume ( input_used) ;
84+ self . output_buf . consume ( input_used) ;
9185
9286 if self . reached_end {
9387 // Can't be used again
@@ -99,7 +93,7 @@ impl<R: io::Read> io::Read for CharCodec<R> {
9993 // No encoder, we want utf8
10094 let max = bytes. len ( ) . min ( buf. len ( ) ) ;
10195 buf[ ..max] . copy_from_slice ( & bytes[ ..max] ) ;
102- self . buf . consume ( max) ;
96+ self . output_buf . consume ( max) ;
10397 max
10498 } ;
10599
@@ -150,4 +144,51 @@ mod test {
150144 . unwrap ( ) ;
151145 assert_eq ! ( res. status( ) , 302 ) ;
152146 }
147+
/// Transcodes long runs of characters whose UTF-8 form is wider than one byte
/// and checks that `CharCodec` yields the same text as a one-shot
/// `encoding_rs` decode of the same input.
#[test]
fn multibyte_chars() {
    const CHAR_COUNT: usize = 8193;

    let cases: &[(&[u8], _, _)] = &[
        // ž
        // in utf-8: 0xC5 0xBE
        // CharCodec stops at 16384 bytes
        (
            &[0x01, 0x7E],
            encoding_rs::UTF_16BE,
            "2B utf-16be chars -> 2B utf-8",
        ),
        (
            &[0xB8],
            encoding_rs::ISO_8859_15,
            "1B iso-8859-15 chars -> 2B utf-8",
        ),
        // ‽
        // in utf-8: 0xE2 0x80 0xBD
        // CharCodec stops at 24576 bytes
        (
            &[0x20, 0x3D],
            encoding_rs::UTF_16BE,
            "2B utf-16be chars -> 3B utf-8",
        ),
    ];

    for (char_bytes, from_encoding, case_name) in cases {
        let source_bytes = char_bytes.repeat(CHAR_COUNT);

        let encoding_rs_result = from_encoding.decode(&source_bytes).0;
        let char_codec_result = io::read_to_string(CharCodec::new(
            source_bytes.as_slice(),
            from_encoding,
            encoding_rs::UTF_8,
        ))
        .unwrap();

        // Compare lengths first so a truncated result is reported as two
        // numbers instead of two multi-kilobyte strings.
        assert_eq!(
            char_codec_result.len(),
            encoding_rs_result.len(),
            "{CHAR_COUNT} * {case_name}",
        );

        // Equal length does not imply equal content; `assert!` is used here so
        // a failure does not dump both strings.
        assert!(
            char_codec_result.as_str() == &*encoding_rs_result,
            "{CHAR_COUNT} * {case_name}: decoded content differs",
        );
    }
}
153194}
0 commit comments