Skip to content

Commit 0a4b032

Browse files
committed
Fix CharCodec problem reading short
1 parent 561e814 commit 0a4b032

File tree

1 file changed

+73
-32
lines changed

1 file changed

+73
-32
lines changed

src/body/charset.rs

Lines changed: 73 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
use encoding_rs::{Decoder, Encoder, Encoding};
22
use std::fmt;
3-
use std::io::{self, BufRead, BufReader};
3+
use std::io;
44

55
use crate::util::ConsumeBuf;
66

77
const MAX_OUTPUT: usize = 4096;
88

99
/// Charset transcoder
1010
pub(crate) struct CharCodec<R> {
11-
reader: BufReader<R>,
11+
reader: R,
12+
input_buf: ConsumeBuf,
1213
dec: Option<Decoder>,
1314
enc: Option<Encoder>,
14-
buf: ConsumeBuf,
15+
output_buf: ConsumeBuf,
1516
reached_end: bool,
1617
}
1718

@@ -21,58 +22,51 @@ where
2122
{
2223
pub fn new(reader: R, from: &'static Encoding, to: &'static Encoding) -> Self {
2324
CharCodec {
24-
reader: BufReader::new(reader),
25+
reader,
26+
input_buf: ConsumeBuf::new(8192),
2527
dec: Some(from.new_decoder()),
2628
enc: if to == encoding_rs::UTF_8 {
2729
None
2830
} else {
2931
Some(to.new_encoder())
3032
},
31-
buf: ConsumeBuf::new(MAX_OUTPUT),
33+
output_buf: ConsumeBuf::new(MAX_OUTPUT),
3234
reached_end: false,
3335
}
3436
}
3537
}
3638

3739
impl<R: io::Read> io::Read for CharCodec<R> {
3840
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
39-
if self.reached_end && self.buf.unconsumed().is_empty() {
41+
if self.reached_end && self.output_buf.unconsumed().is_empty() {
4042
return Ok(0);
4143
}
4244

43-
let input = 'read: {
44-
if self.buf.unconsumed().len() > MAX_OUTPUT / 4 {
45-
// Do not keep filling if we have unused output.
46-
break 'read self.reader.buffer();
47-
}
48-
49-
let tmp = self.reader.fill_buf()?;
50-
let tmp_len = tmp.len();
51-
if tmp_len >= 4 {
52-
// We need some minimum input to make progress.
53-
break 'read tmp;
54-
}
55-
56-
let tmp2 = self.reader.fill_buf()?;
57-
if tmp2.len() == tmp_len {
58-
// Made no progress. That means we reached the end.
45+
// Ensure we have at least 4 bytes of input to decode, or we've reached EOF
46+
while self.input_buf.unconsumed().len() < 4 && !self.reached_end {
47+
let free = self.input_buf.free_mut();
48+
let n = self.reader.read(free)?;
49+
if n == 0 {
50+
// Reached EOF
5951
self.reached_end = true;
52+
break;
6053
}
54+
self.input_buf.add_filled(n);
55+
}
6156

62-
tmp2
63-
};
57+
let input = self.input_buf.unconsumed();
6458

65-
if self.buf.free_mut().len() < 4 {
66-
self.buf.add_space(1024);
59+
if self.output_buf.free_mut().len() < 4 {
60+
self.output_buf.add_space(1024);
6761
}
68-
let output = self.buf.free_mut();
62+
let output = self.output_buf.free_mut();
6963

7064
if let Some(dec) = &mut self.dec {
7165
let (_, input_used, output_used, _had_errors) =
7266
dec.decode_to_utf8(input, output, self.reached_end);
7367

74-
self.reader.consume(input_used);
75-
self.buf.add_filled(output_used);
68+
self.input_buf.consume(input_used);
69+
self.output_buf.add_filled(output_used);
7670

7771
if self.reached_end {
7872
// Can't be used again
@@ -81,13 +75,13 @@ impl<R: io::Read> io::Read for CharCodec<R> {
8175
}
8276

8377
// guaranteed to be on a char boundary by encoding_rs
84-
let bytes = self.buf.unconsumed();
78+
let bytes = self.output_buf.unconsumed();
8579

8680
let amount = if let Some(enc) = &mut self.enc {
8781
// unwrap is ok because it is on a char boundary, and non-utf8 chars have been replaced
8882
let utf8 = std::str::from_utf8(bytes).unwrap();
8983
let (_, input_used, output_used, _) = enc.encode_from_utf8(utf8, buf, self.reached_end);
90-
self.buf.consume(input_used);
84+
self.output_buf.consume(input_used);
9185

9286
if self.reached_end {
9387
// Can't be used again
@@ -99,7 +93,7 @@ impl<R: io::Read> io::Read for CharCodec<R> {
9993
// No encoder, we want utf8
10094
let max = bytes.len().min(buf.len());
10195
buf[..max].copy_from_slice(&bytes[..max]);
102-
self.buf.consume(max);
96+
self.output_buf.consume(max);
10397
max
10498
};
10599

@@ -150,4 +144,51 @@ mod test {
150144
.unwrap();
151145
assert_eq!(res.status(), 302);
152146
}
147+
148+
#[test]
149+
fn multibyte_chars() {
150+
const CHAR_COUNT: usize = 8193;
151+
152+
let cases: &[(&[u8], _, _)] = &[
153+
// ž
154+
// in utf-8: 0xC5 0xBE
155+
// Before this fix, CharCodec stopped after 16384 bytes of UTF-8 output
156+
(
157+
&[0x01, 0x7E],
158+
encoding_rs::UTF_16BE,
159+
"2B utf-16be chars -> 2B utf-8",
160+
),
161+
(
162+
&[0xB8],
163+
encoding_rs::ISO_8859_15,
164+
"1B iso-8859-15 chars -> 2B utf-8",
165+
),
166+
// ‽
167+
// in utf-8: 0xE2 0x80 0xBD
168+
// Before this fix, CharCodec stopped after 24576 bytes of UTF-8 output
169+
(
170+
&[0x20, 0x3D],
171+
encoding_rs::UTF_16BE,
172+
"2B utf-16be chars -> 3B utf-8",
173+
),
174+
];
175+
176+
for (char_bytes, from_encoding, case_name) in cases {
177+
let source_bytes = char_bytes.repeat(CHAR_COUNT);
178+
179+
let encoding_rs_result = from_encoding.decode(&source_bytes).0;
180+
let char_codec_result = io::read_to_string(CharCodec::new(
181+
source_bytes.as_slice(),
182+
from_encoding,
183+
encoding_rs::UTF_8,
184+
))
185+
.unwrap();
186+
187+
assert_eq!(
188+
char_codec_result.len(),
189+
encoding_rs_result.len(),
190+
"{CHAR_COUNT} * {case_name}",
191+
);
192+
}
193+
}
153194
}

0 commit comments

Comments
 (0)