Skip to content

Commit 1820442

Browse files
committed
Add Utf8Error::resume_from, to help incremental and/or lossy decoding.
Without this, code outside of the standard library needs to reimplement most of the logic of `from_utf8` to interpret the bytes after `valid_up_to()`.
1 parent fd182c4 commit 1820442

File tree

3 files changed

+87
-22
lines changed

3 files changed

+87
-22
lines changed

src/libcollectionstest/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#![feature(test)]
2929
#![feature(unboxed_closures)]
3030
#![feature(unicode)]
31+
#![feature(utf8_error_resume_from)]
3132

3233
extern crate collections;
3334
extern crate test;

src/libcollectionstest/str.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,36 @@ fn from_utf8_mostly_ascii() {
540540
}
541541
}
542542

543+
// Exercises `Utf8Error::valid_up_to()` and `Utf8Error::resume_from()` on malformed input.
// Every input begins with the four valid bytes `A\xC3\xA9 `, so `valid_up_to()` is always 4;
// the cases differ only in the bytes that follow. `resume_from()` is expected to be `None`
// when the input ends part-way through a multi-byte sequence, and `Some(index)` when an
// unexpected byte is found.
#[test]
544+
fn from_utf8_error() {
545+
// test!(input, expected `valid_up_to()`, expected `resume_from()`):
// decoding `input` must fail, and the resulting error must report both expected values.
macro_rules! test {
546+
($input: expr, $expected_valid_up_to: expr, $expected_resume_from: expr) => {
547+
let error = from_utf8($input).unwrap_err();
548+
assert_eq!(error.valid_up_to(), $expected_valid_up_to);
549+
assert_eq!(error.resume_from(), $expected_resume_from);
550+
}
551+
}
552+
test!(b"A\xC3\xA9 \xFF ", 4, Some(5));
553+
test!(b"A\xC3\xA9 \x80 ", 4, Some(5));
554+
test!(b"A\xC3\xA9 \xC1 ", 4, Some(5));
555+
test!(b"A\xC3\xA9 \xC1", 4, Some(5));
556+
test!(b"A\xC3\xA9 \xC2", 4, None);
557+
test!(b"A\xC3\xA9 \xC2 ", 4, Some(5));
558+
test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(5));
559+
test!(b"A\xC3\xA9 \xE0", 4, None);
560+
test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(5));
561+
test!(b"A\xC3\xA9 \xE0\xA0", 4, None);
562+
test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(6));
563+
test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(6));
564+
test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(5));
565+
test!(b"A\xC3\xA9 \xF1", 4, None);
566+
test!(b"A\xC3\xA9 \xF1\x80", 4, None);
567+
test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None);
568+
test!(b"A\xC3\xA9 \xF1 ", 4, Some(5));
569+
test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(6));
570+
test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(7));
571+
}
572+
543573
#[test]
544574
fn test_as_bytes() {
545575
// no null

src/libcore/str/mod.rs

Lines changed: 56 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -125,13 +125,14 @@ Section: Creating a string
125125
#[stable(feature = "rust1", since = "1.0.0")]
126126
pub struct Utf8Error {
127127
// Index in the input up to which valid UTF-8 was verified.
valid_up_to: usize,
128+
// Length in bytes of the invalid sequence that starts at `valid_up_to`,
// or `None` when the input ended before the sequence was complete.
invalid_length: Option<u8>,
128129
}
129130

130131
impl Utf8Error {
131132
/// Returns the index in the given string up to which valid UTF-8 was
132133
/// verified.
133134
///
134-
/// It is the maximum index such that `from_utf8(input[..index])`
135+
/// It is the maximum index such that `from_utf8(&input[..index])`
135136
/// would return `Ok(_)`.
136137
///
137138
/// # Examples
@@ -152,6 +153,21 @@ impl Utf8Error {
152153
/// ```
153154
#[stable(feature = "utf8_error", since = "1.5.0")]
154155
pub fn valid_up_to(&self) -> usize { self.valid_up_to }
156+
157+
/// Provides more information about the failure:
158+
///
159+
/// * `None`: the end of the input was reached unexpectedly.
160+
/// `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
161+
/// If a byte stream (such as a file or a network socket) is being decoded incrementally,
162+
/// this could be a valid `char` whose UTF-8 byte sequence spans multiple chunks.
163+
///
164+
/// * `Some(index)`: an unexpected byte was encountered.
165+
/// The index provided is where decoding should resume
166+
/// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
/// Like `self.valid_up_to()`, it is an offset from the start of the input.
167+
#[unstable(feature = "utf8_error_resume_from", reason ="new", issue = "0")]
168+
pub fn resume_from(&self) -> Option<usize> {
169+
// Skip past the invalid sequence that starts at `valid_up_to`.
self.invalid_length.map(|l| self.valid_up_to + l as usize)
170+
}
155171
}
156172

157173
/// Converts a slice of bytes to a string slice.
@@ -300,7 +316,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
300316
// Human-readable description of the error. The message distinguishes an invalid byte
// sequence from input that ended in the middle of a sequence.
#[stable(feature = "rust1", since = "1.0.0")]
301317
impl fmt::Display for Utf8Error {
302318
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
303-
write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to)
319+
// A recorded length means an unexpected byte was found; `None` means the input
// ended before the sequence starting at `valid_up_to` was complete.
if let Some(invalid_length) = self.invalid_length {
320+
write!(f, "invalid utf-8 sequence of {} bytes from index {}",
321+
invalid_length, self.valid_up_to)
322+
} else {
323+
write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to)
324+
}
304325
}
305326
}
}
306327

@@ -1241,25 +1262,27 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
12411262

12421263
while index < len {
12431264
let old_offset = index;
1244-
macro_rules! err { () => {{
1245-
return Err(Utf8Error {
1246-
valid_up_to: old_offset
1247-
})
1248-
}}}
1265+
macro_rules! err {
1266+
($invalid_length: expr) => {
1267+
return Err(Utf8Error {
1268+
valid_up_to: old_offset,
1269+
invalid_length: $invalid_length,
1270+
})
1271+
}
1272+
}
12491273

12501274
macro_rules! next { () => {{
12511275
index += 1;
12521276
// we needed data, but there was none: error!
12531277
if index >= len {
1254-
err!()
1278+
err!(None)
12551279
}
12561280
v[index]
12571281
}}}
12581282

12591283
let first = v[index];
12601284
if first >= 128 {
12611285
let w = UTF8_CHAR_WIDTH[first as usize];
1262-
let second = next!();
12631286
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
12641287
// first C2 80 last DF BF
12651288
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
@@ -1279,25 +1302,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
12791302
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
12801303
// %xF4 %x80-8F 2( UTF8-tail )
12811304
match w {
1282-
2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
1305+
2 => if next!() & !CONT_MASK != TAG_CONT_U8 {
1306+
err!(Some(1))
1307+
},
12831308
3 => {
1284-
match (first, second, next!() & !CONT_MASK) {
1285-
(0xE0 , 0xA0 ... 0xBF, TAG_CONT_U8) |
1286-
(0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) |
1287-
(0xED , 0x80 ... 0x9F, TAG_CONT_U8) |
1288-
(0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {}
1289-
_ => err!()
1309+
match (first, next!()) {
1310+
(0xE0 , 0xA0 ... 0xBF) |
1311+
(0xE1 ... 0xEC, 0x80 ... 0xBF) |
1312+
(0xED , 0x80 ... 0x9F) |
1313+
(0xEE ... 0xEF, 0x80 ... 0xBF) => {}
1314+
_ => err!(Some(1))
1315+
}
1316+
if next!() & !CONT_MASK != TAG_CONT_U8 {
1317+
err!(Some(2))
12901318
}
12911319
}
12921320
4 => {
1293-
match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
1294-
(0xF0 , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
1295-
(0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
1296-
(0xF4 , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
1297-
_ => err!()
1321+
match (first, next!()) {
1322+
(0xF0 , 0x90 ... 0xBF) |
1323+
(0xF1 ... 0xF3, 0x80 ... 0xBF) |
1324+
(0xF4 , 0x80 ... 0x8F) => {}
1325+
_ => err!(Some(1))
1326+
}
1327+
if next!() & !CONT_MASK != TAG_CONT_U8 {
1328+
err!(Some(2))
1329+
}
1330+
if next!() & !CONT_MASK != TAG_CONT_U8 {
1331+
err!(Some(3))
12981332
}
12991333
}
1300-
_ => err!()
1334+
_ => err!(Some(1))
13011335
}
13021336
index += 1;
13031337
} else {

0 commit comments

Comments
 (0)