Skip to content

Commit ea149b8

Browse files
committed
Utf8Lossy type with chunks iterator and impl Display and Debug
1 parent 258ae6d commit ea149b8

File tree

6 files changed

+333
-99
lines changed

6 files changed

+333
-99
lines changed

src/liballoc/string.rs

+22-99
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ use core::hash;
6161
use core::iter::{FromIterator, FusedIterator};
6262
use core::ops::{self, Add, AddAssign, Index, IndexMut};
6363
use core::ptr;
64-
use core::str as core_str;
6564
use core::str::pattern::Pattern;
65+
use std_unicode::lossy;
6666
use std_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
6767

6868
use borrow::{Cow, ToOwned};
@@ -533,111 +533,34 @@ impl String {
533533
/// ```
534534
#[stable(feature = "rust1", since = "1.0.0")]
535535
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> Cow<'a, str> {
536-
let mut i;
537-
match str::from_utf8(v) {
538-
Ok(s) => return Cow::Borrowed(s),
539-
Err(e) => i = e.valid_up_to(),
540-
}
536+
let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks();
541537

542-
const TAG_CONT_U8: u8 = 128;
543-
const REPLACEMENT: &'static [u8] = b"\xEF\xBF\xBD"; // U+FFFD in UTF-8
544-
let total = v.len();
545-
fn unsafe_get(xs: &[u8], i: usize) -> u8 {
546-
unsafe { *xs.get_unchecked(i) }
547-
}
548-
fn safe_get(xs: &[u8], i: usize, total: usize) -> u8 {
549-
if i >= total { 0 } else { unsafe_get(xs, i) }
550-
}
538+
let (first_valid, first_broken) = if let Some(chunk) = iter.next() {
539+
let lossy::Utf8LossyChunk { valid, broken } = chunk;
540+
if valid.len() == v.len() {
541+
debug_assert!(broken.is_empty());
542+
return Cow::Borrowed(valid);
543+
}
544+
(valid, broken)
545+
} else {
546+
return Cow::Borrowed("");
547+
};
551548

552-
let mut res = String::with_capacity(total);
549+
const REPLACEMENT: &'static str = "\u{FFFD}";
553550

554-
if i > 0 {
555-
unsafe { res.as_mut_vec().extend_from_slice(&v[..i]) };
551+
let mut res = String::with_capacity(v.len());
552+
res.push_str(first_valid);
553+
if !first_broken.is_empty() {
554+
res.push_str(REPLACEMENT);
556555
}
557556

558-
// subseqidx is the index of the first byte of the subsequence we're
559-
// looking at. It's used to copy a bunch of contiguous good codepoints
560-
// at once instead of copying them one by one.
561-
let mut subseqidx = i;
562-
563-
while i < total {
564-
let i_ = i;
565-
let byte = unsafe_get(v, i);
566-
i += 1;
567-
568-
macro_rules! error { () => ({
569-
unsafe {
570-
if subseqidx != i_ {
571-
res.as_mut_vec().extend_from_slice(&v[subseqidx..i_]);
572-
}
573-
subseqidx = i;
574-
res.as_mut_vec().extend_from_slice(REPLACEMENT);
575-
}
576-
})}
577-
578-
if byte < 128 {
579-
// subseqidx handles this
580-
} else {
581-
let w = core_str::utf8_char_width(byte);
582-
583-
match w {
584-
2 => {
585-
if safe_get(v, i, total) & 192 != TAG_CONT_U8 {
586-
error!();
587-
continue;
588-
}
589-
i += 1;
590-
}
591-
3 => {
592-
match (byte, safe_get(v, i, total)) {
593-
(0xE0, 0xA0...0xBF) => (),
594-
(0xE1...0xEC, 0x80...0xBF) => (),
595-
(0xED, 0x80...0x9F) => (),
596-
(0xEE...0xEF, 0x80...0xBF) => (),
597-
_ => {
598-
error!();
599-
continue;
600-
}
601-
}
602-
i += 1;
603-
if safe_get(v, i, total) & 192 != TAG_CONT_U8 {
604-
error!();
605-
continue;
606-
}
607-
i += 1;
608-
}
609-
4 => {
610-
match (byte, safe_get(v, i, total)) {
611-
(0xF0, 0x90...0xBF) => (),
612-
(0xF1...0xF3, 0x80...0xBF) => (),
613-
(0xF4, 0x80...0x8F) => (),
614-
_ => {
615-
error!();
616-
continue;
617-
}
618-
}
619-
i += 1;
620-
if safe_get(v, i, total) & 192 != TAG_CONT_U8 {
621-
error!();
622-
continue;
623-
}
624-
i += 1;
625-
if safe_get(v, i, total) & 192 != TAG_CONT_U8 {
626-
error!();
627-
continue;
628-
}
629-
i += 1;
630-
}
631-
_ => {
632-
error!();
633-
continue;
634-
}
635-
}
557+
for lossy::Utf8LossyChunk { valid, broken } in iter {
558+
res.push_str(valid);
559+
if !broken.is_empty() {
560+
res.push_str(REPLACEMENT);
636561
}
637562
}
638-
if subseqidx < total {
639-
unsafe { res.as_mut_vec().extend_from_slice(&v[subseqidx..total]) };
640-
}
563+
641564
Cow::Owned(res)
642565
}
643566

src/libstd_unicode/Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,9 @@ path = "lib.rs"
99
test = false
1010
bench = false
1111

12+
[[test]]
13+
name = "std_unicode_tests"
14+
path = "tests/lib.rs"
15+
1216
[dependencies]
1317
core = { path = "../libcore" }

src/libstd_unicode/lib.rs

+3
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434

3535
#![feature(char_escape_debug)]
3636
#![feature(core_char_ext)]
37+
#![feature(str_internals)]
38+
#![feature(core_intrinsics)]
3739
#![feature(decode_utf8)]
3840
#![feature(fused)]
3941
#![feature(fn_traits)]
@@ -45,6 +47,7 @@
4547
mod tables;
4648
mod u_str;
4749
pub mod char;
50+
pub mod lossy;
4851

4952
#[allow(deprecated)]
5053
pub mod str {

src/libstd_unicode/lossy.rs

+198
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use core::str as core_str;
12+
use core::fmt;
13+
use core::fmt::Write;
14+
use char;
15+
use core::intrinsics;
16+
17+
18+
/// Lossy UTF-8 string.
19+
#[unstable(feature = "str_internals", issue = "0")]
20+
pub struct Utf8Lossy {
21+
bytes: [u8]
22+
}
23+
24+
impl Utf8Lossy {
25+
pub fn from_str(s: &str) -> &Utf8Lossy {
26+
Utf8Lossy::from_bytes(s.as_bytes())
27+
}
28+
29+
pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
30+
unsafe { intrinsics::transmute(bytes) }
31+
}
32+
33+
pub fn chunks(&self) -> Utf8LossyChunksIter {
34+
Utf8LossyChunksIter { source: &self.bytes }
35+
}
36+
}
37+
38+
39+
/// Iterator over the chunks of a lossy UTF-8 string.
40+
#[unstable(feature = "str_internals", issue = "0")]
41+
pub struct Utf8LossyChunksIter<'a> {
42+
source: &'a [u8],
43+
}
44+
45+
#[unstable(feature = "str_internals", issue = "0")]
46+
#[derive(PartialEq, Eq, Debug)]
47+
pub struct Utf8LossyChunk<'a> {
48+
/// Sequence of valid chars.
49+
/// Can be empty between broken UTF-8 chars.
50+
pub valid: &'a str,
51+
/// Bytes of a single broken sequence, empty if none.
52+
/// If empty, this item is the last one (the last item may still be non-empty).
53+
pub broken: &'a [u8],
54+
}
55+
56+
impl<'a> Iterator for Utf8LossyChunksIter<'a> {
57+
type Item = Utf8LossyChunk<'a>;
58+
59+
fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
60+
if self.source.len() == 0 {
61+
return None;
62+
}
63+
64+
const TAG_CONT_U8: u8 = 128;
65+
fn unsafe_get(xs: &[u8], i: usize) -> u8 {
66+
unsafe { *xs.get_unchecked(i) }
67+
}
68+
fn safe_get(xs: &[u8], i: usize) -> u8 {
69+
if i >= xs.len() { 0 } else { unsafe_get(xs, i) }
70+
}
71+
72+
let mut i = 0;
73+
while i < self.source.len() {
74+
let i_ = i;
75+
76+
let byte = unsafe_get(self.source, i);
77+
i += 1;
78+
79+
if byte < 128 {
80+
81+
} else {
82+
let w = core_str::utf8_char_width(byte);
83+
84+
macro_rules! error { () => ({
85+
unsafe {
86+
let r = Utf8LossyChunk {
87+
valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
88+
broken: &self.source[i_..i],
89+
};
90+
self.source = &self.source[i..];
91+
return Some(r);
92+
}
93+
})}
94+
95+
match w {
96+
2 => {
97+
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
98+
error!();
99+
}
100+
i += 1;
101+
}
102+
3 => {
103+
match (byte, safe_get(self.source, i)) {
104+
(0xE0, 0xA0 ... 0xBF) => (),
105+
(0xE1 ... 0xEC, 0x80 ... 0xBF) => (),
106+
(0xED, 0x80 ... 0x9F) => (),
107+
(0xEE ... 0xEF, 0x80 ... 0xBF) => (),
108+
_ => {
109+
error!();
110+
}
111+
}
112+
i += 1;
113+
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
114+
error!();
115+
}
116+
i += 1;
117+
}
118+
4 => {
119+
match (byte, safe_get(self.source, i)) {
120+
(0xF0, 0x90 ... 0xBF) => (),
121+
(0xF1 ... 0xF3, 0x80 ... 0xBF) => (),
122+
(0xF4, 0x80 ... 0x8F) => (),
123+
_ => {
124+
error!();
125+
}
126+
}
127+
i += 1;
128+
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
129+
error!();
130+
}
131+
i += 1;
132+
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
133+
error!();
134+
}
135+
i += 1;
136+
}
137+
_ => {
138+
error!();
139+
}
140+
}
141+
}
142+
}
143+
144+
let r = Utf8LossyChunk {
145+
valid: unsafe { core_str::from_utf8_unchecked(self.source) },
146+
broken: &[],
147+
};
148+
self.source = &[];
149+
return Some(r);
150+
}
151+
}
152+
153+
154+
impl fmt::Display for Utf8Lossy {
155+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
156+
for Utf8LossyChunk { valid, broken } in self.chunks() {
157+
f.write_str(valid)?;
158+
if !broken.is_empty() {
159+
f.write_char(char::REPLACEMENT_CHARACTER)?;
160+
}
161+
}
162+
Ok(())
163+
}
164+
}
165+
166+
impl fmt::Debug for Utf8Lossy {
167+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
168+
f.write_char('"')?;
169+
170+
for Utf8LossyChunk { valid, broken } in self.chunks() {
171+
172+
// Valid part.
173+
// Here we partially parse UTF-8 again which is suboptimal.
174+
{
175+
let mut from = 0;
176+
for (i, c) in valid.char_indices() {
177+
let esc = c.escape_debug();
178+
// If the char needs escaping, flush the backlog so far and write the escape; otherwise skip.
179+
if esc.len() != 1 {
180+
f.write_str(&valid[from..i])?;
181+
for c in esc {
182+
f.write_char(c)?;
183+
}
184+
from = i + c.len_utf8();
185+
}
186+
}
187+
f.write_str(&valid[from..])?;
188+
}
189+
190+
// Broken parts of string as hex escape.
191+
for &b in broken {
192+
write!(f, "\\x{:02x}", b)?;
193+
}
194+
}
195+
196+
f.write_char('"')
197+
}
198+
}

src/libstd_unicode/tests/lib.rs

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
#![feature(str_internals, unicode)]
12+
13+
extern crate std_unicode;
14+
15+
mod lossy;

0 commit comments

Comments
 (0)