Skip to content

Commit f679352

Browse files
committed
auto merge of #7696 : glinscott/rust/utf8_perf, r=cmr
Moves multibyte code to its own function to make char_range_at easier to inline, and faster for both single-byte and multibyte chars. Benchmarked reading example.json 100 times: 1.18s before, 1.08s after. Also optimizes str::is_utf8 for the single-byte and multibyte cases. Before: is_utf8_ascii: 272.355162 ms, is_utf8_multibyte: 167.337334 ms. After: is_utf8_ascii: 218.088049 ms, is_utf8_multibyte: 134.836722 ms.
2 parents d56c976 + 8926b31 commit f679352

File tree

3 files changed

+111
-36
lines changed

3 files changed

+111
-36
lines changed

src/libstd/str.rs

+76-36
Original file line numberDiff line numberDiff line change
@@ -596,17 +596,25 @@ pub fn is_utf8(v: &[u8]) -> bool {
596596
let mut i = 0u;
597597
let total = v.len();
598598
while i < total {
599-
let mut chsize = utf8_char_width(v[i]);
600-
if chsize == 0u { return false; }
601-
if i + chsize > total { return false; }
602-
i += 1u;
603-
while chsize > 1u {
604-
if v[i] & 192u8 != TAG_CONT_U8 { return false; }
599+
if v[i] < 128u8 {
605600
i += 1u;
606-
chsize -= 1u;
601+
} else {
602+
let w = utf8_char_width(v[i]);
603+
if w == 0u { return false; }
604+
605+
let nexti = i + w;
606+
if nexti > total { return false; }
607+
608+
if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
609+
if w > 2 {
610+
if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
611+
if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
612+
}
613+
614+
i = nexti;
607615
}
608616
}
609-
return true;
617+
true
610618
}
611619

612620
/// Determines if a vector of `u16` contains valid UTF-16
@@ -722,17 +730,29 @@ pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
722730
end - start
723731
}
724732

733+
// https://tools.ietf.org/html/rfc3629
734+
static UTF8_CHAR_WIDTH: [u8, ..256] = [
735+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
736+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
737+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
738+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
739+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
740+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
741+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
742+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
743+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
744+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
745+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
746+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
747+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
748+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
749+
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
750+
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
751+
];
752+
725753
/// Given a first byte, determine how many bytes are in this UTF-8 character
726754
pub fn utf8_char_width(b: u8) -> uint {
727-
let byte: uint = b as uint;
728-
if byte < 128u { return 1u; }
729-
// Not a valid start byte
730-
if byte < 192u { return 0u; }
731-
if byte < 224u { return 2u; }
732-
if byte < 240u { return 3u; }
733-
if byte < 248u { return 4u; }
734-
if byte < 252u { return 5u; }
735-
return 6u;
755+
return UTF8_CHAR_WIDTH[b] as uint;
736756
}
737757

738758
#[allow(missing_doc)]
@@ -1714,26 +1734,29 @@ impl<'self> StrSlice<'self> for &'self str {
17141734
* If `i` is greater than or equal to the length of the string.
17151735
* If `i` is not the index of the beginning of a valid UTF-8 character.
17161736
*/
1737+
#[inline]
17171738
fn char_range_at(&self, i: uint) -> CharRange {
1718-
let b0 = self[i];
1719-
let w = utf8_char_width(b0);
1720-
assert!((w != 0u));
1721-
if w == 1u { return CharRange {ch: b0 as char, next: i + 1u}; }
1722-
let mut val = 0u;
1723-
let end = i + w;
1724-
let mut i = i + 1u;
1725-
while i < end {
1726-
let byte = self[i];
1727-
assert_eq!(byte & 192u8, TAG_CONT_U8);
1728-
val <<= 6u;
1729-
val += (byte & 63u8) as uint;
1730-
i += 1u;
1739+
if (self[i] < 128u8) {
1740+
return CharRange {ch: self[i] as char, next: i + 1 };
17311741
}
1732-
// Clunky way to get the right bits from the first byte. Uses two shifts,
1733-
// the first to clip off the marker bits at the left of the byte, and then
1734-
// a second (as uint) to get it to the right position.
1735-
val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u);
1736-
return CharRange {ch: val as char, next: i};
1742+
1743+
// Multibyte case is a fn to allow char_range_at to inline cleanly
1744+
fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
1745+
let mut val = s[i] as uint;
1746+
let w = UTF8_CHAR_WIDTH[val] as uint;
1747+
assert!((w != 0));
1748+
1749+
// First byte is special, only want bottom 5 bits for width 2, 4 bits
1750+
// for width 3, and 3 bits for width 4
1751+
val &= 0x7Fu >> w;
1752+
val = (val << 6) | (s[i + 1] & 63u8) as uint;
1753+
if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1754+
if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1755+
1756+
return CharRange {ch: val as char, next: i + w};
1757+
}
1758+
1759+
return multibyte_char_range_at(*self, i);
17371760
}
17381761
17391762
/// Plucks the character starting at the `i`th byte of a string
@@ -2430,7 +2453,11 @@ mod tests {
24302453
fn test_push_char() {
24312454
let mut data = ~"ประเทศไทย中";
24322455
data.push_char('华');
2433-
assert_eq!(~"ประเทศไทย中华", data);
2456+
data.push_char('b'); // 1 byte
2457+
data.push_char('¢'); // 2 byte
2458+
data.push_char('€'); // 3 byte
2459+
data.push_char('𤭢'); // 4 byte
2460+
assert_eq!(~"ประเทศไทย中华b¢€𤭢", data);
24342461
}
24352462
24362463
#[test]
@@ -3240,6 +3267,19 @@ mod tests {
32403267
"22".cmp(& &"1234") == Greater;
32413268
}
32423269
3270+
#[test]
3271+
fn test_char_range_at() {
3272+
let data = ~"b¢€𤭢𤭢€¢b";
3273+
assert_eq!('b', data.char_range_at(0).ch);
3274+
assert_eq!('¢', data.char_range_at(1).ch);
3275+
assert_eq!('€', data.char_range_at(3).ch);
3276+
assert_eq!('𤭢', data.char_range_at(6).ch);
3277+
assert_eq!('𤭢', data.char_range_at(10).ch);
3278+
assert_eq!('€', data.char_range_at(14).ch);
3279+
assert_eq!('¢', data.char_range_at(17).ch);
3280+
assert_eq!('b', data.char_range_at(19).ch);
3281+
}
3282+
32433283
#[test]
32443284
fn test_char_range_at_reverse_underflow() {
32453285
assert_eq!("abc".char_range_at_reverse(0).next, 0);

src/test/bench/core-std.rs

+24
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use std::os;
1919
use std::rand::RngUtil;
2020
use std::rand;
2121
use std::result;
22+
use std::str;
2223
use std::uint;
2324
use std::util;
2425
use std::vec;
@@ -36,6 +37,8 @@ fn main() {
3637
bench!(vec_plus);
3738
bench!(vec_append);
3839
bench!(vec_push_all);
40+
bench!(is_utf8_ascii);
41+
bench!(is_utf8_multibyte);
3942
}
4043

4144
fn maybe_run_test(argv: &[~str], name: ~str, test: &fn()) {
@@ -127,3 +130,24 @@ fn vec_push_all() {
127130
}
128131
}
129132
}
133+
134+
fn is_utf8_ascii() {
135+
let mut v : ~[u8] = ~[];
136+
for uint::range(0, 20000) |_| {
137+
v.push('b' as u8);
138+
if !str::is_utf8(v) {
139+
fail!("is_utf8 failed");
140+
}
141+
}
142+
}
143+
144+
fn is_utf8_multibyte() {
145+
let s = "b¢€𤭢";
146+
let mut v : ~[u8]= ~[];
147+
for uint::range(0, 5000) |_| {
148+
v.push_all(s.as_bytes());
149+
if !str::is_utf8(v) {
150+
fail!("is_utf8 failed");
151+
}
152+
}
153+
}

src/test/run-pass/utf8_chars.rs

+11
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,20 @@ pub fn main() {
2727
assert!(s.char_at(1u) == 'é');
2828

2929
assert!((str::is_utf8(s.as_bytes())));
30+
// invalid prefix
3031
assert!((!str::is_utf8(~[0x80_u8])));
32+
// invalid 2 byte prefix
3133
assert!((!str::is_utf8(~[0xc0_u8])));
3234
assert!((!str::is_utf8(~[0xc0_u8, 0x10_u8])));
35+
// invalid 3 byte prefix
36+
assert!((!str::is_utf8(~[0xe0_u8])));
37+
assert!((!str::is_utf8(~[0xe0_u8, 0x10_u8])));
38+
assert!((!str::is_utf8(~[0xe0_u8, 0xff_u8, 0x10_u8])));
39+
// invalid 4 byte prefix
40+
assert!((!str::is_utf8(~[0xf0_u8])));
41+
assert!((!str::is_utf8(~[0xf0_u8, 0x10_u8])));
42+
assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0x10_u8])));
43+
assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0xff_u8, 0x10_u8])));
3344

3445
let mut stack = ~"a×c€";
3546
assert_eq!(stack.pop_char(), '€');

0 commit comments

Comments
 (0)