@@ -596,17 +596,25 @@ pub fn is_utf8(v: &[u8]) -> bool {
596
596
let mut i = 0 u;
597
597
let total = v. len ( ) ;
598
598
while i < total {
599
- let mut chsize = utf8_char_width ( v[ i] ) ;
600
- if chsize == 0 u { return false ; }
601
- if i + chsize > total { return false ; }
602
- i += 1 u;
603
- while chsize > 1 u {
604
- if v[ i] & 192u8 != TAG_CONT_U8 { return false ; }
599
+ if v[ i] < 128u8 {
605
600
i += 1 u;
606
- chsize -= 1 u;
601
+ } else {
602
+ let w = utf8_char_width ( v[ i] ) ;
603
+ if w == 0 u { return false ; }
604
+
605
+ let nexti = i + w;
606
+ if nexti > total { return false ; }
607
+
608
+ if v[ i + 1 ] & 192u8 != TAG_CONT_U8 { return false ; }
609
+ if w > 2 {
610
+ if v[ i + 2 ] & 192u8 != TAG_CONT_U8 { return false ; }
611
+ if w > 3 && ( v[ i + 3 ] & 192u8 != TAG_CONT_U8 ) { return false ; }
612
+ }
613
+
614
+ i = nexti;
607
615
}
608
616
}
609
- return true ;
617
+ true
610
618
}
611
619
612
620
/// Determines if a vector of `u16` contains valid UTF-16
@@ -722,17 +730,29 @@ pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
722
730
end - start
723
731
}
724
732
733
+ // https://tools.ietf.org/html/rfc3629
734
+ static UTF8_CHAR_WIDTH : [ u8 , ..256 ] = [
735
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
736
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x1F
737
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
738
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x3F
739
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
740
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x5F
741
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
742
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x7F
743
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
744
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0x9F
745
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
746
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xBF
747
+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
748
+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // 0xDF
749
+ 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xEF
750
+ 4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xFF
751
+ ] ;
752
+
725
753
/// Given a first byte, determine how many bytes are in this UTF-8 character
726
754
pub fn utf8_char_width ( b : u8 ) -> uint {
727
- let byte: uint = b as uint ;
728
- if byte < 128 u { return 1 u; }
729
- // Not a valid start byte
730
- if byte < 192 u { return 0 u; }
731
- if byte < 224 u { return 2 u; }
732
- if byte < 240 u { return 3 u; }
733
- if byte < 248 u { return 4 u; }
734
- if byte < 252 u { return 5 u; }
735
- return 6 u;
755
+ return UTF8_CHAR_WIDTH [ b] as uint ;
736
756
}
737
757
738
758
#[ allow( missing_doc) ]
@@ -1714,26 +1734,29 @@ impl<'self> StrSlice<'self> for &'self str {
1714
1734
* If `i` is greater than or equal to the length of the string.
1715
1735
* If `i` is not the index of the beginning of a valid UTF-8 character.
1716
1736
*/
1737
+ #[inline]
1717
1738
fn char_range_at(&self, i: uint) -> CharRange {
1718
- let b0 = self[i];
1719
- let w = utf8_char_width(b0);
1720
- assert!((w != 0u));
1721
- if w == 1u { return CharRange {ch: b0 as char, next: i + 1u}; }
1722
- let mut val = 0u;
1723
- let end = i + w;
1724
- let mut i = i + 1u;
1725
- while i < end {
1726
- let byte = self[i];
1727
- assert_eq!(byte & 192u8, TAG_CONT_U8);
1728
- val <<= 6u;
1729
- val += (byte & 63u8) as uint;
1730
- i += 1u;
1739
+ if (self[i] < 128u8) {
1740
+ return CharRange {ch: self[i] as char, next: i + 1 };
1731
1741
}
1732
- // Clunky way to get the right bits from the first byte. Uses two shifts,
1733
- // the first to clip off the marker bits at the left of the byte, and then
1734
- // a second (as uint) to get it to the right position.
1735
- val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u);
1736
- return CharRange {ch: val as char, next: i};
1742
+
1743
+ // Multibyte case is a fn to allow char_range_at to inline cleanly
1744
+ fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
1745
+ let mut val = s[i] as uint;
1746
+ let w = UTF8_CHAR_WIDTH[val] as uint;
1747
+ assert!((w != 0));
1748
+
1749
+ // First byte is special, only want bottom 5 bits for width 2, 4 bits
1750
+ // for width 3, and 3 bits for width 4
1751
+ val &= 0x7Fu >> w;
1752
+ val = (val << 6) | (s[i + 1] & 63u8) as uint;
1753
+ if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1754
+ if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1755
+
1756
+ return CharRange {ch: val as char, next: i + w};
1757
+ }
1758
+
1759
+ return multibyte_char_range_at(*self, i);
1737
1760
}
1738
1761
1739
1762
/// Plucks the character starting at the `i`th byte of a string
@@ -2430,7 +2453,11 @@ mod tests {
2430
2453
fn test_push_char() {
2431
2454
let mut data = ~" ประเทศไทย中";
2432
2455
data.push_char('华');
2433
- assert_eq!(~" ประเทศไทย中华", data);
2456
+ data.push_char('b'); // 1 byte
2457
+ data.push_char('¢'); // 2 byte
2458
+ data.push_char('€'); // 3 byte
2459
+ data.push_char('𤭢'); // 4 byte
2460
+ assert_eq!(~" ประเทศไทย中华b¢€𤭢", data);
2434
2461
}
2435
2462
2436
2463
#[test]
@@ -3240,6 +3267,19 @@ mod tests {
3240
3267
" 22 ".cmp(& &" 1234 ") == Greater;
3241
3268
}
3242
3269
3270
+ #[test]
3271
+ fn test_char_range_at() {
3272
+ let data = ~" b¢€𤭢𤭢€¢b";
3273
+ assert_eq!('b', data.char_range_at(0).ch);
3274
+ assert_eq!('¢', data.char_range_at(1).ch);
3275
+ assert_eq!('€', data.char_range_at(3).ch);
3276
+ assert_eq!('𤭢', data.char_range_at(6).ch);
3277
+ assert_eq!('𤭢', data.char_range_at(10).ch);
3278
+ assert_eq!('€', data.char_range_at(14).ch);
3279
+ assert_eq!('¢', data.char_range_at(17).ch);
3280
+ assert_eq!('b', data.char_range_at(19).ch);
3281
+ }
3282
+
3243
3283
#[test]
3244
3284
fn test_char_range_at_reverse_underflow() {
3245
3285
assert_eq!(" abc".char_range_at_reverse(0).next, 0);
0 commit comments