@@ -66,7 +66,7 @@ extern crate quickcheck;
66
66
pub use grapheme:: { Graphemes , GraphemeIndices } ;
67
67
pub use grapheme:: { GraphemeCursor , GraphemeIncomplete } ;
68
68
pub use tables:: UNICODE_VERSION ;
69
- pub use word:: { UWordBounds , UWordBoundIndices , UnicodeWords } ;
69
+ pub use word:: { UWordBounds , UWordBoundIndices , UnicodeWords , UnicodeWordIndices } ;
70
70
pub use sentence:: { USentenceBounds , USentenceBoundIndices , UnicodeSentences } ;
71
71
72
72
mod grapheme;
@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
146
146
/// ```
147
147
fn unicode_words < ' a > ( & ' a self ) -> UnicodeWords < ' a > ;
148
148
149
+ /// Returns an iterator over the words of `self`, separated on
150
+ /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), each paired
151
+ /// with its starting offset in `self` (items are `(usize, &str)` pairs: offset, then word).
152
+ ///
153
+ /// Here, "words" are just those substrings which, after splitting on
154
+ /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
155
+ /// substring must contain at least one character with the
156
+ /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
157
+ /// property, or with
158
+ /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
159
+ ///
160
+ /// # Example
161
+ ///
162
+ /// ```
163
+ /// # use self::unicode_segmentation::UnicodeSegmentation;
164
+ /// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
165
+ /// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
166
+ /// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
167
+ /// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
168
+ ///
169
+ /// assert_eq!(&uwi1[..], b);
170
+ /// ```
171
+ fn unicode_word_indices < ' a > ( & ' a self ) -> UnicodeWordIndices < ' a > ;
172
+
149
173
/// Returns an iterator over substrings of `self` separated on
150
174
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
151
175
///
@@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {
249
273
word:: new_unicode_words ( self )
250
274
}
251
275
276
// `unicode_word_indices` for the enclosing `impl UnicodeSegmentation for str`
// (per the hunk header): a thin wrapper that forwards `self` unchanged to
// `word::new_unicode_word_indices` and returns the iterator it constructs.
+ #[ inline]
277
+ fn unicode_word_indices ( & self ) -> UnicodeWordIndices {
278
+ word:: new_unicode_word_indices ( self )
279
+ }
280
+
252
281
#[ inline]
253
282
fn split_word_bounds ( & self ) -> UWordBounds {
254
283
word:: new_word_bounds ( self )
0 commit comments