Skip to content

Commit 8bd6e3a

Browse files
committed
Add unicode_word_indices
The iterator UnicodeWordIndices is similar to UnicodeWords but also provides the byte offset of each word.
1 parent 3b75ee1 commit 8bd6e3a

File tree

2 files changed

+72
-5
lines changed

2 files changed

+72
-5
lines changed

src/lib.rs

+30-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ extern crate quickcheck;
6666
pub use grapheme::{Graphemes, GraphemeIndices};
6767
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
6868
pub use tables::UNICODE_VERSION;
69-
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
69+
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
7070
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
7171

7272
mod grapheme;
@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
146146
/// ```
147147
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
148148

149+
/// Returns an iterator over the words of `self`, separated on
150+
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
151+
/// byte offsets.
152+
///
153+
/// Here, "words" are just those substrings which, after splitting on
154+
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
155+
/// substring must contain at least one character with the
156+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
157+
/// property, or with
158+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
159+
///
160+
/// # Example
161+
///
162+
/// ```
163+
/// # use self::unicode_segmentation::UnicodeSegmentation;
164+
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
165+
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
166+
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
167+
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
168+
///
169+
/// assert_eq!(&uwi1[..], b);
170+
/// ```
171+
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
172+
149173
/// Returns an iterator over substrings of `self` separated on
150174
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
151175
///
@@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {
249273
word::new_unicode_words(self)
250274
}
251275

276+
#[inline]
277+
fn unicode_word_indices(&self) -> UnicodeWordIndices {
278+
word::new_unicode_word_indices(self)
279+
}
280+
252281
#[inline]
253282
fn split_word_bounds(&self) -> UWordBounds {
254283
word::new_word_bounds(self)

src/word.rs

+42-4
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
4040
fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
4141
}
4242

43+
/// An iterator over the substrings of a string which, after splitting the string on
44+
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
45+
/// contain any characters with the
46+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
47+
/// property, or with
48+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
49+
/// This iterator also provides the byte offsets for each substring.
50+
///
51+
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
52+
/// its documentation for more.
53+
///
54+
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
55+
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
56+
pub struct UnicodeWordIndices<'a> {
57+
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
58+
}
59+
60+
impl<'a> Iterator for UnicodeWordIndices<'a> {
61+
type Item = (usize, &'a str);
62+
63+
#[inline]
64+
fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() }
65+
}
66+
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
67+
#[inline]
68+
fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() }
69+
}
70+
4371
/// External iterator for a string's
4472
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
4573
///
@@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
671699
}
672700

673701
#[inline]
674-
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
675-
use super::UnicodeSegmentation;
702+
fn has_alphanumeric(s: &&str) -> bool {
676703
use tables::util::is_alphanumeric;
677704

678-
fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
679-
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
705+
s.chars().any(|c| is_alphanumeric(c))
706+
}
707+
708+
#[inline]
709+
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
710+
use super::UnicodeSegmentation;
680711

681712
UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
682713
}
714+
715+
#[inline]
716+
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
717+
use super::UnicodeSegmentation;
718+
719+
UnicodeWordIndices { inner: s.split_word_bound_indices().filter(|(_, c)| has_alphanumeric(c)) }
720+
}

0 commit comments

Comments
 (0)