Add unicode_word_indices

basile-henry · basile-henry · commit 8bd6e3a2d66e · 2021-03-07T19:18:37.000+01:00
The iterator `UnicodeWordIndices` is similar to `UnicodeWords`, but also provides the byte offset of each word.
diff --git a/src/lib.rs b/src/lib.rs
@@ -66,7 +66,7 @@ extern crate quickcheck;
 pub use grapheme::{Graphemes, GraphemeIndices};
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use tables::UNICODE_VERSION;
-pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
+pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
 pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
 
 mod grapheme;
@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
     /// ```
     fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
 
+    /// Returns an iterator over the words of `self`, separated on
+    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), each paired
+    /// with the byte offset at which it starts in `self`.
+    ///
+    /// Here, "words" are just those substrings which, after splitting on
+    /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
+    /// substring must contain at least one character with the
+    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+    /// property, or with
+    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use self::unicode_segmentation::UnicodeSegmentation;
+    /// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
+    /// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
+    /// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
+    ///                 (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
+    ///
+    /// assert_eq!(&uwi1[..], b);
+    /// ```
+    fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
+
     /// Returns an iterator over substrings of `self` separated on
     /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
     ///
@@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {
         word::new_unicode_words(self)
     }
 
+    #[inline]
+    fn unicode_word_indices(&self) -> UnicodeWordIndices {
+        word::new_unicode_word_indices(self)
+    }
+
     #[inline]
     fn split_word_bounds(&self) -> UWordBounds {
         word::new_word_bounds(self)
diff --git a/src/word.rs b/src/word.rs
@@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
     fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
 }
 
+/// An iterator over the substrings of a string which, after splitting the string on
+/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
+/// contain any characters with the
+/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+/// property, or with
+/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+/// Each item is a `(byte offset, substring)` pair, where the offset is where the substring starts.
+///
+/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
+/// its documentation for more.
+///
+/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
+/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
+pub struct UnicodeWordIndices<'a> {
+    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
+}
+
+impl<'a> Iterator for UnicodeWordIndices<'a> {
+    type Item = (usize, &'a str);
+
+    #[inline]
+    fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() }
+}
+impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() }
+}
+
 /// External iterator for a string's
 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
 ///
@@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
 }
 
 #[inline]
-pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
-    use super::UnicodeSegmentation;
+fn has_alphanumeric(s: &&str) -> bool {
     use tables::util::is_alphanumeric;
 
-    fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
-    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
+    s.chars().any(|c| is_alphanumeric(c))
+}
+
+#[inline]
+pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
+    use super::UnicodeSegmentation;
 
     UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
 }
+
+#[inline]
+pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
+    use super::UnicodeSegmentation;
+
+    UnicodeWordIndices { inner: s.split_word_bound_indices().filter(|(_, c)| has_alphanumeric(c)) }
+}