From acc82dc50ec135496d643d31750629f76f791480 Mon Sep 17 00:00:00 2001 From: Konrad Borowski Date: Sat, 14 Oct 2017 09:47:17 +0200 Subject: [PATCH] Split graphemes and grapheme_indices into two methods s.extended_graphemes() is more readable than s.graphemes(true), as you don't have to think about what `true` means here. These methods are implemented as default methods in order to preserve backward compatibility in case somebody has implemented UnicodeSegmentation for their own types. --- src/lib.rs | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/test.rs | 8 +++++++ 2 files changed, 71 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 6f903c0..bc3204a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -107,6 +107,44 @@ pub trait UnicodeSegmentation { /// ``` fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>; + /// Returns an iterator over the [legacy grapheme clusters][graphemes] of `self`. + /// + /// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries + /// + /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) + /// recommends extended grapheme cluster boundaries for general processing. + #[inline] + fn legacy_graphemes<'a>(&'a self) -> Graphemes<'a> { + self.graphemes(false) + } + + /// Returns an iterator over the [extended grapheme clusters][graphemes] of `self`. + /// + /// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries + /// + /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) + /// recommends extended grapheme cluster boundaries for general processing. 
+ /// + /// # Examples + /// + /// ``` + /// # use self::unicode_segmentation::UnicodeSegmentation; + /// let gr1 = UnicodeSegmentation::extended_graphemes("a\u{310}e\u{301}o\u{308}\u{332}") + /// .collect::<Vec<&str>>(); + /// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]; + /// + /// assert_eq!(&gr1[..], b); + /// + /// let gr2 = UnicodeSegmentation::extended_graphemes("a\r\nb🇷🇺🇸🇹").collect::<Vec<&str>>(); + /// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"]; + /// + /// assert_eq!(&gr2[..], b); + /// ``` + #[inline] + fn extended_graphemes<'a>(&'a self) -> Graphemes<'a> { + self.graphemes(true) + } + /// Returns an iterator over the grapheme clusters of `self` and their /// byte offsets. See `graphemes()` for more information. /// @@ -122,6 +160,31 @@ pub trait UnicodeSegmentation { /// ``` fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>; + /// Returns an iterator over the legacy grapheme clusters of `self` and their + /// byte offsets. See `legacy_graphemes()` for more information. + #[inline] + fn legacy_grapheme_indices<'a>(&'a self) -> GraphemeIndices<'a> { + self.grapheme_indices(false) + } + + /// Returns an iterator over the extended grapheme clusters of `self` and their + /// byte offsets. See `extended_graphemes()` for more information. + /// + /// # Examples + /// + /// ``` + /// # use self::unicode_segmentation::UnicodeSegmentation; + /// let gr_inds = UnicodeSegmentation::extended_grapheme_indices("a̐éö̲\r\n") + /// .collect::<Vec<(usize, &str)>>(); + /// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]; + /// + /// assert_eq!(&gr_inds[..], b); + /// ``` + #[inline] + fn extended_grapheme_indices<'a>(&'a self) -> GraphemeIndices<'a> { + self.grapheme_indices(true) + } + /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). 
/// diff --git a/src/test.rs b/src/test.rs index 54493fe..4deb69f 100644 --- a/src/test.rs +++ b/src/test.rs @@ -44,20 +44,28 @@ fn test_graphemes() { // test forward iterator assert!(UnicodeSegmentation::graphemes(s, true).eq(g.iter().cloned())); assert!(UnicodeSegmentation::graphemes(s, false).eq(g.iter().cloned())); + assert!(UnicodeSegmentation::extended_graphemes(s).eq(g.iter().cloned())); + assert!(UnicodeSegmentation::legacy_graphemes(s).eq(g.iter().cloned())); // test reverse iterator assert!(UnicodeSegmentation::graphemes(s, true).rev().eq(g.iter().rev().cloned())); assert!(UnicodeSegmentation::graphemes(s, false).rev().eq(g.iter().rev().cloned())); + assert!(UnicodeSegmentation::extended_graphemes(s).rev().eq(g.iter().rev().cloned())); + assert!(UnicodeSegmentation::legacy_graphemes(s).rev().eq(g.iter().rev().cloned())); } for &(s, gt, gf) in TEST_DIFF.iter().chain(EXTRA_DIFF) { // test forward iterator assert!(UnicodeSegmentation::graphemes(s, true).eq(gt.iter().cloned())); assert!(UnicodeSegmentation::graphemes(s, false).eq(gf.iter().cloned())); + assert!(UnicodeSegmentation::extended_graphemes(s).eq(gt.iter().cloned())); + assert!(UnicodeSegmentation::legacy_graphemes(s).eq(gf.iter().cloned())); // test reverse iterator assert!(UnicodeSegmentation::graphemes(s, true).rev().eq(gt.iter().rev().cloned())); assert!(UnicodeSegmentation::graphemes(s, false).rev().eq(gf.iter().rev().cloned())); + assert!(UnicodeSegmentation::extended_graphemes(s).rev().eq(gt.iter().rev().cloned())); + assert!(UnicodeSegmentation::legacy_graphemes(s).rev().eq(gf.iter().rev().cloned())); } // test the indices iterators