@@ -102,6 +102,7 @@ enum UWordBoundsState {
102102 FormatExtend ( FormatExtendType ) ,
103103 Zwj ,
104104 Emoji ,
105+ WSegSpace ,
105106}
106107
107108// subtypes for FormatExtend state in UWordBoundsState
@@ -156,6 +157,8 @@ impl<'a> Iterator for UWordBounds<'a> {
156157 // Whether or not the previous category was ZWJ
157158 // ZWJs get collapsed, so this handles precedence of WB3c over WB4
158159 let mut prev_zwj;
160+ // Whether any Extend/Format/ZWJ characters were skipped. Handles precedence of WB3d over WB4
161+ let mut skipped_format_extend = false ;
159162 for ( curr, ch) in self . string . char_indices ( ) {
160163 idx = curr;
161164 prev_zwj = cat == wd:: WC_ZWJ ;
@@ -177,6 +180,7 @@ impl<'a> Iterator for UWordBounds<'a> {
177180 if state != Start {
178181 match cat {
179182 wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => {
183+ skipped_format_extend = true ;
180184 continue
181185 }
182186 _ => { }
@@ -219,6 +223,7 @@ impl<'a> Iterator for UWordBounds<'a> {
219223 wd:: WC_Regional_Indicator => Regional ( RegionalState :: Half ) , // rule WB13c
220224 wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
221225 wd:: WC_ZWJ => Zwj , // rule WB3c
226+ wd:: WC_WSegSpace => WSegSpace , // rule WB3d
222227 _ => {
223228 if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
224229 if ncat == wd:: WC_Format || ncat == wd:: WC_Extend || ncat == wd:: WC_ZWJ {
@@ -230,6 +235,13 @@ impl<'a> Iterator for UWordBounds<'a> {
230235 break ; // rule WB999
231236 }
232237 } ,
238+ WSegSpace => match cat {
239+ wd:: WC_WSegSpace if !skipped_format_extend => WSegSpace ,
240+ _ => {
241+ take_curr = false ;
242+ break ;
243+ }
244+ } ,
233245 Zwj => {
234246 // We already handle WB3c above.
235247 take_curr = false ;
@@ -371,6 +383,8 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
371383 let mut savestate = Start ;
372384 let mut cat = wd:: WC_Any ;
373385
386+ let mut skipped_format_extend = false ;
387+
374388 for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
375389 previdx = idx;
376390 idx = curr;
@@ -409,6 +423,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
409423 state = savestate;
410424 previdx = saveidx;
411425 take_cat = false ;
426+ skipped_format_extend = true ;
412427 }
413428
414429 // Don't use `continue` in this match without updating `catb`
@@ -427,6 +442,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
427442 saveidx = idx;
428443 FormatExtend ( AcceptQLetter ) // rule WB7a
429444 } ,
445+ wd:: WC_WSegSpace => WSegSpace ,
430446 wd:: WC_CR | wd:: WC_LF | wd:: WC_Newline => {
431447 if state == Start {
432448 if cat == wd:: WC_LF {
@@ -451,6 +467,15 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
451467 break ;
452468 }
453469 } ,
470+ WSegSpace => match cat { // rule WB3d
471+ wd:: WC_WSegSpace if !skipped_format_extend => {
472+ WSegSpace
473+ }
474+ _ => {
475+ take_curr = false ;
476+ break ;
477+ }
478+ } ,
454479 Letter | HLetter => match cat {
455480 wd:: WC_ALetter => Letter , // rule WB5
456481 wd:: WC_Hebrew_Letter => HLetter , // rule WB5
0 commit comments