15
15
use mem;
16
16
use char;
17
17
use clone:: Clone ;
18
+ use cmp;
18
19
use cmp:: { Eq , TotalEq } ;
19
20
use container:: Container ;
20
21
use default:: Default ;
21
22
use iter:: { Filter , Map , Iterator } ;
22
23
use iter:: { Rev , DoubleEndedIterator , ExactSize } ;
24
+ use iter:: range;
23
25
use num:: Saturating ;
24
26
use option:: { None , Option , Some } ;
25
27
use raw:: Repr ;
26
28
use slice:: { ImmutableVector , Vector } ;
27
29
use slice;
30
+ use uint;
28
31
29
32
/*
30
33
Section: Creating a string
@@ -316,13 +319,207 @@ impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
316
319
}
317
320
}
318
321
322
+ /// The internal state of an iterator that searches for matches of a substring
323
+ /// within a larger string using naive search
324
+ #[ deriving( Clone ) ]
325
+ struct NaiveSearcher {
326
+ position : uint
327
+ }
328
+
329
+ impl NaiveSearcher {
330
+ fn new ( ) -> NaiveSearcher {
331
+ NaiveSearcher { position : 0 }
332
+ }
333
+
334
+ fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] ) -> Option < ( uint , uint ) > {
335
+ while self . position + needle. len ( ) <= haystack. len ( ) {
336
+ if haystack. slice ( self . position , self . position + needle. len ( ) ) == needle {
337
+ let matchPos = self . position ;
338
+ self . position += needle. len ( ) ; // add 1 for all matches
339
+ return Some ( ( matchPos, matchPos + needle. len ( ) ) ) ;
340
+ } else {
341
+ self . position += 1 ;
342
+ }
343
+ }
344
+ None
345
+ }
346
+ }
347
+
348
+ /// The internal state of an iterator that searches for matches of a substring
349
+ /// within a larger string using two-way search
350
+ #[ deriving( Clone ) ]
351
+ struct TwoWaySearcher {
352
+ // constants
353
+ critPos : uint ,
354
+ period : uint ,
355
+ byteset : u64 ,
356
+
357
+ // variables
358
+ position : uint ,
359
+ memory : uint
360
+ }
361
+
362
+ impl TwoWaySearcher {
363
+ fn new ( needle : & [ u8 ] ) -> TwoWaySearcher {
364
+ let ( critPos1, period1) = TwoWaySearcher :: maximal_suffix ( needle, false ) ;
365
+ let ( critPos2, period2) = TwoWaySearcher :: maximal_suffix ( needle, true ) ;
366
+
367
+ let critPos;
368
+ let period;
369
+ if critPos1 > critPos2 {
370
+ critPos = critPos1;
371
+ period = period1;
372
+ } else {
373
+ critPos = critPos2;
374
+ period = period2;
375
+ }
376
+
377
+ let byteset = needle. iter ( ) . fold ( 0 , |a, & b| ( 1 << ( b & 0x3f ) ) | a) ;
378
+
379
+ if needle. slice_to ( critPos) == needle. slice_from ( needle. len ( ) - critPos) {
380
+ TwoWaySearcher {
381
+ critPos : critPos,
382
+ period : period,
383
+ byteset : byteset,
384
+
385
+ position : 0 ,
386
+ memory : 0
387
+ }
388
+ } else {
389
+ TwoWaySearcher {
390
+ critPos : critPos,
391
+ period : cmp:: max ( critPos, needle. len ( ) - critPos) + 1 ,
392
+ byteset : byteset,
393
+
394
+ position : 0 ,
395
+ memory : uint:: MAX // Dummy value to signify that the period is long
396
+ }
397
+ }
398
+ }
399
+
400
+ #[ inline]
401
+ fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , longPeriod : bool ) -> Option < ( uint , uint ) > {
402
+ ' search: loop {
403
+ // Check that we have room to search in
404
+ if self . position + needle. len ( ) > haystack. len ( ) {
405
+ return None ;
406
+ }
407
+
408
+ // Quickly skip by large portions unrelated to our substring
409
+ if ( self . byteset >> ( haystack[ self . position + needle. len ( ) - 1 ] & 0x3f ) ) & 1 == 0 {
410
+ self . position += needle. len ( ) ;
411
+ continue ' search;
412
+ }
413
+
414
+ // See if the right part of the needle matches
415
+ let start = if longPeriod { self . critPos } else { cmp:: max ( self . critPos , self . memory ) } ;
416
+ for i in range ( start, needle. len ( ) ) {
417
+ if needle[ i] != haystack[ self . position + i] {
418
+ self . position += i - self . critPos + 1 ;
419
+ if !longPeriod {
420
+ self . memory = 0 ;
421
+ }
422
+ continue ' search;
423
+ }
424
+ }
425
+
426
+ // See if the left part of the needle matches
427
+ let start = if longPeriod { 0 } else { self . memory } ;
428
+ for i in range ( start, self . critPos ) . rev ( ) {
429
+ if needle[ i] != haystack[ self . position + i] {
430
+ self . position += self . period ;
431
+ if !longPeriod {
432
+ self . memory = needle. len ( ) - self . period ;
433
+ }
434
+ continue ' search;
435
+ }
436
+ }
437
+
438
+ // We have found a match!
439
+ let matchPos = self . position ;
440
+ self . position += needle. len ( ) ; // add self.period for all matches
441
+ if !longPeriod {
442
+ self . memory = 0 ; // set to needle.len() - self.period for all matches
443
+ }
444
+ return Some ( ( matchPos, matchPos + needle. len ( ) ) ) ;
445
+ }
446
+ }
447
+
448
+ #[ inline]
449
+ fn maximal_suffix ( arr : & [ u8 ] , reversed : bool ) -> ( uint , uint ) {
450
+ let mut left = -1 ; // Corresponds to i in the paper
451
+ let mut right = 0 ; // Corresponds to j in the paper
452
+ let mut offset = 1 ; // Corresponds to k in the paper
453
+ let mut period = 1 ; // Corresponds to p in the paper
454
+
455
+ while right + offset < arr. len ( ) {
456
+ let a;
457
+ let b;
458
+ if reversed {
459
+ a = arr[ left + offset] ;
460
+ b = arr[ right + offset] ;
461
+ } else {
462
+ a = arr[ right + offset] ;
463
+ b = arr[ left + offset] ;
464
+ }
465
+ if a < b {
466
+ // Suffix is smaller, period is entire prefix so far.
467
+ right += offset;
468
+ offset = 1 ;
469
+ period = right - left;
470
+ } else if a == b {
471
+ // Advance through repetition of the current period.
472
+ if offset == period {
473
+ right += offset;
474
+ offset = 1 ;
475
+ } else {
476
+ offset += 1 ;
477
+ }
478
+ } else {
479
+ // Suffix is larger, start over from current location.
480
+ left = right;
481
+ right += 1 ;
482
+ offset = 1 ;
483
+ period = 1 ;
484
+ }
485
+ }
486
+ ( left + 1 , period)
487
+ }
488
+ }
489
+
490
+ /// The internal state of an iterator that searches for matches of a substring
491
+ /// within a larger string using a dynamically chosed search algorithm
492
+ #[ deriving( Clone ) ]
493
+ enum Searcher {
494
+ Naive ( NaiveSearcher ) ,
495
+ TwoWay ( TwoWaySearcher ) ,
496
+ TwoWayLong ( TwoWaySearcher )
497
+ }
498
+
499
+ impl Searcher {
500
+ fn new ( haystack : & [ u8 ] , needle : & [ u8 ] ) -> Searcher {
501
+ // FIXME: Tune this.
502
+ if needle. len ( ) > haystack. len ( ) - 20 {
503
+ Naive ( NaiveSearcher :: new ( ) )
504
+ } else {
505
+ let searcher = TwoWaySearcher :: new ( needle) ;
506
+ if searcher. memory == uint:: MAX { // If the period is long
507
+ TwoWayLong ( searcher)
508
+ } else {
509
+ TwoWay ( searcher)
510
+ }
511
+ }
512
+ }
513
+ }
514
+
319
515
/// An iterator over the start and end indices of the matches of a
320
516
/// substring within a larger string
321
517
#[ deriving( Clone ) ]
322
518
pub struct MatchIndices < ' a > {
519
+ // constants
323
520
haystack : & ' a str ,
324
521
needle : & ' a str ,
325
- position : uint ,
522
+ searcher : Searcher
326
523
}
327
524
328
525
/// An iterator over the substrings of a string separated by a given
@@ -337,31 +534,14 @@ pub struct StrSplits<'a> {
337
534
impl < ' a > Iterator < ( uint , uint ) > for MatchIndices < ' a > {
338
535
#[ inline]
339
536
fn next ( & mut self ) -> Option < ( uint , uint ) > {
340
- // See Issue #1932 for why this is a naive search
341
- let ( h_len, n_len) = ( self . haystack . len ( ) , self . needle . len ( ) ) ;
342
- let mut match_start = 0 ;
343
- let mut match_i = 0 ;
344
-
345
- while self . position < h_len {
346
- if self . haystack [ self . position ] == self . needle [ match_i] {
347
- if match_i == 0 { match_start = self . position ; }
348
- match_i += 1 ;
349
- self . position += 1 ;
350
-
351
- if match_i == n_len {
352
- // found a match!
353
- return Some ( ( match_start, self . position ) ) ;
354
- }
355
- } else {
356
- // failed match, backtrack
357
- if match_i > 0 {
358
- match_i = 0 ;
359
- self . position = match_start;
360
- }
361
- self . position += 1 ;
362
- }
537
+ match self . searcher {
538
+ Naive ( ref mut searcher)
539
+ => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) ) ,
540
+ TwoWay ( ref mut searcher)
541
+ => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) , false ) ,
542
+ TwoWayLong ( ref mut searcher)
543
+ => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) , true )
363
544
}
364
- None
365
545
}
366
546
}
367
547
@@ -1581,7 +1761,7 @@ impl<'a> StrSlice<'a> for &'a str {
1581
1761
MatchIndices {
1582
1762
haystack : * self ,
1583
1763
needle : sep,
1584
- position : 0
1764
+ searcher : Searcher :: new ( self . as_bytes ( ) , sep . as_bytes ( ) )
1585
1765
}
1586
1766
}
1587
1767
0 commit comments