Skip to content

Commit cea4803

Browse files
committed
auto merge of #14135 : gereeter/rust/two-way-search, r=brson
This replaces the previous naive string-searching algorithm with a two-way search like the one glibc uses, which should be faster on average while still guaranteeing worst-case linear time complexity. This fixes #14107. Note that I don't think this should be merged yet, as this is the only approach to speeding up search that I have tried; it is worth considering options such as Boyer-Moore, or adding a bad-character shift table to this implementation. However, the benchmarks look quite good so far (new timing first, previous timing after "from"):
test str::bench::bench_contains_bad_naive ... bench: 290 ns/iter (+/- 12) from 1309 ns/iter (+/- 36)
test str::bench::bench_contains_equal ... bench: 479 ns/iter (+/- 10) from 137 ns/iter (+/- 2)
test str::bench::bench_contains_short_long ... bench: 2844 ns/iter (+/- 105) from 5473 ns/iter (+/- 14)
test str::bench::bench_contains_short_short ... bench: 55 ns/iter (+/- 4) from 57 ns/iter (+/- 6)
Except for the case specifically designed to be optimal for the naive search (`bench_contains_equal`), the new code performs as well as or better than the previous code.
2 parents 5e10686 + 39cb5b1 commit cea4803

File tree

2 files changed

+280
-26
lines changed

2 files changed

+280
-26
lines changed

src/libcore/str.rs

+206-26
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,19 @@
1515
use mem;
1616
use char;
1717
use clone::Clone;
18+
use cmp;
1819
use cmp::{Eq, TotalEq};
1920
use container::Container;
2021
use default::Default;
2122
use iter::{Filter, Map, Iterator};
2223
use iter::{Rev, DoubleEndedIterator, ExactSize};
24+
use iter::range;
2325
use num::Saturating;
2426
use option::{None, Option, Some};
2527
use raw::Repr;
2628
use slice::{ImmutableVector, Vector};
2729
use slice;
30+
use uint;
2831

2932
/*
3033
Section: Creating a string
@@ -316,13 +319,207 @@ impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
316319
}
317320
}
318321

322+
/// The internal state of an iterator that searches for matches of a substring
323+
/// within a larger string using naive search
324+
#[deriving(Clone)]
325+
struct NaiveSearcher {
326+
position: uint
327+
}
328+
329+
impl NaiveSearcher {
330+
fn new() -> NaiveSearcher {
331+
NaiveSearcher { position: 0 }
332+
}
333+
334+
fn next(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(uint, uint)> {
335+
while self.position + needle.len() <= haystack.len() {
336+
if haystack.slice(self.position, self.position + needle.len()) == needle {
337+
let matchPos = self.position;
338+
self.position += needle.len(); // add 1 for all matches
339+
return Some((matchPos, matchPos + needle.len()));
340+
} else {
341+
self.position += 1;
342+
}
343+
}
344+
None
345+
}
346+
}
347+
348+
/// The internal state of an iterator that searches for matches of a substring
349+
/// within a larger string using two-way search
350+
#[deriving(Clone)]
351+
struct TwoWaySearcher {
352+
// constants
353+
critPos: uint,
354+
period: uint,
355+
byteset: u64,
356+
357+
// variables
358+
position: uint,
359+
memory: uint
360+
}
361+
362+
impl TwoWaySearcher {
363+
fn new(needle: &[u8]) -> TwoWaySearcher {
364+
let (critPos1, period1) = TwoWaySearcher::maximal_suffix(needle, false);
365+
let (critPos2, period2) = TwoWaySearcher::maximal_suffix(needle, true);
366+
367+
let critPos;
368+
let period;
369+
if critPos1 > critPos2 {
370+
critPos = critPos1;
371+
period = period1;
372+
} else {
373+
critPos = critPos2;
374+
period = period2;
375+
}
376+
377+
let byteset = needle.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a);
378+
379+
if needle.slice_to(critPos) == needle.slice_from(needle.len() - critPos) {
380+
TwoWaySearcher {
381+
critPos: critPos,
382+
period: period,
383+
byteset: byteset,
384+
385+
position: 0,
386+
memory: 0
387+
}
388+
} else {
389+
TwoWaySearcher {
390+
critPos: critPos,
391+
period: cmp::max(critPos, needle.len() - critPos) + 1,
392+
byteset: byteset,
393+
394+
position: 0,
395+
memory: uint::MAX // Dummy value to signify that the period is long
396+
}
397+
}
398+
}
399+
400+
#[inline]
401+
fn next(&mut self, haystack: &[u8], needle: &[u8], longPeriod: bool) -> Option<(uint, uint)> {
402+
'search: loop {
403+
// Check that we have room to search in
404+
if self.position + needle.len() > haystack.len() {
405+
return None;
406+
}
407+
408+
// Quickly skip by large portions unrelated to our substring
409+
if (self.byteset >> (haystack[self.position + needle.len() - 1] & 0x3f)) & 1 == 0 {
410+
self.position += needle.len();
411+
continue 'search;
412+
}
413+
414+
// See if the right part of the needle matches
415+
let start = if longPeriod { self.critPos } else { cmp::max(self.critPos, self.memory) };
416+
for i in range(start, needle.len()) {
417+
if needle[i] != haystack[self.position + i] {
418+
self.position += i - self.critPos + 1;
419+
if !longPeriod {
420+
self.memory = 0;
421+
}
422+
continue 'search;
423+
}
424+
}
425+
426+
// See if the left part of the needle matches
427+
let start = if longPeriod { 0 } else { self.memory };
428+
for i in range(start, self.critPos).rev() {
429+
if needle[i] != haystack[self.position + i] {
430+
self.position += self.period;
431+
if !longPeriod {
432+
self.memory = needle.len() - self.period;
433+
}
434+
continue 'search;
435+
}
436+
}
437+
438+
// We have found a match!
439+
let matchPos = self.position;
440+
self.position += needle.len(); // add self.period for all matches
441+
if !longPeriod {
442+
self.memory = 0; // set to needle.len() - self.period for all matches
443+
}
444+
return Some((matchPos, matchPos + needle.len()));
445+
}
446+
}
447+
448+
#[inline]
449+
fn maximal_suffix(arr: &[u8], reversed: bool) -> (uint, uint) {
450+
let mut left = -1; // Corresponds to i in the paper
451+
let mut right = 0; // Corresponds to j in the paper
452+
let mut offset = 1; // Corresponds to k in the paper
453+
let mut period = 1; // Corresponds to p in the paper
454+
455+
while right + offset < arr.len() {
456+
let a;
457+
let b;
458+
if reversed {
459+
a = arr[left + offset];
460+
b = arr[right + offset];
461+
} else {
462+
a = arr[right + offset];
463+
b = arr[left + offset];
464+
}
465+
if a < b {
466+
// Suffix is smaller, period is entire prefix so far.
467+
right += offset;
468+
offset = 1;
469+
period = right - left;
470+
} else if a == b {
471+
// Advance through repetition of the current period.
472+
if offset == period {
473+
right += offset;
474+
offset = 1;
475+
} else {
476+
offset += 1;
477+
}
478+
} else {
479+
// Suffix is larger, start over from current location.
480+
left = right;
481+
right += 1;
482+
offset = 1;
483+
period = 1;
484+
}
485+
}
486+
(left + 1, period)
487+
}
488+
}
489+
490+
/// The internal state of an iterator that searches for matches of a substring
491+
/// within a larger string using a dynamically chosed search algorithm
492+
#[deriving(Clone)]
493+
enum Searcher {
494+
Naive(NaiveSearcher),
495+
TwoWay(TwoWaySearcher),
496+
TwoWayLong(TwoWaySearcher)
497+
}
498+
499+
impl Searcher {
500+
fn new(haystack: &[u8], needle: &[u8]) -> Searcher {
501+
// FIXME: Tune this.
502+
if needle.len() > haystack.len() - 20 {
503+
Naive(NaiveSearcher::new())
504+
} else {
505+
let searcher = TwoWaySearcher::new(needle);
506+
if searcher.memory == uint::MAX { // If the period is long
507+
TwoWayLong(searcher)
508+
} else {
509+
TwoWay(searcher)
510+
}
511+
}
512+
}
513+
}
514+
319515
/// An iterator over the start and end indices of the matches of a
320516
/// substring within a larger string
321517
#[deriving(Clone)]
322518
pub struct MatchIndices<'a> {
519+
// constants
323520
haystack: &'a str,
324521
needle: &'a str,
325-
position: uint,
522+
searcher: Searcher
326523
}
327524

328525
/// An iterator over the substrings of a string separated by a given
@@ -337,31 +534,14 @@ pub struct StrSplits<'a> {
337534
impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
338535
#[inline]
339536
fn next(&mut self) -> Option<(uint, uint)> {
340-
// See Issue #1932 for why this is a naive search
341-
let (h_len, n_len) = (self.haystack.len(), self.needle.len());
342-
let mut match_start = 0;
343-
let mut match_i = 0;
344-
345-
while self.position < h_len {
346-
if self.haystack[self.position] == self.needle[match_i] {
347-
if match_i == 0 { match_start = self.position; }
348-
match_i += 1;
349-
self.position += 1;
350-
351-
if match_i == n_len {
352-
// found a match!
353-
return Some((match_start, self.position));
354-
}
355-
} else {
356-
// failed match, backtrack
357-
if match_i > 0 {
358-
match_i = 0;
359-
self.position = match_start;
360-
}
361-
self.position += 1;
362-
}
537+
match self.searcher {
538+
Naive(ref mut searcher)
539+
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes()),
540+
TwoWay(ref mut searcher)
541+
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false),
542+
TwoWayLong(ref mut searcher)
543+
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true)
363544
}
364-
None
365545
}
366546
}
367547

@@ -1581,7 +1761,7 @@ impl<'a> StrSlice<'a> for &'a str {
15811761
MatchIndices {
15821762
haystack: *self,
15831763
needle: sep,
1584-
position: 0
1764+
searcher: Searcher::new(self.as_bytes(), sep.as_bytes())
15851765
}
15861766
}
15871767

src/libstd/str.rs

+74
Original file line numberDiff line numberDiff line change
@@ -2421,4 +2421,78 @@ mod bench {
24212421
assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);
24222422
})
24232423
}
2424+
2425+
#[bench]
2426+
fn bench_contains_short_short(b: &mut Bencher) {
2427+
let haystack = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
2428+
let needle = "sit";
2429+
2430+
b.iter(|| {
2431+
assert!(haystack.contains(needle));
2432+
})
2433+
}
2434+
2435+
#[bench]
2436+
fn bench_contains_short_long(b: &mut Bencher) {
2437+
let haystack = "\
2438+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse quis lorem sit amet dolor \
2439+
ultricies condimentum. Praesent iaculis purus elit, ac malesuada quam malesuada in. Duis sed orci \
2440+
eros. Suspendisse sit amet magna mollis, mollis nunc luctus, imperdiet mi. Integer fringilla non \
2441+
sem ut lacinia. Fusce varius tortor a risus porttitor hendrerit. Morbi mauris dui, ultricies nec \
2442+
tempus vel, gravida nec quam.
2443+
2444+
In est dui, tincidunt sed tempus interdum, adipiscing laoreet ante. Etiam tempor, tellus quis \
2445+
sagittis interdum, nulla purus mattis sem, quis auctor erat odio ac tellus. In nec nunc sit amet \
2446+
diam volutpat molestie at sed ipsum. Vestibulum laoreet consequat vulputate. Integer accumsan \
2447+
lorem ac dignissim placerat. Suspendisse convallis faucibus lorem. Aliquam erat volutpat. In vel \
2448+
eleifend felis. Sed suscipit nulla lorem, sed mollis est sollicitudin et. Nam fermentum egestas \
2449+
interdum. Curabitur ut nisi justo.
2450+
2451+
Sed sollicitudin ipsum tellus, ut condimentum leo eleifend nec. Cras ut velit ante. Phasellus nec \
2452+
mollis odio. Mauris molestie erat in arcu mattis, at aliquet dolor vehicula. Quisque malesuada \
2453+
lectus sit amet nisi pretium, a condimentum ipsum porta. Morbi at dapibus diam. Praesent egestas \
2454+
est sed risus elementum, eu rutrum metus ultrices. Etiam fermentum consectetur magna, id rutrum \
2455+
felis accumsan a. Aliquam ut pellentesque libero. Sed mi nulla, lobortis eu tortor id, suscipit \
2456+
ultricies neque. Morbi iaculis sit amet risus at iaculis. Praesent eget ligula quis turpis \
2457+
feugiat suscipit vel non arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. \
2458+
Aliquam sit amet placerat lorem.
2459+
2460+
Cras a lacus vel ante posuere elementum. Nunc est leo, bibendum ut facilisis vel, bibendum at \
2461+
mauris. Nullam adipiscing diam vel odio ornare, luctus adipiscing mi luctus. Nulla facilisi. \
2462+
Mauris adipiscing bibendum neque, quis adipiscing lectus tempus et. Sed feugiat erat et nisl \
2463+
lobortis pharetra. Donec vitae erat enim. Nullam sit amet felis et quam lacinia tincidunt. Aliquam \
2464+
suscipit dapibus urna. Sed volutpat urna in magna pulvinar volutpat. Phasellus nec tellus ac diam \
2465+
cursus accumsan.
2466+
2467+
Nam lectus enim, dapibus non nisi tempor, consectetur convallis massa. Maecenas eleifend dictum \
2468+
feugiat. Etiam quis mauris vel risus luctus mattis a a nunc. Nullam orci quam, imperdiet id \
2469+
vehicula in, porttitor ut nibh. Duis sagittis adipiscing nisl vitae congue. Donec mollis risus eu \
2470+
leo suscipit, varius porttitor nulla porta. Pellentesque ut sem nec nisi euismod vehicula. Nulla \
2471+
malesuada sollicitudin quam eu fermentum.";
2472+
let needle = "english";
2473+
2474+
b.iter(|| {
2475+
assert!(!haystack.contains(needle));
2476+
})
2477+
}
2478+
2479+
#[bench]
2480+
fn bench_contains_bad_naive(b: &mut Bencher) {
2481+
let haystack = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
2482+
let needle = "aaaaaaaab";
2483+
2484+
b.iter(|| {
2485+
assert!(!haystack.contains(needle));
2486+
})
2487+
}
2488+
2489+
#[bench]
2490+
fn bench_contains_equal(b: &mut Bencher) {
2491+
let haystack = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
2492+
let needle = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
2493+
2494+
b.iter(|| {
2495+
assert!(haystack.contains(needle));
2496+
})
2497+
}
24242498
}

0 commit comments

Comments
 (0)