@@ -39,7 +39,6 @@
 )]
 
 use crate::cmp;
-use crate::cmp::Ordering;
 use crate::fmt;
 use crate::slice::memchr;
 
@@ -947,32 +946,6 @@ impl<'a, 'b> Pattern<'a> for &'b str {
         haystack.as_bytes().starts_with(self.as_bytes())
     }
 
-    /// Checks whether the pattern matches anywhere in the haystack
-    #[inline]
-    fn is_contained_in(self, haystack: &'a str) -> bool {
-        if self.len() == 0 {
-            return true;
-        }
-
-        match self.len().cmp(&haystack.len()) {
-            Ordering::Less => {
-                if self.len() == 1 {
-                    return haystack.as_bytes().contains(&self.as_bytes()[0]);
-                }
-
-                #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-                if self.len() <= 32 {
-                    if let Some(result) = simd_contains(self, haystack) {
-                        return result;
-                    }
-                }
-
-                self.into_searcher(haystack).next_match().is_some()
-            }
-            _ => self == haystack,
-        }
-    }
-
     /// Removes the pattern from the front of haystack, if it matches.
     #[inline]
     fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
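
For context (this illustration is not part of the commit): removing the `is_contained_in` override above changes only how `&str` patterns are searched, not what callers observe; the trait's default, searcher-based implementation returns the same results as the deleted fast paths. A minimal sanity check of that public behavior, written against the stable `str` API only, might look like this:

fn main() {
    let haystack = "a haystack with a needle in it";
    // str::contains forwards to Pattern::is_contained_in for &str patterns.
    assert!(haystack.contains("needle"));
    assert!(!haystack.contains("thimble"));
    // An empty pattern always matches, mirroring the deleted `self.len() == 0` branch.
    assert!(haystack.contains(""));
}
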
@@ -1711,208 +1684,3 @@ impl TwoWayStrategy for RejectAndMatch {
         SearchStep::Match(a, b)
     }
 }
-
-/// SIMD search for short needles based on
-/// Wojciech Muła's "SIMD-friendly algorithms for substring searching"[0]
-///
-/// It skips ahead by the vector width on each iteration (rather than the needle length as two-way
-/// does) by probing the first and last byte of the needle for the whole vector width
-/// and only doing full needle comparisons when the vectorized probe indicated potential matches.
-///
-/// Since the x86_64 baseline only offers SSE2 we only use u8x16 here.
-/// If we ever ship std for x86-64-v3 or adapt this for other platforms then wider vectors
-/// should be evaluated.
-///
-/// For haystacks smaller than vector-size + needle length it falls back to
-/// a naive O(n*m) search so this implementation should not be called on larger needles.
-///
-/// [0]: http://0x80.pl/articles/simd-strfind.html#sse-avx2
-#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-#[inline]
-fn simd_contains(needle: &str, haystack: &str) -> Option<bool> {
-    let needle = needle.as_bytes();
-    let haystack = haystack.as_bytes();
-
-    debug_assert!(needle.len() > 1);
-
-    use crate::ops::BitAnd;
-    use crate::simd::mask8x16 as Mask;
-    use crate::simd::u8x16 as Block;
-    use crate::simd::{SimdPartialEq, ToBitMask};
-
-    let first_probe = needle[0];
-
-    // the offset used for the 2nd vector
-    let second_probe_offset = if needle.len() == 2 {
-        // never bail out on len=2 needles because the probes will fully cover them and have
-        // no degenerate cases.
-        1
-    } else {
-        // try a few bytes in case first and last byte of the needle are the same
-        let Some(second_probe_offset) = (needle.len().saturating_sub(4)..needle.len()).rfind(|&idx| needle[idx] != first_probe) else {
-            // fall back to other search methods if we can't find any different bytes
-            // since we could otherwise hit some degenerate cases
-            return None;
-        };
-        second_probe_offset
-    };
-
-    // do a naive search if the haystack is too small to fit
-    if haystack.len() < Block::LANES + second_probe_offset {
-        return Some(haystack.windows(needle.len()).any(|c| c == needle));
-    }
-
-    let first_probe: Block = Block::splat(first_probe);
-    let second_probe: Block = Block::splat(needle[second_probe_offset]);
-    // the first byte is already checked by the outer loop. To verify a match only the
-    // remainder has to be compared.
-    let trimmed_needle = &needle[1..];
-
-    // this #[cold] is load-bearing, benchmark before removing it...
-    let check_mask = #[cold]
-    |idx, mask: u16, skip: bool| -> bool {
-        if skip {
-            return false;
-        }
-
-        // and so is this. optimizations are weird.
-        let mut mask = mask;
-
-        while mask != 0 {
-            let trailing = mask.trailing_zeros();
-            let offset = idx + trailing as usize + 1;
-            // SAFETY: mask has between 0 and 15 trailing zeroes, we skip one additional byte that was already compared
-            // and then take trimmed_needle.len() bytes. This is within the bounds defined by the outer loop
-            unsafe {
-                let sub = haystack.get_unchecked(offset..).get_unchecked(..trimmed_needle.len());
-                if small_slice_eq(sub, trimmed_needle) {
-                    return true;
-                }
-            }
-            mask &= !(1 << trailing);
-        }
-        return false;
-    };
-
-    let test_chunk = |idx| -> u16 {
-        // SAFETY: this requires at least LANES bytes being readable at idx
-        // that is ensured by the loop ranges (see comments below)
-        let a: Block = unsafe { haystack.as_ptr().add(idx).cast::<Block>().read_unaligned() };
-        // SAFETY: this requires LANES + block_offset bytes being readable at idx
-        let b: Block = unsafe {
-            haystack.as_ptr().add(idx).add(second_probe_offset).cast::<Block>().read_unaligned()
-        };
-        let eq_first: Mask = a.simd_eq(first_probe);
-        let eq_last: Mask = b.simd_eq(second_probe);
-        let both = eq_first.bitand(eq_last);
-        let mask = both.to_bitmask();
-
-        return mask;
-    };
-
-    let mut i = 0;
-    let mut result = false;
-    // The loop condition must ensure that there's enough headroom to read LANE bytes,
-    // and not only at the current index but also at the index shifted by block_offset
-    const UNROLL: usize = 4;
-    while i + second_probe_offset + UNROLL * Block::LANES < haystack.len() && !result {
-        let mut masks = [0u16; UNROLL];
-        for j in 0..UNROLL {
-            masks[j] = test_chunk(i + j * Block::LANES);
-        }
-        for j in 0..UNROLL {
-            let mask = masks[j];
-            if mask != 0 {
-                result |= check_mask(i + j * Block::LANES, mask, result);
-            }
-        }
-        i += UNROLL * Block::LANES;
-    }
-    while i + second_probe_offset + Block::LANES < haystack.len() && !result {
-        let mask = test_chunk(i);
-        if mask != 0 {
-            result |= check_mask(i, mask, result);
-        }
-        i += Block::LANES;
-    }
-
-    // Process the tail that didn't fit into LANES-sized steps.
-    // This simply repeats the same procedure but as right-aligned chunk instead
-    // of a left-aligned one. The last byte must be exactly flush with the string end so
-    // we don't miss a single byte or read out of bounds.
-    let i = haystack.len() - second_probe_offset - Block::LANES;
-    let mask = test_chunk(i);
-    if mask != 0 {
-        result |= check_mask(i, mask, result);
-    }
-
-    Some(result)
-}
-
-/// Compares short slices for equality.
-///
-/// It avoids a call to libc's memcmp, which is faster on long slices
-/// due to SIMD optimizations but incurs function call overhead.
-///
-/// # Safety
-///
-/// Both slices must have the same length.
-#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] // only called on x86
-#[inline]
-unsafe fn small_slice_eq(x: &[u8], y: &[u8]) -> bool {
-    // This function is adapted from
-    // https://github.com/BurntSushi/memchr/blob/8037d11b4357b0f07be2bb66dc2659d9cf28ad32/src/memmem/util.rs#L32
-
-    // If we don't have enough bytes to do 4-byte at a time loads, then
-    // fall back to the naive slow version.
-    //
-    // Potential alternative: We could do a copy_nonoverlapping combined with a mask instead
-    // of a loop. Benchmark it.
-    if x.len() < 4 {
-        for (&b1, &b2) in x.iter().zip(y) {
-            if b1 != b2 {
-                return false;
-            }
-        }
-        return true;
-    }
-    // When we have 4 or more bytes to compare, then proceed in chunks of 4 at
-    // a time using unaligned loads.
-    //
-    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
-    // that this particular version of memcmp is likely to be called with tiny
-    // needles. That means that if we do 8 byte loads, then a higher proportion
-    // of memcmp calls will use the slower variant above. With that said, this
-    // is a hypothesis and is only loosely supported by benchmarks. There's
-    // likely some improvement that could be made here. The main thing here
-    // though is to optimize for latency, not throughput.
-
-    // SAFETY: Via the conditional above, we know that both `px` and `py`
-    // have the same length, so `px < pxend` implies that `py < pyend`.
-    // Thus, dereferencing both `px` and `py` in the loop below is safe.
-    //
-    // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual
-    // end of `px` and `py`. Thus, the final dereference outside of the
-    // loop is guaranteed to be valid. (The final comparison will overlap with
-    // the last comparison done in the loop for lengths that aren't multiples
-    // of four.)
-    //
-    // Finally, we needn't worry about alignment here, since we do unaligned
-    // loads.
-    unsafe {
-        let (mut px, mut py) = (x.as_ptr(), y.as_ptr());
-        let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4));
-        while px < pxend {
-            let vx = (px as *const u32).read_unaligned();
-            let vy = (py as *const u32).read_unaligned();
-            if vx != vy {
-                return false;
-            }
-            px = px.add(4);
-            py = py.add(4);
-        }
-        let vx = (pxend as *const u32).read_unaligned();
-        let vy = (pyend as *const u32).read_unaligned();
-        vx == vy
-    }
-}
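
The doc comment on the removed `simd_contains` describes Wojciech Muła's first/last-byte probing scheme: scan for positions where two probe bytes of the needle match, and run a full comparison only on those candidates. As an illustration only — a portable scalar sketch, not the SSE2 code above, with a hypothetical `probe_contains` helper — the idea looks like this:

// Scalar sketch of the two-probe idea (hypothetical helper, not std code):
// probe the first and last needle byte at each candidate offset and run the
// full comparison only when both probes hit.
fn probe_contains(needle: &[u8], haystack: &[u8]) -> bool {
    if needle.is_empty() {
        return true;
    }
    if needle.len() > haystack.len() {
        return false;
    }
    let first = needle[0];
    let last = needle[needle.len() - 1];
    (0..=haystack.len() - needle.len()).any(|i| {
        haystack[i] == first
            && haystack[i + needle.len() - 1] == last
            && &haystack[i..i + needle.len()] == needle
    })
}

fn main() {
    assert!(probe_contains(b"needle", b"a haystack with a needle in it"));
    assert!(!probe_contains(b"thimble", b"a haystack with a needle in it"));
}

The deleted SIMD version additionally picks the second probe from the last few needle bytes so that it differs from the first byte, which avoids degenerate cases such as needles made of a single repeated byte.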
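
The long comment in the removed `small_slice_eq` hinges on an overlapping final load: the last 4-byte comparison is made flush with the end of the slices, so lengths that are not a multiple of four are still fully covered. A safe scalar sketch of that shape — a hypothetical `chunked_eq` helper using `u32::from_ne_bytes` instead of raw unaligned pointer reads — could be:

use std::convert::TryInto;

// Safe sketch of the overlapping 4-byte comparison (hypothetical helper):
// walk forward in 4-byte steps, then compare one final chunk that ends
// exactly at the slice end and may overlap the previous step.
fn chunked_eq(x: &[u8], y: &[u8]) -> bool {
    assert_eq!(x.len(), y.len());
    if x.len() < 4 {
        // Too short for 4-byte loads; compare directly.
        return x == y;
    }
    let load = |s: &[u8], i: usize| u32::from_ne_bytes(s[i..i + 4].try_into().unwrap());
    let mut i = 0;
    while i < x.len() - 4 {
        if load(x, i) != load(y, i) {
            return false;
        }
        i += 4;
    }
    // Final, possibly overlapping chunk, flush with the end of the slices.
    load(x, x.len() - 4) == load(y, y.len() - 4)
}

fn main() {
    assert!(chunked_eq(b"abcdefg", b"abcdefg"));
    assert!(!chunked_eq(b"abcdefg", b"abcdefZ"));
    assert!(chunked_eq(b"ab", b"ab"));
}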