1
1
-- | Parallel versions of 'filter' and 'simpleFilter'
2
+
2
3
module Text.Fuzzy.Parallel
3
4
( filter ,
4
5
simpleFilter,
5
- Scored (.. ),
6
- -- reexports
7
- Fuzzy ,
6
+ match,
7
+ Scored (.. )
8
8
) where
9
9
10
- import Control.Monad.ST (runST )
11
- import Control.Parallel.Strategies (Eval , Strategy , evalTraversable ,
12
- parTraversable , rseq , using )
13
- import Data.Monoid.Textual (TextualMonoid )
14
- import Data.Vector (Vector , (!) )
15
- import qualified Data.Vector as V
16
- -- need to use a stable sort
17
- import Data.Bifunctor (second )
18
- import Data.Char (toLower )
19
- import Data.Maybe (fromMaybe )
20
- import qualified Data.Monoid.Textual as T
10
+ import Control.Parallel.Strategies (rseq , using , parList , evalList )
11
+ import Data.Bits ((.|.) )
12
+ import Data.Maybe (fromMaybe , mapMaybe )
13
+ import qualified Data.Text as T
14
+ import qualified Data.Text.Internal as T
15
+ import qualified Data.Text.Array as TA
21
16
import Prelude hiding (filter )
22
- import Text.Fuzzy (Fuzzy (.. ))
23
17
24
- data Scored a = Scored { score_ :: ! Int , original :: ! a }
25
- deriving (Functor ,Show )
18
+ data Scored a = Scored { score :: ! Int , original :: ! a }
19
+ deriving (Functor , Show )
26
20
27
21
-- | Returns the rendered output and the
28
22
-- matching score for a pattern and a text.
29
23
-- Two examples are given below:
30
24
--
31
- -- >>> match "fnt" "infinite" "" "" id True
32
- -- Just ("infinite",3)
25
+ -- >>> match "fnt" "infinite"
26
+ -- Just 3
33
27
--
34
- -- >>> match "hsk" ( "Haskell",1995) "<" ">" fst False
35
- -- Just ("<h>a<s><k>ell",5)
28
+ -- >>> match "hsk" "Haskell"
29
+ -- Just 5
36
30
--
37
31
{-# INLINABLE match #-}
38
32
39
- match :: (T. TextualMonoid s )
40
- => s -- ^ Pattern in lowercase except for first character
41
- -> t -- ^ The value containing the text to search in.
42
- -> s -- ^ The text to add before each match.
43
- -> s -- ^ The text to add after each match.
44
- -> (t -> s ) -- ^ The function to extract the text from the container.
45
- -> Maybe (Fuzzy t s ) -- ^ The original value, rendered string and score.
46
- match pattern t pre post extract =
47
- if null pat then Just (Fuzzy t result totalScore) else Nothing
33
+ match :: T. Text -- ^ Pattern in lowercase except for first character
34
+ -> T. Text -- ^ The text to search in.
35
+ -> Maybe Int -- ^ The score
36
+ match (T. Text pArr pOff pLen) (T. Text sArr sOff sLen) = go 0 1 pOff sOff
48
37
where
49
- null :: (T. TextualMonoid s ) => s -> Bool
50
- null = not . T. any (const True )
51
-
52
- s = extract t
53
- (totalScore, _currScore, result, pat, _) =
54
- T. foldl'
55
- undefined
56
- (\ (tot, cur, res, pat, isFirst) c ->
57
- case T. splitCharacterPrefix pat of
58
- Nothing -> (tot, 0 , res <> T. singleton c, pat, isFirst)
59
- Just (x, xs) ->
60
- -- the case of the first character has to match
61
- -- otherwise use lower case since the pattern is assumed lower
62
- let ! c' = if isFirst then c else toLower c in
63
- if x == c' then
64
- let cur' = cur * 2 + 1 in
65
- (tot + cur', cur', res <> pre <> T. singleton c <> post, xs, False )
66
- else (tot, 0 , res <> T. singleton c, pat, isFirst)
67
- ) ( 0
68
- , 1 -- matching at the start gives a bonus (cur = 1)
69
- , mempty , pattern , True ) s
38
+ pTotal = pOff + pLen
39
+ sDelta = sOff + sLen - pTotal
40
+
41
+ go ! totalScore ! currScore ! currPOff ! currSOff
42
+ -- If pattern has been matched in full
43
+ | currPOff >= pTotal
44
+ = Just totalScore
45
+ -- If there is not enough left to match the rest of the pattern, equivalent to
46
+ -- (sOff + sLen - currSOff) < (pOff + pLen - currPOff)
47
+ | currSOff > currPOff + sDelta
48
+ = Nothing
49
+ -- This is slightly broken for non-ASCII:
50
+ -- 1. If code units, consisting a single pattern code point, are found as parts
51
+ -- of different code points, it counts as a match. Unless you use a ton of emojis
52
+ -- as identifiers, such false positives should not be be a big deal,
53
+ -- and anyways HLS does not currently support such use cases, because it uses
54
+ -- code point and UTF-16 code unit positions interchangeably.
55
+ -- 2. Case conversions is not applied to non-ASCII code points, because one has
56
+ -- to call T.toLower (not T.map toLower), reallocating the string in full, which
57
+ -- is too much of performance penalty for fuzzy search. Again, anyway HLS does not
58
+ -- attempt to do justice to Unicode: proper Unicode text matching requires
59
+ -- `unicode-transforms` and friends.
60
+ -- Altogether we sacrifice correctness for the sake of performance, which
61
+ -- is a right trade-off for fuzzy search.
62
+ | pByte <- TA. unsafeIndex pArr currPOff
63
+ , sByte <- TA. unsafeIndex sArr currSOff
64
+ -- First byte (currPOff == pOff) should match exactly, otherwise - up to case.
65
+ , pByte == sByte || (currPOff /= pOff && pByte == toLowerAscii sByte)
66
+ = let curr = currScore * 2 + 1 in
67
+ go (totalScore + curr) curr (currPOff + 1 ) (currSOff + 1 )
68
+ | otherwise
69
+ = go totalScore 0 currPOff (currSOff + 1 )
70
+
71
+ toLowerAscii w = if (w - 65 ) < 26 then w .|. 0x20 else w
70
72
71
73
-- | The function to filter a list of values by fuzzy search on the text extracted from them.
72
- filter :: (TextualMonoid s )
73
- => Int -- ^ Chunk size. 1000 works well.
74
- -> Int -- ^ Max. number of results wanted
75
- -> s -- ^ Pattern.
76
- -> [t ] -- ^ The list of values containing the text to search in.
77
- -> s -- ^ The text to add before each match.
78
- -> s -- ^ The text to add after each match.
79
- -> (t -> s ) -- ^ The function to extract the text from the container.
80
- -> [Scored t ] -- ^ The list of results, sorted, highest score first.
81
- filter chunkSize maxRes pattern ts pre post extract = runST $ do
82
- let v = V. mapMaybe id
83
- (V. map (\ t -> match pattern' t pre post extract) (V. fromList ts)
84
- `using`
85
- parVectorChunk chunkSize (evalTraversable forceScore))
86
- perfectScore = score $ fromMaybe (error $ T. toString undefined pattern ) $
87
- match pattern' pattern' " " " " id
88
- return $ partialSortByAscScore maxRes perfectScore v
74
+ filter :: Int -- ^ Chunk size. 1000 works well.
75
+ -> Int -- ^ Max. number of results wanted
76
+ -> T. Text -- ^ Pattern.
77
+ -> [t ] -- ^ The list of values containing the text to search in.
78
+ -> (t -> T. Text ) -- ^ The function to extract the text from the container.
79
+ -> [Scored t ] -- ^ The list of results, sorted, highest score first.
80
+ filter chunkSize maxRes pattern ts extract = partialSortByAscScore maxRes perfectScore (concat vss)
89
81
where
90
82
-- Preserve case for the first character, make all others lowercase
91
- pattern' = case T. splitCharacterPrefix pattern of
92
- Just (c, rest) -> T. singleton c <> T. map toLower rest
93
- _ -> pattern
83
+ pattern' = case T. uncons pattern of
84
+ Just (c, rest) -> T. cons c (T. toLower rest)
85
+ _ -> pattern
86
+ vss = map (mapMaybe (\ t -> flip Scored t <$> match pattern' (extract t))) (chunkList chunkSize ts)
87
+ `using` parList (evalList rseq)
88
+ perfectScore = fromMaybe (error $ T. unpack pattern ) $ match pattern' pattern'
94
89
95
90
-- | Return all elements of the list that have a fuzzy
96
91
-- match against the pattern. Runs with default settings where
@@ -99,84 +94,44 @@ filter chunkSize maxRes pattern ts pre post extract = runST $ do
99
94
-- >>> simpleFilter "vm" ["vim", "emacs", "virtual machine"]
100
95
-- ["vim","virtual machine"]
101
96
{-# INLINABLE simpleFilter #-}
102
- simpleFilter :: (TextualMonoid s )
103
- => Int -- ^ Chunk size. 1000 works well.
104
- -> Int -- ^ Max. number of results wanted
105
- -> s -- ^ Pattern to look for.
106
- -> [s ] -- ^ List of texts to check.
107
- -> [Scored s ] -- ^ The ones that match.
97
+ simpleFilter :: Int -- ^ Chunk size. 1000 works well.
98
+ -> Int -- ^ Max. number of results wanted
99
+ -> T. Text -- ^ Pattern to look for.
100
+ -> [T. Text ] -- ^ List of texts to check.
101
+ -> [Scored T. Text ] -- ^ The ones that match.
108
102
simpleFilter chunk maxRes pattern xs =
109
- filter chunk maxRes pattern xs mempty mempty id
110
-
111
- --------------------------------------------------------------------------------
112
-
113
- -- | Evaluation that forces the 'score' field
114
- forceScore :: TextualMonoid s => Fuzzy t s -> Eval (Fuzzy t s )
115
- forceScore it@ Fuzzy {score} = do
116
- score' <- rseq score
117
- return it{score = score'}
103
+ filter chunk maxRes pattern xs id
118
104
119
105
--------------------------------------------------------------------------------
120
106
121
- -- | Divides a vector in chunks, applies the strategy in parallel to each chunk.
122
- parVectorChunk :: Int -> Strategy a -> Vector a -> Eval (Vector a )
123
- parVectorChunk chunkSize st v =
124
- V. concat <$> parTraversable (evalTraversable st) (chunkVector chunkSize v)
125
-
126
- -- >>> chunkVector 3 (V.fromList [0..10])
127
- -- >>> chunkVector 3 (V.fromList [0..11])
128
- -- >>> chunkVector 3 (V.fromList [0..12])
129
- -- [[0,1,2],[3,4,5],[6,7,8],[9,10]]
130
- -- [[0,1,2],[3,4,5],[6,7,8],[9,10,11]]
131
- -- [[0,1,2],[3,4,5],[6,7,8],[9,10,11],[12]]
132
- chunkVector :: Int -> Vector a -> [Vector a ]
133
- chunkVector chunkSize v = do
134
- let indices = chunkIndices chunkSize (0 ,V. length v)
135
- [V. slice l (h- l+ 1 ) v | (l,h) <- indices]
136
-
137
- -- >>> chunkIndices 3 (0,9)
138
- -- >>> chunkIndices 3 (0,10)
139
- -- >>> chunkIndices 3 (0,11)
140
- -- [(0,2),(3,5),(6,8)]
141
- -- [(0,2),(3,5),(6,8),(9,9)]
142
- -- [(0,2),(3,5),(6,8),(9,10)]
143
- chunkIndices :: Int -> (Int ,Int ) -> [(Int ,Int )]
144
- chunkIndices chunkSize (from,to) =
145
- map (second pred ) $
146
- pairwise $
147
- [from, from+ chunkSize .. to- 1 ] ++ [to]
148
-
149
- pairwise :: [a ] -> [(a ,a )]
150
- pairwise [] = []
151
- pairwise [_] = []
152
- pairwise (x: y: xs) = (x,y) : pairwise (y: xs)
107
+ chunkList :: Int -> [a ] -> [[a ]]
108
+ chunkList chunkSize = go
109
+ where
110
+ go [] = []
111
+ go xs = ys : go zs
112
+ where
113
+ (ys, zs) = splitAt chunkSize xs
153
114
154
115
-- | A stable partial sort ascending by score. O(N) best case, O(wanted*N) worst case
155
- partialSortByAscScore :: TextualMonoid s
156
- => Int -- ^ Number of items needed
116
+ partialSortByAscScore
117
+ :: Int -- ^ Number of items needed
157
118
-> Int -- ^ Value of a perfect score
158
- -> Vector (Fuzzy t s )
159
119
-> [Scored t ]
160
- partialSortByAscScore wantedCount perfectScore v = loop 0 ( SortState minBound perfectScore 0 ) [] where
161
- l = V. length v
162
- loop index st@ SortState {.. } acc
120
+ -> [ Scored t ]
121
+ partialSortByAscScore wantedCount perfectScore orig = loop orig ( SortState minBound perfectScore 0 ) [] where
122
+ loop [] st@ SortState {.. } acc
163
123
| foundCount == wantedCount = reverse acc
164
- | index == l
165
- -- ProgressCancelledException
166
- = if bestScoreSeen < scoreWanted
167
- then loop 0 st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound } acc
124
+ | otherwise = if bestScoreSeen < scoreWanted
125
+ then loop orig st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound } acc
168
126
else reverse acc
169
- | otherwise =
170
- case v! index of
171
- x | score x == scoreWanted
172
- -> loop (index+ 1 ) st{foundCount = foundCount+ 1 } (toScored x: acc)
173
- | score x < scoreWanted && score x > bestScoreSeen
174
- -> loop (index+ 1 ) st{bestScoreSeen = score x} acc
175
- | otherwise
176
- -> loop (index+ 1 ) st acc
177
-
178
- toScored :: TextualMonoid s => Fuzzy t s -> Scored t
179
- toScored Fuzzy {.. } = Scored score original
127
+ loop (x : xs) st@ SortState {.. } acc
128
+ | foundCount == wantedCount = reverse acc
129
+ | score x == scoreWanted
130
+ = loop xs st{foundCount = foundCount+ 1 } (x: acc)
131
+ | score x < scoreWanted && score x > bestScoreSeen
132
+ = loop xs st{bestScoreSeen = score x} acc
133
+ | otherwise
134
+ = loop xs st acc
180
135
181
136
data SortState a = SortState
182
137
{ bestScoreSeen :: ! Int
0 commit comments