Skip to content

Commit 847ad94

Browse files
Bodigrimpepeiborra
andauthored
Speed up fuzzy search (#2639)
* Speed up fuzzy search * test suite * Undo accidental file delete Co-authored-by: Pepe Iborra <[email protected]>
1 parent 38acb86 commit 847ad94

File tree

5 files changed

+240
-147
lines changed

5 files changed

+240
-147
lines changed

ghcide/ghcide.cabal

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ library
5050
dlist,
5151
exceptions,
5252
extra >= 1.7.4,
53-
fuzzy,
5453
filepath,
5554
fingertree,
5655
focus,
@@ -205,6 +204,7 @@ library
205204
Development.IDE.Plugin.HLS.GhcIde
206205
Development.IDE.Plugin.Test
207206
Development.IDE.Plugin.TypeLenses
207+
Text.Fuzzy.Parallel
208208

209209
other-modules:
210210
Development.IDE.Core.FileExists
@@ -216,7 +216,6 @@ library
216216
Development.IDE.Plugin.Completions.Logic
217217
Development.IDE.Session.VersionCheck
218218
Development.IDE.Types.Action
219-
Text.Fuzzy.Parallel
220219

221220
ghc-options:
222221
-Wall
@@ -371,6 +370,7 @@ test-suite ghcide-tests
371370
directory,
372371
extra,
373372
filepath,
373+
fuzzy,
374374
--------------------------------------------------------------
375375
-- The MIN_VERSION_ghc macro relies on MIN_VERSION pragmas
376376
-- which require depending on ghc. So the tests need to depend
@@ -385,11 +385,13 @@ test-suite ghcide-tests
385385
lsp,
386386
lsp-types,
387387
hls-plugin-api,
388-
network-uri,
389388
lens,
390389
list-t,
391390
lsp-test ^>= 0.14,
391+
monoid-subclasses,
392+
network-uri,
392393
optparse-applicative,
394+
parallel,
393395
process,
394396
QuickCheck,
395397
quickcheck-instances,
@@ -410,6 +412,7 @@ test-suite ghcide-tests
410412
tasty-rerun,
411413
text,
412414
unordered-containers,
415+
vector,
413416
if (impl(ghc >= 8.6) && impl(ghc < 9.2))
414417
build-depends:
415418
record-dot-preprocessor,
@@ -423,6 +426,7 @@ test-suite ghcide-tests
423426
Development.IDE.Test.Runfiles
424427
Experiments
425428
Experiments.Types
429+
FuzzySearch
426430
Progress
427431
HieDbRetry
428432
default-extensions:

ghcide/src/Development/IDE/Plugin/Completions/Logic.hs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ import Ide.Types (CommandId (..),
6363
import Language.LSP.Types
6464
import Language.LSP.Types.Capabilities
6565
import qualified Language.LSP.VFS as VFS
66-
import Text.Fuzzy.Parallel (Scored (score_),
66+
import Text.Fuzzy.Parallel (Scored (score),
6767
original)
6868

6969
-- Chunk size used for parallelizing fuzzy matching
@@ -590,7 +590,7 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu
590590
$ (if T.null enteredQual then id else mapMaybe (T.stripPrefix enteredQual))
591591
allModNamesAsNS
592592

593-
filtCompls = Fuzzy.filter chunkSize maxC prefixText ctxCompls "" "" (label . snd)
593+
filtCompls = Fuzzy.filter chunkSize maxC prefixText ctxCompls (label . snd)
594594
where
595595

596596
mcc = case maybe_parsed of
@@ -668,7 +668,7 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu
668668
return $
669669
(fmap.fmap) snd $
670670
sortBy (compare `on` lexicographicOrdering) $
671-
mergeListsBy (flip compare `on` score_)
671+
mergeListsBy (flip compare `on` score)
672672
[ (fmap.fmap) (notQual,) filtModNameCompls
673673
, (fmap.fmap) (notQual,) filtKeywordCompls
674674
, (fmap.fmap.fmap) (toggleSnippets caps config) compls
@@ -681,11 +681,11 @@ getCompletions plId ideOpts CC {allModNamesAsNS, anyQualCompls, unqualCompls, qu
681681
-- 3. In-scope completions rank next
682682
-- 4. label alphabetical ordering next
683683
-- 4. detail alphabetical ordering (proxy for module)
684-
lexicographicOrdering Fuzzy.Scored{score_, original} =
684+
lexicographicOrdering Fuzzy.Scored{score, original} =
685685
case original of
686686
(isQual, CompletionItem{_label,_detail}) -> do
687687
let isLocal = maybe False (":" `T.isPrefixOf`) _detail
688-
(Down isQual, Down score_, Down isLocal, _label, _detail)
688+
(Down isQual, Down score, Down isLocal, _label, _detail)
689689

690690

691691

ghcide/src/Text/Fuzzy/Parallel.hs

Lines changed: 94 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1,96 +1,91 @@
11
-- | Parallel versions of 'filter' and 'simpleFilter'
2+
23
module Text.Fuzzy.Parallel
34
( filter,
45
simpleFilter,
5-
Scored(..),
6-
-- reexports
7-
Fuzzy,
6+
match,
7+
Scored(..)
88
) where
99

10-
import Control.Monad.ST (runST)
11-
import Control.Parallel.Strategies (Eval, Strategy, evalTraversable,
12-
parTraversable, rseq, using)
13-
import Data.Monoid.Textual (TextualMonoid)
14-
import Data.Vector (Vector, (!))
15-
import qualified Data.Vector as V
16-
-- need to use a stable sort
17-
import Data.Bifunctor (second)
18-
import Data.Char (toLower)
19-
import Data.Maybe (fromMaybe)
20-
import qualified Data.Monoid.Textual as T
10+
import Control.Parallel.Strategies (rseq, using, parList, evalList)
11+
import Data.Bits ((.|.))
12+
import Data.Maybe (fromMaybe, mapMaybe)
13+
import qualified Data.Text as T
14+
import qualified Data.Text.Internal as T
15+
import qualified Data.Text.Array as TA
2116
import Prelude hiding (filter)
22-
import Text.Fuzzy (Fuzzy (..))
2317

24-
data Scored a = Scored {score_ :: !Int, original:: !a}
25-
deriving (Functor,Show)
18+
data Scored a = Scored {score :: !Int, original:: !a}
19+
deriving (Functor, Show)
2620

2721
-- | Returns the rendered output and the
2822
-- matching score for a pattern and a text.
2923
-- Two examples are given below:
3024
--
31-
-- >>> match "fnt" "infinite" "" "" id True
32-
-- Just ("infinite",3)
25+
-- >>> match "fnt" "infinite"
26+
-- Just 3
3327
--
34-
-- >>> match "hsk" ("Haskell",1995) "<" ">" fst False
35-
-- Just ("<h>a<s><k>ell",5)
28+
-- >>> match "hsk" "Haskell"
29+
-- Just 5
3630
--
3731
{-# INLINABLE match #-}
3832

39-
match :: (T.TextualMonoid s)
40-
=> s -- ^ Pattern in lowercase except for first character
41-
-> t -- ^ The value containing the text to search in.
42-
-> s -- ^ The text to add before each match.
43-
-> s -- ^ The text to add after each match.
44-
-> (t -> s) -- ^ The function to extract the text from the container.
45-
-> Maybe (Fuzzy t s) -- ^ The original value, rendered string and score.
46-
match pattern t pre post extract =
47-
if null pat then Just (Fuzzy t result totalScore) else Nothing
33+
match :: T.Text -- ^ Pattern in lowercase except for first character
34+
-> T.Text -- ^ The text to search in.
35+
-> Maybe Int -- ^ The score
36+
match (T.Text pArr pOff pLen) (T.Text sArr sOff sLen) = go 0 1 pOff sOff
4837
where
49-
null :: (T.TextualMonoid s) => s -> Bool
50-
null = not . T.any (const True)
51-
52-
s = extract t
53-
(totalScore, _currScore, result, pat, _) =
54-
T.foldl'
55-
undefined
56-
(\(tot, cur, res, pat, isFirst) c ->
57-
case T.splitCharacterPrefix pat of
58-
Nothing -> (tot, 0, res <> T.singleton c, pat, isFirst)
59-
Just (x, xs) ->
60-
-- the case of the first character has to match
61-
-- otherwise use lower case since the pattern is assumed lower
62-
let !c' = if isFirst then c else toLower c in
63-
if x == c' then
64-
let cur' = cur * 2 + 1 in
65-
(tot + cur', cur', res <> pre <> T.singleton c <> post, xs, False)
66-
else (tot, 0, res <> T.singleton c, pat, isFirst)
67-
) ( 0
68-
, 1 -- matching at the start gives a bonus (cur = 1)
69-
, mempty, pattern, True) s
38+
pTotal = pOff + pLen
39+
sDelta = sOff + sLen - pTotal
40+
41+
go !totalScore !currScore !currPOff !currSOff
42+
-- If pattern has been matched in full
43+
| currPOff >= pTotal
44+
= Just totalScore
45+
-- If there is not enough left to match the rest of the pattern, equivalent to
46+
-- (sOff + sLen - currSOff) < (pOff + pLen - currPOff)
47+
| currSOff > currPOff + sDelta
48+
= Nothing
49+
-- This is slightly broken for non-ASCII:
50+
-- 1. If code units, consisting a single pattern code point, are found as parts
51+
-- of different code points, it counts as a match. Unless you use a ton of emojis
52+
-- as identifiers, such false positives should not be be a big deal,
53+
-- and anyways HLS does not currently support such use cases, because it uses
54+
-- code point and UTF-16 code unit positions interchangeably.
55+
-- 2. Case conversions is not applied to non-ASCII code points, because one has
56+
-- to call T.toLower (not T.map toLower), reallocating the string in full, which
57+
-- is too much of performance penalty for fuzzy search. Again, anyway HLS does not
58+
-- attempt to do justice to Unicode: proper Unicode text matching requires
59+
-- `unicode-transforms` and friends.
60+
-- Altogether we sacrifice correctness for the sake of performance, which
61+
-- is a right trade-off for fuzzy search.
62+
| pByte <- TA.unsafeIndex pArr currPOff
63+
, sByte <- TA.unsafeIndex sArr currSOff
64+
-- First byte (currPOff == pOff) should match exactly, otherwise - up to case.
65+
, pByte == sByte || (currPOff /= pOff && pByte == toLowerAscii sByte)
66+
= let curr = currScore * 2 + 1 in
67+
go (totalScore + curr) curr (currPOff + 1) (currSOff + 1)
68+
| otherwise
69+
= go totalScore 0 currPOff (currSOff + 1)
70+
71+
toLowerAscii w = if (w - 65) < 26 then w .|. 0x20 else w
7072

7173
-- | The function to filter a list of values by fuzzy search on the text extracted from them.
72-
filter :: (TextualMonoid s)
73-
=> Int -- ^ Chunk size. 1000 works well.
74-
-> Int -- ^ Max. number of results wanted
75-
-> s -- ^ Pattern.
76-
-> [t] -- ^ The list of values containing the text to search in.
77-
-> s -- ^ The text to add before each match.
78-
-> s -- ^ The text to add after each match.
79-
-> (t -> s) -- ^ The function to extract the text from the container.
80-
-> [Scored t] -- ^ The list of results, sorted, highest score first.
81-
filter chunkSize maxRes pattern ts pre post extract = runST $ do
82-
let v = V.mapMaybe id
83-
(V.map (\t -> match pattern' t pre post extract) (V.fromList ts)
84-
`using`
85-
parVectorChunk chunkSize (evalTraversable forceScore))
86-
perfectScore = score $ fromMaybe (error $ T.toString undefined pattern) $
87-
match pattern' pattern' "" "" id
88-
return $ partialSortByAscScore maxRes perfectScore v
74+
filter :: Int -- ^ Chunk size. 1000 works well.
75+
-> Int -- ^ Max. number of results wanted
76+
-> T.Text -- ^ Pattern.
77+
-> [t] -- ^ The list of values containing the text to search in.
78+
-> (t -> T.Text) -- ^ The function to extract the text from the container.
79+
-> [Scored t] -- ^ The list of results, sorted, highest score first.
80+
filter chunkSize maxRes pattern ts extract = partialSortByAscScore maxRes perfectScore (concat vss)
8981
where
9082
-- Preserve case for the first character, make all others lowercase
91-
pattern' = case T.splitCharacterPrefix pattern of
92-
Just (c, rest) -> T.singleton c <> T.map toLower rest
93-
_ -> pattern
83+
pattern' = case T.uncons pattern of
84+
Just (c, rest) -> T.cons c (T.toLower rest)
85+
_ -> pattern
86+
vss = map (mapMaybe (\t -> flip Scored t <$> match pattern' (extract t))) (chunkList chunkSize ts)
87+
`using` parList (evalList rseq)
88+
perfectScore = fromMaybe (error $ T.unpack pattern) $ match pattern' pattern'
9489

9590
-- | Return all elements of the list that have a fuzzy
9691
-- match against the pattern. Runs with default settings where
@@ -99,84 +94,44 @@ filter chunkSize maxRes pattern ts pre post extract = runST $ do
9994
-- >>> simpleFilter "vm" ["vim", "emacs", "virtual machine"]
10095
-- ["vim","virtual machine"]
10196
{-# INLINABLE simpleFilter #-}
102-
simpleFilter :: (TextualMonoid s)
103-
=> Int -- ^ Chunk size. 1000 works well.
104-
-> Int -- ^ Max. number of results wanted
105-
-> s -- ^ Pattern to look for.
106-
-> [s] -- ^ List of texts to check.
107-
-> [Scored s] -- ^ The ones that match.
97+
simpleFilter :: Int -- ^ Chunk size. 1000 works well.
98+
-> Int -- ^ Max. number of results wanted
99+
-> T.Text -- ^ Pattern to look for.
100+
-> [T.Text] -- ^ List of texts to check.
101+
-> [Scored T.Text] -- ^ The ones that match.
108102
simpleFilter chunk maxRes pattern xs =
109-
filter chunk maxRes pattern xs mempty mempty id
110-
111-
--------------------------------------------------------------------------------
112-
113-
-- | Evaluation that forces the 'score' field
114-
forceScore :: TextualMonoid s => Fuzzy t s -> Eval(Fuzzy t s)
115-
forceScore it@Fuzzy{score} = do
116-
score' <- rseq score
117-
return it{score = score'}
103+
filter chunk maxRes pattern xs id
118104

119105
--------------------------------------------------------------------------------
120106

121-
-- | Divides a vector in chunks, applies the strategy in parallel to each chunk.
122-
parVectorChunk :: Int -> Strategy a -> Vector a -> Eval (Vector a)
123-
parVectorChunk chunkSize st v =
124-
V.concat <$> parTraversable (evalTraversable st) (chunkVector chunkSize v)
125-
126-
-- >>> chunkVector 3 (V.fromList [0..10])
127-
-- >>> chunkVector 3 (V.fromList [0..11])
128-
-- >>> chunkVector 3 (V.fromList [0..12])
129-
-- [[0,1,2],[3,4,5],[6,7,8],[9,10]]
130-
-- [[0,1,2],[3,4,5],[6,7,8],[9,10,11]]
131-
-- [[0,1,2],[3,4,5],[6,7,8],[9,10,11],[12]]
132-
chunkVector :: Int -> Vector a -> [Vector a]
133-
chunkVector chunkSize v = do
134-
let indices = chunkIndices chunkSize (0,V.length v)
135-
[V.slice l (h-l+1) v | (l,h) <- indices]
136-
137-
-- >>> chunkIndices 3 (0,9)
138-
-- >>> chunkIndices 3 (0,10)
139-
-- >>> chunkIndices 3 (0,11)
140-
-- [(0,2),(3,5),(6,8)]
141-
-- [(0,2),(3,5),(6,8),(9,9)]
142-
-- [(0,2),(3,5),(6,8),(9,10)]
143-
chunkIndices :: Int -> (Int,Int) -> [(Int,Int)]
144-
chunkIndices chunkSize (from,to) =
145-
map (second pred) $
146-
pairwise $
147-
[from, from+chunkSize .. to-1] ++ [to]
148-
149-
pairwise :: [a] -> [(a,a)]
150-
pairwise [] = []
151-
pairwise [_] = []
152-
pairwise (x:y:xs) = (x,y) : pairwise (y:xs)
107+
chunkList :: Int -> [a] -> [[a]]
108+
chunkList chunkSize = go
109+
where
110+
go [] = []
111+
go xs = ys : go zs
112+
where
113+
(ys, zs) = splitAt chunkSize xs
153114

154115
-- | A stable partial sort ascending by score. O(N) best case, O(wanted*N) worst case
155-
partialSortByAscScore :: TextualMonoid s
156-
=> Int -- ^ Number of items needed
116+
partialSortByAscScore
117+
:: Int -- ^ Number of items needed
157118
-> Int -- ^ Value of a perfect score
158-
-> Vector (Fuzzy t s)
159119
-> [Scored t]
160-
partialSortByAscScore wantedCount perfectScore v = loop 0 (SortState minBound perfectScore 0) [] where
161-
l = V.length v
162-
loop index st@SortState{..} acc
120+
-> [Scored t]
121+
partialSortByAscScore wantedCount perfectScore orig = loop orig (SortState minBound perfectScore 0) [] where
122+
loop [] st@SortState{..} acc
163123
| foundCount == wantedCount = reverse acc
164-
| index == l
165-
-- ProgressCancelledException
166-
= if bestScoreSeen < scoreWanted
167-
then loop 0 st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound} acc
124+
| otherwise = if bestScoreSeen < scoreWanted
125+
then loop orig st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound} acc
168126
else reverse acc
169-
| otherwise =
170-
case v!index of
171-
x | score x == scoreWanted
172-
-> loop (index+1) st{foundCount = foundCount+1} (toScored x:acc)
173-
| score x < scoreWanted && score x > bestScoreSeen
174-
-> loop (index+1) st{bestScoreSeen = score x} acc
175-
| otherwise
176-
-> loop (index+1) st acc
177-
178-
toScored :: TextualMonoid s => Fuzzy t s -> Scored t
179-
toScored Fuzzy{..} = Scored score original
127+
loop (x : xs) st@SortState{..} acc
128+
| foundCount == wantedCount = reverse acc
129+
| score x == scoreWanted
130+
= loop xs st{foundCount = foundCount+1} (x:acc)
131+
| score x < scoreWanted && score x > bestScoreSeen
132+
= loop xs st{bestScoreSeen = score x} acc
133+
| otherwise
134+
= loop xs st acc
180135

181136
data SortState a = SortState
182137
{ bestScoreSeen :: !Int

0 commit comments

Comments
 (0)