Skip to content

Commit

Permalink
Update Aho-Corasick (#66)
Browse files Browse the repository at this point in the history
- Build the trie strictly. This has an up front cost, but saves time
  later when matching.
- Force the result early in the alter function in insertTAC. Improves
  construction times a bit.
- Replace the ACLink type with Maybe (ACNode a). This lets us use the
  value from IM.lookup directly, and simplifies the code. Doesn't seem
  to have an observable effect on benchmarks.
- Update the NFData instances to not rnf outs. It was there since random
  benchmarks are unlikely to hit O(n^2), but we can do without it.
- Use ByteString instead of ByteString.Char8. Char doesn't give us
  anything here.
- Update complexities to be more accurate.
- Other small implementation changes.
  • Loading branch information
meooow25 authored Feb 22, 2024
1 parent b9577e7 commit d121780
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 47 deletions.
8 changes: 4 additions & 4 deletions bench-out/bench-out.csv
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
Name,Mean,MeanLB,MeanUB,Stddev,StddevLB,StddevUB
AhoCorasick/build from few long/100,8.513422680432697e-6,8.488458633778344e-6,8.546705080177561e-6,9.969128415851875e-8,6.874945026166872e-8,1.2551484185948497e-7
AhoCorasick/build from few long/10000,1.7882787405552921e-3,1.7827405684590716e-3,1.800476608986399e-3,2.6489400757498618e-5,1.527119101355923e-5,4.8771893418822536e-5
AhoCorasick/build from few long/1000000,0.23589482944379495,0.23200599944383513,0.24093268611088406,5.659296396731304e-3,2.7851112029909936e-3,8.843550322711289e-3
AhoCorasick/build from few long/500000,0.2870396793284454,0.2765157213341444,0.29758385999011805,1.3635218294303319e-2,1.0899630511983244e-2,1.5095991361441054e-2
AhoCorasick/match few long/100,2.2989241446890707e-6,2.2842848889416615e-6,2.3288720778934305e-6,6.554756497943676e-8,4.269301973156141e-8,1.0536142485745498e-7
AhoCorasick/match few long/10000,3.507593566002027e-4,3.503681577904525e-4,3.5142937242386426e-4,1.6338270733078287e-6,1.092881336848686e-6,2.3304923525208636e-6
AhoCorasick/match few long/1000000,3.656548186761784e-2,3.652561409766825e-2,3.6622430453914626e-2,9.672715275543123e-5,7.193501134881048e-5,1.2520862590684369e-4
AhoCorasick/match few long/500000,1.6762486934380014e-2,1.67489177556701e-2,1.6781974996993375e-2,3.836560959812748e-5,2.6144824792401538e-5,5.4936047377959083e-5
AhoCorasick/build from many short/100,7.620756091721171e-6,7.600920298876813e-6,7.653999891156354e-6,8.583816941679067e-8,6.086177964688925e-8,1.1094562635143705e-7
AhoCorasick/build from many short/10000,2.3220817337715476e-3,2.3162026496528912e-3,2.331857585301445e-3,2.41756256291307e-5,1.6287139995232412e-5,4.143502420584936e-5
AhoCorasick/build from many short/1000000,0.5978469729166894,0.5650704708333858,0.630623474999993,4.111337847925013e-2,2.18953305723231e-2,5.379875863123568e-2
AhoCorasick/build from many short/500000,0.28803835700508595,0.27732268700230633,0.3016158820060082,1.5092218078034075e-2,9.070650038611277e-3,1.903580993386635e-2
AhoCorasick/match many short/100,1.4163937802165651e-6,1.4148630918882928e-6,1.4183771713672137e-6,5.718962793971818e-9,4.258549520434209e-9,7.600752211313617e-9
AhoCorasick/match many short/10000,5.07770151244229e-4,5.070275720290223e-4,5.087876294835994e-4,3.054487854355818e-6,1.992049199816893e-6,5.195600275955076e-6
AhoCorasick/match many short/1000000,0.1395871239583336,0.1380405541666697,0.14196939375000284,2.908507943265563e-3,4.963682171872912e-4,3.706065436758448e-3
AhoCorasick/match many short/500000,5.7279700557087754e-2,5.6926181796010715e-2,5.7917360827559605e-2,8.894385491499692e-4,5.265095924127337e-4,1.3826368162908529e-3
Array/Arr/listArray @UArr @X/100,9.069885103472647e-7,9.035551382906603e-7,9.112235943143424e-7,1.2415186423055452e-8,1.1004677274345127e-8,1.4144533541032416e-8
Array/Arr/listArray @UArr @X/10000,8.69223926122827e-5,8.683319070226885e-5,8.703410027438146e-5,3.2818784665200795e-7,2.7162401018303904e-7,3.940282006642692e-7
Array/Arr/listArray @UArr @X/1000000,1.192827563639611e-2,1.1582270679330178e-2,1.244559316429011e-2,1.1138309892690973e-3,6.627342910107843e-4,1.7758030775786929e-3
Expand Down
10 changes: 5 additions & 5 deletions bench-out/bench-out.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ For details about any benchmark see its source file.

AhoCorasick
┌───────────────────────┬──────────┬──────────┬──────────┐
│ Name │ 100 │ 10000 │ 1000000
│ Name │ 100 │ 10000 │ 500000
╞═══════════════════════╪══════════╪══════════╪══════════╡
│ build from few long │ 8.513 μs │ 1.788 ms │ 235.9 ms │
│ build from few long │ 8.513 μs │ 1.788 ms │ 287.0 ms │
├───────────────────────┼──────────┼──────────┼──────────┤
│ match few long │ 2.299 μs │ 350.8 μs │ 36.57 ms │
│ match few long │ 2.299 μs │ 350.8 μs │ 16.76 ms │
├───────────────────────┼──────────┼──────────┼──────────┤
│ build from many short │ 7.621 μs │ 2.322 ms │ 597.8 ms │
│ build from many short │ 7.621 μs │ 2.322 ms │ 288.0 ms │
├───────────────────────┼──────────┼──────────┼──────────┤
│ match many short │ 1.416 μs │ 507.8 μs │ 139.6 ms │
│ match many short │ 1.416 μs │ 507.8 μs │ 57.28 ms │
└───────────────────────┴──────────┴──────────┴──────────┘

Array
Expand Down
4 changes: 2 additions & 2 deletions bench/AhoCorasickBench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ benchmark = bgroup "AhoCorasick"

-- Build the Aho-Corasick automaton from n/20 a-z strings of length 20.
, bgroup "build from many short" $ map (benchBuild genMany) sizes

-- Match an Aho-Corasick automaton built from n/20 a-z strings of length 20 on a string of
-- length n.
, bgroup "match many short" $ map (benchMatch genMany) sizes
]

sizes :: [Int]
sizes = [100, 10000, 1000000]
sizes = [100, 10000, 500000]

benchBuild :: (Int -> RandStd [(C.ByteString, Int)]) -> Int -> Benchmark
benchBuild genps n = sizedBench n gen $ nf (fromTrieAC . fromListTAC) where
Expand Down
78 changes: 42 additions & 36 deletions src/AhoCorasick.hs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
{-# LANGUAGE BangPatterns #-}

{-|
Aho-Corasick algorithm
The Aho-Corasick algorithm builds an automaton from a set of pattern strings, and then uses it to
find positions in a search string where each of the pattern strings occur.
This implementation only works on ByteStrings, to keep things fast. If required it can be adapted
to work on Strings, or even more generally (Ord a, Foldable f) => f a.
to work on other sequence types.
A TrieAC a can be constructed from pattern strings with associated values a, which can then be
turned into an ACRoot a. An ACRoot a can then be run on a search string to find matches.
Expand All @@ -18,18 +20,22 @@ Sources:
* Stanford CS166 Aho-Corasick lecture slides
https://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/04/Slides04.pdf
Let k be the alphabet size. Let the complexity of IntMap operations be f(n), where n is the size of
the map. f(n) is O(min(n, word size)), see IntMap documentation for details.
Implementation notes:
* We have to be lazy in the (Maybe (ACNode a)) and the [a] in fromTrieAC because we build the tree
depth-first and strictly (due to IntMap.Strict). If we could build it breadth-first, then we
could be strict in these, but I don't see an easy way to do that.
For complexities below, k is the alphabet range (max 256).
emptyTAC
An empty trie.
insertTAC
Inserts a string with an associated value into a trie. O(n * f(k)) where n is the length of the
Inserts a string with an associated value into a trie. O(n log k) where n is the length of the
string.
fromListTAC
Builds a trie from a list of strings and associated values. O(n * f(k)) where n is total length of
Builds a trie from a list of strings and associated values. O(n log k) where n is total length of
the strings.
fromTrieAC
Expand All @@ -41,7 +47,7 @@ Returns a list of length (m + 1) where m is the length of the search string. Thi
list of pattern matches for every position in the string, including before the first character. A
match at a position is present as the associated value of the pattern string found to be ending at
that position.
O(m * f(k) + z), where m is the length of the string and z is the total number of matches.
O(m log k + z), where m is the length of the string and z is the total number of matches.
-}

module AhoCorasick
Expand All @@ -54,61 +60,61 @@ module AhoCorasick
, matchAC
) where

import Control.Applicative
import Control.DeepSeq
import Data.List
import Data.Maybe
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString as B
import qualified Data.IntMap.Strict as IM

data ACRoot a = ACRoot (IM.IntMap (ACNode a)) [a]
data ACNode a = ACNode (IM.IntMap (ACNode a)) [a] (ACLink a)
data ACLink a = RootL | NodeL !(ACNode a)
data ACRoot a = ACRoot !(IM.IntMap (ACNode a)) [a]
data ACNode a = ACNode !(IM.IntMap (ACNode a)) (Maybe (ACNode a)) [a]

fromTrieAC :: TrieAC a -> ACRoot a
fromTrieAC (TrieAC tm tvs) = ACRoot rmp tvs where
fromTrieAC (TrieAC tm routs) = ACRoot rmp routs where
rmp = IM.map go1 tm
go1 (TrieAC m vs) = ACNode (IM.mapWithKey (go RootL) m) (vs ++ tvs) RootL
go psuf c (TrieAC m vs) = ACNode (IM.mapWithKey (go suf) m) outs suf where
go1 (TrieAC m vs) = ACNode (IM.mapWithKey (go Nothing) m) Nothing (vs ++ routs)
go psuf !c (TrieAC m vs) = ACNode (IM.mapWithKey (go suf) m) suf outs where
suf = getSuf psuf
getSuf RootL = maybe RootL NodeL (IM.lookup c rmp)
getSuf (NodeL (ACNode mp' _ suf')) = maybe (getSuf suf') NodeL (IM.lookup c mp')
outs = vs ++ case suf of
RootL -> tvs
NodeL (ACNode _ outs' _) -> outs'

matchAC :: ACRoot a -> C.ByteString -> [[a]]
matchAC (ACRoot rmp routs) = (routs:) . go1 where
go1 = go rmp $ const ((routs:) . go1)
go2 (ACNode mp _ suf) = go mp $ const . case suf of
RootL -> go1
NodeL x -> go2 x
go mp miss s = case C.uncons s of
getSuf Nothing = IM.lookup c rmp
getSuf (Just (ACNode mp' suf' _)) = IM.lookup c mp' <|> getSuf suf'
outs = vs ++ maybe routs (\(ACNode _ _ outs') -> outs') suf

matchAC :: ACRoot a -> B.ByteString -> [[a]]
matchAC (ACRoot rmp routs) !s0 = routs : gor s0 where
gor s = case B.uncons s of
Nothing -> []
Just (c,s') -> case IM.lookup (fromEnum c) rmp of
Nothing -> routs : gor s'
Just (ACNode mp suf outs) -> outs : go mp suf s'
go mp suf s = case B.uncons s of
Nothing -> []
Just (c, s') -> case IM.lookup (fromEnum c) mp of
Nothing -> miss s s'
Just x@(ACNode _ outs _) -> outs : go2 x s'
Nothing -> maybe gor (\(ACNode mp' suf' _) -> go mp' suf') suf s
Just (ACNode mp' suf' outs) -> outs : go mp' suf' s'

data TrieAC a = TrieAC (IM.IntMap (TrieAC a)) [a] deriving Show
data TrieAC a = TrieAC !(IM.IntMap (TrieAC a)) ![a] deriving Show

-- | The trie holding no pattern strings: it has no child edges and no
-- associated values at the root.
emptyTAC :: TrieAC a
emptyTAC = TrieAC mempty mempty

insertTAC :: C.ByteString -> a -> TrieAC a -> TrieAC a
insertTAC :: B.ByteString -> a -> TrieAC a -> TrieAC a
insertTAC s v = go s where
go cs (TrieAC m vs) = case C.uncons cs of
go cs (TrieAC m vs) = case B.uncons cs of
Nothing -> TrieAC m (v:vs)
Just (c, cs') -> TrieAC m' vs where
m' = IM.alter (Just . go cs' . fromMaybe emptyTAC) (fromEnum c) m
m' = IM.alter ((Just $!) . go cs' . fromMaybe emptyTAC) (fromIntegral c) m

fromListTAC :: [(C.ByteString, a)] -> TrieAC a
fromListTAC :: [(B.ByteString, a)] -> TrieAC a
fromListTAC = foldl' (\t (s, v) -> insertTAC s v t) emptyTAC

--------------------------------------------------------------------------------
-- For tests

-- outs of nodes share structure, so rnf is O(n^2)
instance NFData a => NFData (ACNode a) where
rnf (ACNode mp outs suf) = rnf outs `seq` suf `seq` rnf mp
rnf (ACNode mp suf _outs) = suf `seq` rnf mp
-- The fields of ACNode are, in order: the child map, the suffix link, the outs.
-- The pattern must follow that order; naming the last two the other way round
-- would force the outs list to WHNF and leave the suffix link untouched.
-- outs of nodes share structure, so it is not forced
-- the suf link is forced only to WHNF, otherwise it would be reevaluating various parts of the tree

instance NFData a => NFData (ACRoot a) where
rnf (ACRoot mp outs) = rnf outs `seq` rnf mp
rnf (ACRoot mp _outs) = rnf mp

0 comments on commit d121780

Please sign in to comment.