Created
March 29, 2014 17:59
-
-
Save pxqr/9859111 to your computer and use it in GitHub Desktop.
bloom filters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: bloom-filters | |
version: 0.1.0.0 | |
synopsis: A collection of bloom filters. | |
description: A collection of bloom filters: | |
bloom filter, stable bloom filter and their mutable versions. | |
license: MIT | |
license-file: LICENSE | |
author: Sam T. | |
maintainer: Sam T. <[email protected]> | |
copyright: (c) 2013, Sam T. | |
stability: experimental | |
category: Data | |
build-type: Simple | |
cabal-version: >=1.8 | |
extra-source-files: README.md | |
source-repository head | |
type: git | |
location: https://github.com/fmap/bloom-filters.git | |
flag testing | |
description: Verbose testing mode. | |
default: False | |
library | |
hs-source-dirs: src | |
exposed-modules: Data.Filter.Bloom | |
Data.Filter.Bloom.Mutable | |
other-modules: Data.Filter.Bloom.Common | |
Data.Filter.Bloom.Stable | |
Data.Filter.Bloom.Stable.Mutable | |
build-depends: base == 4.5.* | |
, monad-loops >= 0.3.3.0 | |
, primitive >= 0.4.1 | |
, vector >= 0.9.1 | |
, random >= 1.0.1.1 | |
, mtl >= 2.1.1 | |
ghc-options: -O2 -Wall | |
if flag(testing) | |
hs-source-dirs: tests | |
ghc-options: -Werror | |
test-suite properties | |
type: exitcode-stdio-1.0 | |
main-is: tests/main.hs | |
build-depends: base == 4.5.* | |
, monad-loops >= 0.3.3.0 | |
, bloom-filters | |
, test-framework | |
, test-framework-quickcheck2 | |
, murmur-hash >= 0.1.0.6 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Data.Filter.Bloom | |
( Bloom(..), HashFun | |
-- ^ Construction | |
, empty | |
-- ^ Query | |
, member, notMember | |
-- ^ Combine | |
, intersection, union | |
-- ^ Conversion | |
, freeze, thaw, unsafeFreeze, unsafeThaw | |
) where | |
import Control.Monad.Primitive | |
import qualified Data.Vector.Unboxed as V | |
import Data.Vector.Unboxed (Vector) | |
import Data.Filter.Bloom.Common | |
import Data.Filter.Bloom.Mutable (MBloom(MBloom), HashFun) | |
data Bloom a = Bloom { | |
bloomHash :: HashFun a | |
, bloomBits :: Vector Bool | |
} | |
empty :: Int -> HashFun a -> Bloom a | |
empty m hs = Bloom hs (V.replicate m False) | |
-- | /O(K)/. Membership test. | |
member :: a -> Bloom a -> Bool | |
member key (Bloom hs bits) = all (hashIx bits) (hs key) | |
-- | /O(K)/. In general case 'notMember' is more efficient than 'not' '.' 'member'. | |
notMember :: a -> Bloom a -> Bool | |
notMember key (Bloom hs bits) = any (not . hashIx bits) (hs key) | |
-- | Note that you can only use the function when both filters have | |
-- the same set of hash functions and the same 'size'! | |
intersection :: Bloom a -> Bloom a -> Bloom a | |
intersection (Bloom hs a) (Bloom _ b) = Bloom hs (V.zipWith (&&) a b) | |
-- | Note that you can only use the function when both filters have | |
-- the same set of hash functions and the same 'size'! | |
union :: Bloom a -> Bloom a -> Bloom a | |
union (Bloom hs a) (Bloom _ b) = Bloom hs (V.zipWith (||) a b) | |
freeze :: PrimMonad m => MBloom m a -> m (Bloom a) | |
freeze (MBloom hs bits) = do | |
v <- V.unsafeFreeze bits | |
return $ Bloom hs v | |
{-# INLINE freeze #-} | |
thaw :: PrimMonad m => Bloom a -> m (MBloom m a) | |
thaw (Bloom hs bits) = do | |
v <- V.thaw bits | |
return $ MBloom hs v | |
{-# INLINE thaw #-} | |
unsafeFreeze :: PrimMonad m => MBloom m a -> m (Bloom a) | |
unsafeFreeze (MBloom hs bits) = do | |
v <- V.unsafeFreeze bits | |
return $ Bloom hs v | |
{-# INLINE unsafeFreeze #-} | |
unsafeThaw :: PrimMonad m => Bloom a -> m (MBloom m a) | |
unsafeThaw (Bloom hs bits) = do | |
v <- V.unsafeThaw bits | |
return $ MBloom hs v | |
{-# INLINE unsafeThaw #-} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Data.Filter.Bloom.Common | |
( hashIx, hashIxM | |
, zeroCount, printBits | |
) where | |
import Control.Monad.Primitive | |
import qualified Data.Vector.Unboxed as UV | |
import Data.Vector.Unboxed.Mutable as V | |
hashIx :: UV.Vector Bool -> Int -> Bool | |
hashIx bits ix = bits UV.! (ix `mod` UV.length bits) | |
{-# INLINE hashIx #-} | |
hashIxM :: (PrimMonad m, Unbox a) => MVector (PrimState m) a -> Int -> m a | |
hashIxM bits ix = V.unsafeRead bits (ix `mod` V.length bits) | |
{-# INLINE hashIxM #-} | |
zeroCount :: (PrimMonad m, Unbox a, Eq a, Bounded a) => MVector (PrimState m) a -> m Int | |
zeroCount mv = do | |
v <- UV.freeze mv | |
return $ UV.foldr (\x acc -> acc + fromEnum (x == minBound)) 0 v | |
printBits :: (Unbox a, Enum a) => MVector RealWorld a -> IO () | |
printBits bs = do | |
v <- UV.unsafeFreeze bs | |
mapM_ (print . fromEnum) (UV.toList v) | |
putStrLn "" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- | /K/ --- number of hash functions. | |
module Data.Filter.Bloom.Mutable | |
( MBloom(..), BloomIO, BloomST, HashFun | |
-- ^ Construction | |
, empty, copy | |
-- ^ Query | |
, member, notMember, insert, insertion, size | |
-- ^ Combine | |
, intersection, union | |
-- ^ Debug | |
, zerosFraction, printBloom | |
) where | |
import Control.Monad | |
import Control.Monad.Loops | |
import Control.Monad.ST | |
import Control.Monad.Primitive | |
import qualified Data.Vector.Unboxed as UV | |
import qualified Data.Vector.Unboxed.Mutable as V | |
import Data.Vector.Unboxed.Mutable (MVector, Unbox) | |
import Data.Filter.Bloom.Common | |
type HashFun a = a -> [Int] | |
data MBloom m a = MBloom { | |
bloomHash :: HashFun a | |
, bloomBits :: MVector (PrimState m) Bool | |
} | |
type BloomIO = MBloom IO | |
type BloomST s = MBloom (ST s) | |
-- <todo> guess size from number of functions and vice versa | |
empty :: PrimMonad m => Int -> HashFun a -> m (MBloom m a) | |
empty m h = do | |
v <- V.new m | |
V.set v False | |
return $ MBloom h v | |
copy :: PrimMonad m => MBloom m a -> m (MBloom m a) | |
copy (MBloom hs bits) = do | |
i <- UV.unsafeFreeze bits | |
v <- V.new (V.length bits) | |
UV.copy v i | |
return $ MBloom hs v | |
-- | O(K). Membership test. | |
member :: PrimMonad m => a -> MBloom m a -> m Bool | |
member key (MBloom hs bits) = allM (hashIxM bits) (hs key) | |
-- | /O(K)/. In general case 'notMember' is more efficient than 'not' '.' 'member'. | |
notMember :: PrimMonad m => a -> MBloom m a -> m Bool | |
notMember key (MBloom hs bits) = anyM (\ix -> hashIxM bits ix >>= return . not) (hs key) | |
-- | O(K). | |
insert :: PrimMonad m => a -> MBloom m a -> m () | |
insert key (MBloom hs bits) = | |
forM_ (hs key) $ \ix -> | |
V.unsafeWrite bits (ix `mod` V.length bits) True | |
-- | Insert a key in filter and return if a given key have been there | |
-- _before_ the insertion operaition. It's actually just a slitghtly more | |
-- efficient version of: | |
-- /do { b <- member key bf; insert key bf; return b }/ | |
-- because hash function is applied just once insteade of twice. | |
insertion :: PrimMonad m => a -> MBloom m a -> m Bool | |
insertion key (MBloom hs bits) = do | |
let ixs = hs key | |
oc <- allM (hashIxM bits) ixs | |
unless oc $ | |
forM_ ixs $ \ix -> | |
V.unsafeWrite bits (ix `mod` V.length bits) True | |
return oc | |
-- | Width of bloom filter in bits. Usually denoted as /m/. | |
size :: MBloom m a -> Int | |
size = V.length . bloomBits | |
-- | Note that you can only use the function when both filters have | |
-- the same set of hash functions and the same 'size'! | |
intersection :: PrimMonad m => MBloom m a -> MBloom m a -> m (MBloom m a) | |
intersection a@(MBloom afs abits) b@(MBloom _ bbits) = assertLengths a b $ do | |
v <- unsafeZipWith (&&) abits bbits | |
return $ MBloom afs v | |
-- | Note that you can only use the function when both filters have | |
-- the same set of hash functions and the same 'size'! | |
union :: PrimMonad m => MBloom m a -> MBloom m a -> m (MBloom m a) | |
union a@(MBloom afs abits) b@(MBloom _ bbits) = assertLengths a b $ do | |
v <- unsafeZipWith (||) abits bbits | |
return $ MBloom afs v | |
zerosFraction :: PrimMonad m => MBloom m a -> m Double | |
zerosFraction (MBloom _ bits) = do | |
count <- zeroCount bits | |
return $ fromIntegral count / fromIntegral (V.length bits) | |
printBloom :: MBloom IO a -> IO () | |
printBloom (MBloom _ bits) = printBits bits | |
------------------------------- Helpers ---------------------------------------- | |
unsafeZipWith :: (PrimMonad m, Unbox a, Unbox b, Unbox c) => | |
(a -> b -> c) -> | |
MVector (PrimState m) a -> | |
MVector (PrimState m) b -> | |
m (MVector (PrimState m) c) | |
unsafeZipWith (<>) a b = do | |
av <- UV.unsafeFreeze a | |
bv <- UV.unsafeFreeze b | |
rv <- UV.unsafeThaw $ UV.zipWith (<>) av bv | |
return rv | |
assertLengths :: MBloom m a -> MBloom m a -> b -> b | |
assertLengths (MBloom _ abits) (MBloom _ bbits) res | |
| V.length abits /= V.length bbits = | |
error "assertLength: size of filters do not match" | |
| otherwise = res |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Data.Filter.Bloom.Stable where | |
import Control.Monad.Primitive | |
import Data.Vector.Unboxed.Mutable | |
import Data.Word | |
data Bloom m a = Bloom { | |
bloomHash :: [a -> Int] | |
, bloomBits :: MVector (PrimState m) Word8 | |
} | |
empty :: m (Bloom m a) | |
empty = undefined | |
null :: Bloom m a -> m Bool | |
null = undefined | |
insert :: a -> Bloom m a -> m () | |
insert = undefined | |
member :: a -> Bloom m a -> m Bool | |
member = undefined | |
intersection :: Bloom m a -> Bloom m a -> m (Bloom m a) | |
intersection = undefined | |
union :: Bloom m a -> Bloom m a -> m (Bloom m a) | |
union = undefined |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE ConstraintKinds #-} | |
module Data.Filter.Bloom.Stable.Mutable | |
( MBloom(..) | |
-- ^ Construction | |
, empty | |
-- ^ Query | |
, member, notMember, insert | |
) where | |
import Control.Monad.Loops | |
import Control.Monad.Primitive | |
import Control.Monad.State | |
import Data.Vector.Unboxed.Mutable as V | |
import Data.Filter.Bloom.Common | |
import System.Random | |
-- | The first type param, namely /c/, denotes type of random gen, | |
-- which should be instance of System.Random.RandomGen. | |
-- The second type param, namely /c/, is type of cells. | |
data MBloom c m a = MBloom { | |
bloomHash :: [a -> Int] | |
, bloomBits :: MVector (PrimState m) c | |
} | |
-- | Constraint for cell types. | |
type Cell a = (Ord a, Bounded a, Enum a, Unbox a) | |
empty :: (PrimMonad m, Cell c) => Int -> [a -> Int] -> m (MBloom c m a) | |
empty m hs = do | |
v <- V.new m | |
V.set v minBound | |
return $ MBloom hs v | |
-- | O(K). Membership test. | |
member :: (PrimMonad m, Cell c) => a -> MBloom c m a -> m Bool | |
member key (MBloom hs bits) = allM test hs | |
where | |
test f = do | |
cell <- hashIxM bits (f key) | |
return (cell > minBound) | |
-- | O(K) | |
notMember :: (PrimMonad m, Cell c) => a -> MBloom c m a -> m Bool | |
notMember key (MBloom hs bits) = anyM test hs | |
where | |
test f = do | |
cell <- hashIxM bits (f key) | |
return (cell == minBound) | |
-- | Decrement a P uniformly choosen cells. | |
fresh :: (PrimMonad m, RandomGen g, Cell c) => MBloom c m a -> StateT g m () | |
fresh (MBloom _ bits) = forM_ [1..10 :: Int] $ \_ -> do | |
i <- state next | |
lift $ do | |
cell <- V.unsafeRead bits i | |
if cell > minBound then V.unsafeWrite bits i (pred cell) else return () | |
insert :: (PrimMonad m, RandomGen g, Cell c) => a -> MBloom c m a -> StateT g m () | |
insert key b@(MBloom hs bits) = do | |
fresh b | |
lift $ forM_ hs $ \f -> do | |
V.unsafeWrite bits (f key) maxBound |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Control.Monad.ST | |
import Control.Monad.Loops | |
import Data.Filter.Bloom.Mutable | |
import Test.Framework | |
import Test.Framework.Providers.QuickCheck2 | |
prop_insertMember :: [String] -> Bool | |
prop_insertMember keys = runST $ do | |
bf <- empty 100 (return . length) | |
a <- allM (\key -> insert key bf >> member key bf) keys | |
b <- allM (`member` bf) keys | |
return (a && b) | |
main :: IO () | |
main = defaultMain | |
[ testProperty "insert-member" prop_insertMember | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment