Skip to content

Instantly share code, notes, and snippets.

@pxqr
Created March 29, 2014 17:59
Show Gist options
  • Save pxqr/9859111 to your computer and use it in GitHub Desktop.
Save pxqr/9859111 to your computer and use it in GitHub Desktop.
bloom filters
name: bloom-filters
version: 0.1.0.0
synopsis: A collection of bloom filters.
description: A collection of bloom filters:
bloom filter, stable bloom filter and their mutable versions.
license: MIT
license-file: LICENSE
author: Sam T.
maintainer: Sam T. <[email protected]>
copyright: (c) 2013, Sam T.
stability: experimental
category: Data
build-type: Simple
cabal-version: >=1.8
extra-source-files: README.md
source-repository head
type: git
location: https://github.com/fmap/bloom-filters.git
flag testing
description: Verbose testing mode.
default: False
library
hs-source-dirs: src
exposed-modules: Data.Filter.Bloom
Data.Filter.Bloom.Mutable
other-modules: Data.Filter.Bloom.Common
Data.Filter.Bloom.Stable
Data.Filter.Bloom.Stable.Mutable
build-depends: base == 4.5.*
, monad-loops >= 0.3.3.0
, primitive >= 0.4.1
, vector >= 0.9.1
, random >= 1.0.1.1
, mtl >= 2.1.1
ghc-options: -O2 -Wall
if flag(testing)
hs-source-dirs: tests
ghc-options: -Werror
test-suite properties
type: exitcode-stdio-1.0
main-is: tests/main.hs
build-depends: base == 4.5.*
, monad-loops >= 0.3.3.0
, bloom-filters
, test-framework
, test-framework-quickcheck2
, murmur-hash >= 0.1.0.6
module Data.Filter.Bloom
( Bloom(..), HashFun
-- ^ Construction
, empty
-- ^ Query
, member, notMember
-- ^ Combine
, intersection, union
-- ^ Conversion
, freeze, thaw, unsafeFreeze, unsafeThaw
) where
import Control.Monad.Primitive
import qualified Data.Vector.Unboxed as V
import Data.Vector.Unboxed (Vector)
import Data.Filter.Bloom.Common
import Data.Filter.Bloom.Mutable (MBloom(MBloom), HashFun)
data Bloom a = Bloom {
bloomHash :: HashFun a
, bloomBits :: Vector Bool
}
empty :: Int -> HashFun a -> Bloom a
empty m hs = Bloom hs (V.replicate m False)
-- | /O(K)/. Membership test.
member :: a -> Bloom a -> Bool
member key (Bloom hs bits) = all (hashIx bits) (hs key)
-- | /O(K)/. In general case 'notMember' is more efficient than 'not' '.' 'member'.
notMember :: a -> Bloom a -> Bool
notMember key (Bloom hs bits) = any (not . hashIx bits) (hs key)
-- | Note that you can only use the function when both filters have
-- the same set of hash functions and the same 'size'!
intersection :: Bloom a -> Bloom a -> Bloom a
intersection (Bloom hs a) (Bloom _ b) = Bloom hs (V.zipWith (&&) a b)
-- | Note that you can only use the function when both filters have
-- the same set of hash functions and the same 'size'!
union :: Bloom a -> Bloom a -> Bloom a
union (Bloom hs a) (Bloom _ b) = Bloom hs (V.zipWith (||) a b)
freeze :: PrimMonad m => MBloom m a -> m (Bloom a)
freeze (MBloom hs bits) = do
v <- V.unsafeFreeze bits
return $ Bloom hs v
{-# INLINE freeze #-}
thaw :: PrimMonad m => Bloom a -> m (MBloom m a)
thaw (Bloom hs bits) = do
v <- V.thaw bits
return $ MBloom hs v
{-# INLINE thaw #-}
unsafeFreeze :: PrimMonad m => MBloom m a -> m (Bloom a)
unsafeFreeze (MBloom hs bits) = do
v <- V.unsafeFreeze bits
return $ Bloom hs v
{-# INLINE unsafeFreeze #-}
unsafeThaw :: PrimMonad m => Bloom a -> m (MBloom m a)
unsafeThaw (Bloom hs bits) = do
v <- V.unsafeThaw bits
return $ MBloom hs v
{-# INLINE unsafeThaw #-}
module Data.Filter.Bloom.Common
( hashIx, hashIxM
, zeroCount, printBits
) where
import Control.Monad.Primitive
import qualified Data.Vector.Unboxed as UV
import Data.Vector.Unboxed.Mutable as V
hashIx :: UV.Vector Bool -> Int -> Bool
hashIx bits ix = bits UV.! (ix `mod` UV.length bits)
{-# INLINE hashIx #-}
hashIxM :: (PrimMonad m, Unbox a) => MVector (PrimState m) a -> Int -> m a
hashIxM bits ix = V.unsafeRead bits (ix `mod` V.length bits)
{-# INLINE hashIxM #-}
zeroCount :: (PrimMonad m, Unbox a, Eq a, Bounded a) => MVector (PrimState m) a -> m Int
zeroCount mv = do
v <- UV.freeze mv
return $ UV.foldr (\x acc -> acc + fromEnum (x == minBound)) 0 v
printBits :: (Unbox a, Enum a) => MVector RealWorld a -> IO ()
printBits bs = do
v <- UV.unsafeFreeze bs
mapM_ (print . fromEnum) (UV.toList v)
putStrLn ""
-- | /K/ --- number of hash functions.
module Data.Filter.Bloom.Mutable
( MBloom(..), BloomIO, BloomST, HashFun
-- ^ Construction
, empty, copy
-- ^ Query
, member, notMember, insert, insertion, size
-- ^ Combine
, intersection, union
-- ^ Debug
, zerosFraction, printBloom
) where
import Control.Monad
import Control.Monad.Loops
import Control.Monad.ST
import Control.Monad.Primitive
import qualified Data.Vector.Unboxed as UV
import qualified Data.Vector.Unboxed.Mutable as V
import Data.Vector.Unboxed.Mutable (MVector, Unbox)
import Data.Filter.Bloom.Common
type HashFun a = a -> [Int]
data MBloom m a = MBloom {
bloomHash :: HashFun a
, bloomBits :: MVector (PrimState m) Bool
}
type BloomIO = MBloom IO
type BloomST s = MBloom (ST s)
-- <todo> guess size from number of functions and vice versa
empty :: PrimMonad m => Int -> HashFun a -> m (MBloom m a)
empty m h = do
v <- V.new m
V.set v False
return $ MBloom h v
copy :: PrimMonad m => MBloom m a -> m (MBloom m a)
copy (MBloom hs bits) = do
i <- UV.unsafeFreeze bits
v <- V.new (V.length bits)
UV.copy v i
return $ MBloom hs v
-- | O(K). Membership test.
member :: PrimMonad m => a -> MBloom m a -> m Bool
member key (MBloom hs bits) = allM (hashIxM bits) (hs key)
-- | /O(K)/. In general case 'notMember' is more efficient than 'not' '.' 'member'.
notMember :: PrimMonad m => a -> MBloom m a -> m Bool
notMember key (MBloom hs bits) = anyM (\ix -> hashIxM bits ix >>= return . not) (hs key)
-- | O(K).
insert :: PrimMonad m => a -> MBloom m a -> m ()
insert key (MBloom hs bits) =
forM_ (hs key) $ \ix ->
V.unsafeWrite bits (ix `mod` V.length bits) True
-- | Insert a key in filter and return if a given key have been there
-- _before_ the insertion operaition. It's actually just a slitghtly more
-- efficient version of:
-- /do { b <- member key bf; insert key bf; return b }/
-- because hash function is applied just once insteade of twice.
insertion :: PrimMonad m => a -> MBloom m a -> m Bool
insertion key (MBloom hs bits) = do
let ixs = hs key
oc <- allM (hashIxM bits) ixs
unless oc $
forM_ ixs $ \ix ->
V.unsafeWrite bits (ix `mod` V.length bits) True
return oc
-- | Width of bloom filter in bits. Usually denoted as /m/.
size :: MBloom m a -> Int
size = V.length . bloomBits
-- | Note that you can only use the function when both filters have
-- the same set of hash functions and the same 'size'!
intersection :: PrimMonad m => MBloom m a -> MBloom m a -> m (MBloom m a)
intersection a@(MBloom afs abits) b@(MBloom _ bbits) = assertLengths a b $ do
v <- unsafeZipWith (&&) abits bbits
return $ MBloom afs v
-- | Note that you can only use the function when both filters have
-- the same set of hash functions and the same 'size'!
union :: PrimMonad m => MBloom m a -> MBloom m a -> m (MBloom m a)
union a@(MBloom afs abits) b@(MBloom _ bbits) = assertLengths a b $ do
v <- unsafeZipWith (||) abits bbits
return $ MBloom afs v
zerosFraction :: PrimMonad m => MBloom m a -> m Double
zerosFraction (MBloom _ bits) = do
count <- zeroCount bits
return $ fromIntegral count / fromIntegral (V.length bits)
printBloom :: MBloom IO a -> IO ()
printBloom (MBloom _ bits) = printBits bits
------------------------------- Helpers ----------------------------------------
unsafeZipWith :: (PrimMonad m, Unbox a, Unbox b, Unbox c) =>
(a -> b -> c) ->
MVector (PrimState m) a ->
MVector (PrimState m) b ->
m (MVector (PrimState m) c)
unsafeZipWith (<>) a b = do
av <- UV.unsafeFreeze a
bv <- UV.unsafeFreeze b
rv <- UV.unsafeThaw $ UV.zipWith (<>) av bv
return rv
assertLengths :: MBloom m a -> MBloom m a -> b -> b
assertLengths (MBloom _ abits) (MBloom _ bbits) res
| V.length abits /= V.length bbits =
error "assertLength: size of filters do not match"
| otherwise = res
module Data.Filter.Bloom.Stable where
import Control.Monad.Primitive
import Data.Vector.Unboxed.Mutable
import Data.Word
data Bloom m a = Bloom {
bloomHash :: [a -> Int]
, bloomBits :: MVector (PrimState m) Word8
}
empty :: m (Bloom m a)
empty = undefined
null :: Bloom m a -> m Bool
null = undefined
insert :: a -> Bloom m a -> m ()
insert = undefined
member :: a -> Bloom m a -> m Bool
member = undefined
intersection :: Bloom m a -> Bloom m a -> m (Bloom m a)
intersection = undefined
union :: Bloom m a -> Bloom m a -> m (Bloom m a)
union = undefined
{-# LANGUAGE ConstraintKinds #-}
module Data.Filter.Bloom.Stable.Mutable
( MBloom(..)
-- ^ Construction
, empty
-- ^ Query
, member, notMember, insert
) where
import Control.Monad.Loops
import Control.Monad.Primitive
import Control.Monad.State
import Data.Vector.Unboxed.Mutable as V
import Data.Filter.Bloom.Common
import System.Random
-- | The first type param, namely /c/, denotes type of random gen,
-- which should be instance of System.Random.RandomGen.
-- The second type param, namely /c/, is type of cells.
data MBloom c m a = MBloom {
bloomHash :: [a -> Int]
, bloomBits :: MVector (PrimState m) c
}
-- | Constraint for cell types.
type Cell a = (Ord a, Bounded a, Enum a, Unbox a)
empty :: (PrimMonad m, Cell c) => Int -> [a -> Int] -> m (MBloom c m a)
empty m hs = do
v <- V.new m
V.set v minBound
return $ MBloom hs v
-- | O(K). Membership test.
member :: (PrimMonad m, Cell c) => a -> MBloom c m a -> m Bool
member key (MBloom hs bits) = allM test hs
where
test f = do
cell <- hashIxM bits (f key)
return (cell > minBound)
-- | O(K)
notMember :: (PrimMonad m, Cell c) => a -> MBloom c m a -> m Bool
notMember key (MBloom hs bits) = anyM test hs
where
test f = do
cell <- hashIxM bits (f key)
return (cell == minBound)
-- | Decrement a P uniformly choosen cells.
fresh :: (PrimMonad m, RandomGen g, Cell c) => MBloom c m a -> StateT g m ()
fresh (MBloom _ bits) = forM_ [1..10 :: Int] $ \_ -> do
i <- state next
lift $ do
cell <- V.unsafeRead bits i
if cell > minBound then V.unsafeWrite bits i (pred cell) else return ()
insert :: (PrimMonad m, RandomGen g, Cell c) => a -> MBloom c m a -> StateT g m ()
insert key b@(MBloom hs bits) = do
fresh b
lift $ forM_ hs $ \f -> do
V.unsafeWrite bits (f key) maxBound
module Main where
import Control.Monad.ST
import Control.Monad.Loops
import Data.Filter.Bloom.Mutable
import Test.Framework
import Test.Framework.Providers.QuickCheck2
prop_insertMember :: [String] -> Bool
prop_insertMember keys = runST $ do
bf <- empty 100 (return . length)
a <- allM (\key -> insert key bf >> member key bf) keys
b <- allM (`member` bf) keys
return (a && b)
main :: IO ()
main = defaultMain
[ testProperty "insert-member" prop_insertMember
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment