Last active
September 5, 2022 14:07
-
-
Save Lysxia/50141795d11e9d43f5094039a35412a3 to your computer and use it in GitHub Desktop.
Applicative HTML Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE GADTs, DeriveFunctor, LambdaCase #-} | |
import Data.Functor.Product | |
import Data.Void | |
import Control.Applicative | |
type Tag = String | |
type Attr = String | |
data HTML | |
= Tag Tag [Attr] [HTML] | |
| Txt String | |
-- Example | |
example :: HTML | |
example = | |
Tag "html" ["lang=en"] | |
[ Tag "head" [] | |
[ Tag "title" [] [ Txt "Hello World!" ] | |
] | |
, Tag "body" [] | |
[ Tag "h1" [] [ Txt "Hi" ] | |
, Tag "p" ["id=lorem"] [ Txt "lorem ipsum" ] | |
] | |
] | |
parser :: HTMLParser [String] | |
parser = sequenceA | |
[getTitle, getH1, getLorem] | |
getTitle :: HTMLParser String | |
getTitle = onElem "title" (onChild getTxt) | |
getH1 :: HTMLParser String | |
getH1 = onElem "body" (onElem "h1" (onChild getTxt)) | |
getLorem :: HTMLParser String | |
getLorem = onElem "body" (onAttr "id=lorem" (onChild getTxt)) | |
main :: IO () | |
main = print (parseHTML parser example) | |
-- Implementation | |
onElem :: Tag -> HTMLParser a -> HTMLParser a | |
onElem t = Expect . Find (HasTag t) | |
onAttr :: String -> HTMLParser a -> HTMLParser a | |
onAttr a = Expect . Find (HasAttr a) | |
onChild :: HTMLParser a -> HTMLParser a | |
onChild = OnChild | |
getTxt :: HTMLParser String | |
getTxt = Expect (OnElt $ \case Txt t -> Just t ; Tag _ _ _ -> Nothing) | |
data HTMLParser a where | |
And :: Day HTMLParser HTMLParser a -> HTMLParser a | |
-- ^ Conjunction of parsers (all must succeed and their results are combined) | |
Find :: Selector -> HTMLParser a -> HTMLParser (Maybe a) | |
-- ^ Find matching element anywhere and apply subparser to it. | |
-- Return Nothing if no match found. | |
Expect :: HTMLParser (Maybe a) -> HTMLParser a | |
-- ^ Swallow the Maybe error in Find | |
OnChild :: HTMLParser a -> HTMLParser a | |
-- ^ Try applying subparser to children | |
Check :: Selector -> HTMLParser a -> HTMLParser a | |
-- ^ Match current element and apply subparser to it | |
Fail :: HTMLParser a | |
OnElt :: (HTML -> a) -> HTMLParser a | |
Done :: a -> HTMLParser a | |
data Selector | |
= HasTag Tag | |
| HasAttr Attr | |
| (:&&:) Selector Selector | |
| Not Selector | |
evalSel :: Selector -> HTML -> Bool | |
evalSel (HasTag t) (Tag t' _ _) = t == t' | |
evalSel (HasTag _) (Txt _) = False | |
evalSel (HasAttr x) (Tag _ xs _) = x `elem` xs | |
evalSel (HasAttr x) (Txt _) = False | |
evalSel (s :&&: s') h = evalSel s h && evalSel s' h | |
evalSel (Not s) h = not (evalSel s h) | |
data Day f g a where | |
LiftA2 :: (b -> c -> a) -> f b -> g c -> Day f g a | |
data Bind f g a where | |
(:>>=:) :: f a -> (a -> g b) -> Bind f g b | |
instance Functor HTMLParser where | |
fmap = liftA | |
instance Applicative HTMLParser where | |
pure = Done | |
liftA2 f (Done x) (Done y) = Done (f x y) | |
liftA2 f Fail _ = Fail | |
liftA2 _ _ Fail = Fail | |
liftA2 f x y = And (LiftA2 f x y) | |
-- Split a parser into a subparser to parse the children of an element and a continuation to parse its siblings. | |
split :: HTML -> HTMLParser a -> Bind HTMLParser HTMLParser a | |
split h Fail = Fail :>>=: absurd | |
split h (OnElt f) = Done () :>>=: \() -> Done (f h) | |
split h (Done x) = Done () :>>=: \() -> Done x | |
split h p0@(Find f p) | |
| evalSel f h = case split h p of | |
p :>>=: k -> p :>>=: (fmap Just . k) | |
| otherwise = p0 :>>=: \case Nothing -> p0 ; Just x -> Done (Just x) | |
split h p0@(Check f p) | |
| evalSel f h = split h p | |
| otherwise = Fail :>>=: absurd | |
split h (OnChild p) = p :>>=: Done | |
split h (Expect p) = case split h p of | |
p :>>=: k -> p :>>=: (Expect . k) | |
split h (And (LiftA2 f l r)) = case (split h l, split h r) of | |
(l :>>=: kl, r :>>=: kr) -> liftA2 (,) l r :>>=: \(x, y) -> liftA2 f (kl x) (kr y) | |
endElement :: HTMLParser a -> Maybe a | |
endElement (Done x) = Just x | |
endElement Fail = Nothing | |
endElement (Find _ _) = Just Nothing | |
endElement (Check _ _) = Nothing | |
endElement (Expect p) = case endElement p of | |
Just (Just x) -> Just x | |
_ -> Nothing | |
endElement (OnChild p) = Nothing | |
endElement (And (LiftA2 f p q)) = liftA2 f (endElement p) (endElement q) | |
endElement (OnElt _) = error "should not happen" | |
parseHTML' :: HTMLParser a -> HTML -> HTMLParser a | |
parseHTML' p h = case split h p of | |
Done x :>>=: k -> k x | |
p :>>=: k -> case h of | |
Tag _ _ hs -> parseHTMLs p hs k | |
Txt _ -> parseHTMLs p [] k | |
parseHTMLs :: HTMLParser a -> [HTML] -> (a -> HTMLParser b) -> HTMLParser b | |
parseHTMLs p [] k = case endElement p of | |
Nothing -> Fail | |
Just x -> k x | |
parseHTMLs p (h : hs) k = case parseHTML' p h of | |
Fail -> Fail | |
p -> parseHTMLs p hs k | |
parseHTML :: HTMLParser a -> HTML -> Maybe a | |
parseHTML p h = endElement (parseHTML' p h) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment