functional programming guest lecture by tim sheard parsing in haskell defining parsing combinators

Functional Programming: guest lecture by Tim Sheard

Parsing in Haskell

Defining Parsing Combinators

• Find these slides at www.cs.pdx.edu/~sheard/course/guest/ParsingInHaskell.ppt

• Example code can be found at www.cs.pdx.edu/~sheard/course/guest/ParsingInHaskell.hs

http://www.cs.pdx.edu/~sheard/course/guest/ParsingInHaskell.ppt

http://www.cs.pdx.edu/~sheard/course/guest/

Parsing

• Parsing is imposing tree structure on linear text (usually in strings or files)

• Plan of this lecture
  – Introduce the Parsec library
  – Write some simple parsers
  – Test them
  – Define a simple version of the parsers to see how they work.

• Parsec is a much more sophisticated library

Include the following:

module ParsingInHaskell where

import Text.ParserCombinators.Parsec

import Text.ParserCombinators.Parsec.Token

import Text.ParserCombinators.Parsec.Language

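Parsec

• Type:

– data Parser a = …

• Function:

– parse :: Parser b -> String -> String -> Either ParseError b
  (the first String is the source name used in error messages, the second is the input to parse)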

-- | Run parser @p@ over @input@ (with an empty source name) and print the
-- outcome: the parsed value on success, or the parse error prefixed with
-- "parse error at " on failure.
run :: Show a => Parser a -> String -> IO ()
run p input = either report print (parse p "" input)
  where
    report err = putStr "parse error at " >> print err

Operations

char :: Char -> CharParser a Char

string :: String -> CharParser a String

satisfy :: (Char -> Bool) ->

CharParser a Char

(<|>) :: Parser c -> Parser c -> Parser c

test1

test1 = do { string "A"

; char ' '

; string "big"

; char ' '

; string "cat"

}

test2

test2 = do { a <- string "A"

; char ' '

; b <- string "big"

; char ' '

; c <- string "cat"

; return(a,b,c)

}

test3

-- | Parse the literal string @s@, wrapped with @lexeme haskell@.
--
-- The match is wrapped in 'try' because Parsec's 'string' consumes input as
-- soon as a prefix of @s@ matches. Without backtracking, an alternative such
-- as @word "him" <|> word "her"@ could never reach @"her"@: the first branch
-- fails after consuming the shared @'h'@, and '<|>' only tries its second
-- argument when the first failed without consuming input.
word :: String -> CharParser st String
word s = lexeme haskell (try (string s))

test3 = do { a <- word "A"

; b <- word "big"

; c <- word "cat"

; return(a,b,c)

}

A Simple Grammar for English Example taken from Floyd & Beigel.

<Sentence> → <Subject> <Predicate>

<Subject> → <Pronoun1> | <Pronoun2>

<Pronoun1> → I | we | you | he | she | it | they

<Noun Phrase> → <Simple Noun Phrase> | <Article> <Noun Phrase>

<Article> → a | an | the

<Simple Noun Phrase> → <Noun> | <Adjective> <Simple Noun Phrase>

<Predicate> → <Verb> | <Verb> <Object>

<Object> → <Pronoun2> | <Noun Phrase>

<Pronoun2> → me | us | you | him | her | it | them

<Noun> → . . .

<Verb> → . . .

As a parsec grammar

sentence = do { subject; verb; predicate}
pronoun1 = word "I" <|> word "we" <|> word "you" <|> word "he" <|> word "she" <|> word "it" <|> word "they"
pronoun2 = word "me" <|> word "us" <|> word "you" <|> word "him" <|> word "her" <|> word "it" <|> word "them"
subject = pronoun1 <|> pronoun2
article = word "a" <|> word "the"
predicate = do { article; (noun <|> simpleNounPhrase) }
adjective = word "red" <|> word "pretty"
noun = word "cat" <|> word "ball"
simpleNounPhrase = do { adjective; simpleNounPhrase} <|> return ""
object = pronoun2 <|> nounPhrase
nounPhrase = simpleNounPhrase <|> do {article; noun}
verb = word "ate" <|> word "hit"

test4 = run sentence "I hit the pretty red cat"

Some simple combinators

• many :: Parser c -> Parser [c]

• sepBy :: Parser c -> Parser d -> Parser [c]

• option :: a -> Parser a -> Parser a

• chainl1 :: GenParser a -> GenParser (a->a->a) -> GenParser a

• (chainl1 p op) parses one or more occurrences of p, separated by op. It returns a value obtained by a left-associative application of all functions returned by op to the values returned by p.

Making Parse Trees

data Variable = Var String

deriving (Show,Eq)

data Expression

= Constant Integer -- 5

| Contents Variable -- x

| Minus Expression Expression -- x - 6

| Greater Expression Expression -- 6 > z

| Times Expression Expression -- x * y

deriving (Show,Eq)

Variables

parens x = between (char '(') (char ')') x

pVar = lexeme haskell

(do { c <- lower

; cs <- many (satisfy isAlphaNum)

; return(Var (c:cs))

})

Simple Terms

simpleExp :: Parser Expression

-- An atomic expression: an integer literal (wrapped in Constant), a variable
-- (wrapped in Contents), or a parenthesised relation.
simpleExp =
  fmap Constant (integer haskell)
    <|> fmap Contents pVar
    <|> parens relation

Complex terms

factor = chainl1 simpleExp

(lexeme haskell (char '*')>> return Times)

summand = chainl1 factor

(lexeme haskell (char '-')>> return Minus)

relation = chainl1 summand

(lexeme haskell (char '>') >> return Greater)

test4 = run pExp "x - 2 > 5"

Defining our own Type of a Parser

data Parser a =

Parser (String -> [(a,String)])

• A function inside a data definition.

• The output is a list of successful parses.

• This type can be made into a monad.

– A monad supplies the sequencing operator (>>=) in Haskell.

• It can also be made into a monad with a zero and a plus (MonadPlus): mzero is the zero, and mplus (also written ++ in these slides) is the plus.

Defining the Monad

Technical details, can be ignored when using combinators

instance Monad Parser where

return v = Parser (\inp -> [(v,inp)])

p >>= f =

Parser (\inp -> concat

[applyP (f v) out

| (v,out) <- applyP p inp])

instance MonadPlus Parser where

mzero = Parser (\inp -> [])

mplus (Parser p) (Parser q)

= Parser(\inp -> p inp ++ q inp)

instance Functor Parser where . . .

• where applyP undoes the constructor

• applyP (Parser f) x = f x

Note the comprehension syntax

Typical Parser

• Because the parser is a monad we can use the do syntax.

do { x1 <- p1

; x2 <- p2

; ...

; xn <- pn

; f x1 x2 ... xn

}

Running the Parser

• Running Parsers

-- | Apply parser @p@ to an input string, running 'junk' first and then @p@
-- on whatever input 'junk' leaves.
papply :: Parser a -> String -> [(a,String)]
papply p = applyP (junk >> p)

• junk skips over white space and comments. We'll see how to define it later

Simple Primitives

applyP :: Parser a -> String -> [(a,String)]

applyP (Parser p) = p

-- | Consume exactly one character: no results on empty input, otherwise the
-- first character paired with the remaining input.
item :: Parser Char
item = Parser step
  where
    step []     = []
    step (c:cs) = [(c, cs)]

-- | Consume one character with 'item' and keep it only when it satisfies
-- the predicate @p@; otherwise fail with 'mzero'.
sat :: (Char -> Bool) -> Parser Char
sat p = item >>= keep
  where
    keep x
      | p x       = return x
      | otherwise = mzero

? papply item "abc"

[('a',"bc")]

Examples

? papply item "abc"

[('a',"bc")]

? papply (sat isDigit) "123"

[('1',"23")]

? papply (sat isDigit) "abc"

[]

Useful Parsers

char :: Char -> Parser Char
char x = sat (x ==)

-- | Parse one character accepted by 'isDigit' and return its offset from
-- @'0'@ as an 'Int'.
digit :: Parser Int
digit = sat isDigit >>= \d -> return (ord d - ord '0')

lower :: Parser Char

lower = sat isLower

upper :: Parser Char

upper = sat isUpper

Examples

char x = sat (x ==)

? papply (char 'z') "abc"
[]

? papply (char 'a') "abc"
[('a',"bc")]

? papply digit "123"
[(1,"23")]

? papply upper "ABC"
[('A',"BC")]

? papply lower "ABC"
[]

More Useful Parsers

– letter :: Parser Char
– letter = sat isAlpha

• Can even use recursion
– string :: String -> Parser String
– string "" = return ""
– string (x:xs) =
–   do {char x; string xs; return (x:xs) }

• Helps define even more useful parsers
– identifier :: Parser String
– identifier = do {x <- lower
–                 ; xs <- many alphanum
–                 ; return (x:xs)}

• What do you think many does?

Examples? papply (string "tim") "tim is red"

[("tim"," is red")]

? papply identifier "tim is blue"

[("tim"," is blue")]

? papply identifier "x5W3 = 12"

[("x5W3"," = 12")]

Choice -- 1 parser or another

• Note that the ++ operator (from MonadPlus) gives non-deterministic choice.

– instance MonadPlus Parser where
–   (Parser p) ++ (Parser q)
–     = Parser(\inp -> p inp ++ q inp)

• Sometimes we’d like to prefer one choice over another, and take the second only if the first fails

• We don’t need an explicit sequencing operator because the monad sequencing plays that role.

Efficiency

force :: Parser a -> Parser a

-- Wrap parser p so that the outer (:) cell of the result list is produced
-- before the result list of p is inspected: the first result is rebuilt
-- from (head x) and the remainder is (tail x).
-- Both head and tail are partial, so demanding the first result (or the
-- tail) when p yields no results on inp raises a runtime error.
-- NOTE(review): this looks intended only for parsers that always yield at
-- least one result - confirm against callers.
force p =

Parser (\ inp ->

let x = applyP p inp

in (fst (head x), snd (head x))

: (tail x) )

Deterministic Choice

(+++) :: Parser a -> Parser a -> Parser a

-- Deterministic choice: gather the results of both parsers with mplus and
-- keep at most the first of them.
p +++ q = Parser (take 1 . applyP (p `mplus` q))

Example

–? papply (string "x" +++ string "b") "abc"

–[]

–? papply (string "x" +++ string "b") "bcd"

–[("b","cd")]

Sequences (more recursion)

many :: Parser a -> Parser [a]
many p = force (many1 p +++ return [])

many1 :: Parser a -> Parser [a]
many1 p = do {x <- p ; xs <- many p ; return (x:xs)}

sepby :: Parser a -> Parser b -> Parser [a]
p `sepby` sep = (p `sepby1` sep) +++ return []

sepby1 :: Parser a -> Parser b -> Parser [a]
p `sepby1` sep = do { x <- p ; xs <- many (do {sep; p}) ; return (x:xs) }

Example? papply (many (char 'z')) "zzz234"

[("zzz","234")]

? papply (sepby (char 'z') spaceP) "z z z 34"

[("zzz"," 34")]

Sequences separated by operators

chainl :: Parser a -> Parser (a -> a -> a) -> a -> Parser a

chainl p op v = (p `chainl1` op) +++ return v

-- | Parse one or more occurrences of @p@ separated by @op@, combining the
-- values left-associatively with the functions returned by @op@.
chainl1 :: Parser a -> Parser (a -> a -> a) -> Parser a
p `chainl1` op = p >>= rest
  where
    -- acc is the value folded so far; try to extend it with another
    -- "op p" pair, otherwise stop and return it.
    rest acc = (do { f <- op; y <- p; rest (f acc y) }) +++ return acc

? papply (chainl int (return (+)) 0) "1 3 4 abc"

[(8,"abc")]

Tokens and Lexical Issues

spaceP :: Parser ()
spaceP = do {many1 (sat isSpace); return ()}

comment :: Parser ()
comment = do{string "--"; many (sat p); return ()}
  where p x = x /= '\n'

junk :: Parser ()
junk = do {many (spaceP +++ comment); return ()}

• A Token is any parser followed by optional white space or a comment

token :: Parser a -> Parser a
token p = do {v <- p; junk; return v}

Using Tokens

symb :: String -> Parser String

symb xs = token (string xs)

-- | Parse an identifier token and reject it when it occurs in @ks@
-- (presumably the list of reserved words - confirm with callers).
--
-- Failure is signalled with 'mzero', the failure parser used elsewhere in
-- this file, instead of the undefined name @zero@.
ident :: [String] -> Parser String
ident ks =
  do { x <- token identifier
     ; if x `notElem` ks then return x else mzero }

nat :: Parser Int

nat = token natural

natural :: Parser Int

natural = digit `chainl1` return (\m n -> 10*m + n)

Example

? papply (token (char 'z')) "z 123"
[('z',"123")]

? papply (symb "tim") "tim is cold"
[("tim","is cold")]

? papply natural "123 abc"
[(123," abc")]

? papply (many identifier) "x d3 23"
[(["x"]," d3 23")]

? papply (many (token identifier)) "x d3 23"
[(["x", "d3"],"23")]

More Parsers

int :: Parser Int

int = token integer

-- | Parse an integer: either @'-'@ followed by a natural number (negated),
-- or a plain natural number.
--
-- The minus-sign literal uses plain ASCII quotes; the typographic closing
-- quote it previously carried is not valid in a Haskell character literal.
--
-- NOTE(review): the negative branch uses 'natural' while the non-negative
-- branch uses the token-level 'nat', so trailing junk is skipped only for
-- non-negative input - confirm whether 'natural' was intended in both.
integer :: Parser Int
integer = (do { char '-'
              ; n <- natural
              ; return (-n) })
          +++ nat

Example: Parsing Expressions

data Term = Add Term Term

| Sub Term Term

| Mult Term Term

| Div Term Term

| Const Int

addop:: Parser(Term -> Term -> Term)

addop = do {symb "+"; return Add} +++

do {symb "-"; return Sub}

mulop:: Parser(Term -> Term -> Term)

mulop = do {symb "*"; return Mult} +++

do {symb "/"; return Div}

Constructing a Parse tree

expr :: Parser Term
addop :: Parser (Term -> Term -> Term)
mulop :: Parser (Term -> Term -> Term)

expr = term `chainl1` addop
term = factor `chainl1` mulop
factor = (do { n <- token digit ; return (Const n)}) +++
         (do {symb "(" ; n <- expr ; symb ")" ; return n})

? papply expr "5 abc"
[(Const 5,"abc")]

? papply expr "4 + 5 - 2"
[(Sub (Add (Const 4) (Const 5))(Const 2),[])]

Array Based Parsers

type Subword = (Int,Int)

newtype P a = P (Array Int Char -> Subword -> [a])
unP (P z) = z

emptyP :: P ()
emptyP = P f where f z (i,j) = [() | i == j]

notchar :: Char -> P Char
notchar s = P f where f z (i,j) = [z!j | i+1 == j, z!j /= s]

charP :: Char -> P Char
charP c = P f where f z (i,j) = [c | i+1 == j, z!j == c]

anychar :: P Char
anychar = P f where f z (i,j) = [z!j | i+1 == j]

anystring :: P(Int,Int)
anystring = P f where f z (i,j) = [(i,j) | i <= j]

symbol :: String -> P (Int,Int)
symbol s = P f
  where f z (i,j) = if j-i == length s
                    then [(i,j)| and [z!(i+k) == s!!(k-1) | k <-[1..(j-i)]]]
                    else []

Combinators

infixr 6 |||

(|||) :: P b -> P b -> P b

(|||) (P r) (P q) = P f

where f z (i,j) = r z (i,j) ++ q z (i,j)

infix 8 <<<

(<<<) :: (b -> c) -> P b -> P c

(<<<) f (P q) = P h

where h z (i,j) = map f (q z (i,j))

infixl 7 ~~~

-- | Sequential application over a subword @(i,j)@: for every split point
-- @k@ in @[i..j]@, apply each function the left parser yields on @(i,k)@ to
-- each value the right parser yields on @(k,j)@.
(~~~) :: P(b -> c) -> P b -> P c
(~~~) (P r) (P q) = P combine
  where
    combine z (i,j) =
      [g y | k <- [i..j], g <- r z (i,k), y <- q z (k,j)]

run :: String -> P b -> [b]

run s (P ax) = ax (s2a s) (0,length s)

s2a s = (array bounds (zip [1..] s))

where bounds = (1,length s)

instance Monad P where

return x =

P(\ z (i,j) -> if i==j then [x] else [])

(>>=) (P f) g = P h

where h z (i,j) =

concat[ unP (g a) z (k,j)

| k <- [i..j] , a <- f z (i,k)]

Examples

p1 = do { symbol "tim"; c <- anychar

; symbol "tom"; return c}

ex4 = run "tim5tom" p1

ex5 = run "timtom" p1

Main> ex4

"5"

Main> ex5

""

Exercise in class

• Write a parser for regular expressions

functional programming guest lecture by tim sheard parsing in haskell defining parsing combinators

Documents