One pass parser and tests.

We remove the HTML test as it is no longer necessary. We cover the test case in spec tests and other HTML tests but keeping this around fails: this is because the new parser has different semantics there. In fact, I suspect the original behaviour was a bug that wasn't caught/fixed but simply included as-is during the testing.
author: Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk> 2013-07-09 14:24:10 +0100
committer: Austin Seipp <austin@well-typed.com> 2014-01-12 14:48:35 -0600
commit: bb6cef20b82ef7a7f2d49f3ef6dc1a7ce880b5f0 (patch)
tree: ea07b1d4ab43169bc8d7074ff05bf1792c93feb0 /src
parent: c1228df0339d041b455bb993786a9ed6322c5e01 (diff)
13 files changed, 600 insertions, 493 deletions
diff --git a/src/Haddock.hs b/src/Haddock.hs
index 6d16d601..b741f5f1 100644
--- a/src/Haddock.hs
+++ b/src/Haddock.hs
@@ -23,8 +23,7 @@ import Haddock.Backends.Xhtml.Themes (getThemes)
 import Haddock.Backends.LaTeX
 import Haddock.Backends.Hoogle
 import Haddock.Interface
-import Haddock.Lex
-import Haddock.Parse
+import Haddock.Parser
 import Haddock.Types
 import Haddock.Version
 import Haddock.InterfaceFile
@@ -447,8 +446,7 @@ getPrologue dflags flags =
     [] -> return Nothing
     [filename] -> do
       str <- readFile filename
-      case parseParas (tokenise dflags str
-                      (1,0) {- TODO: real position -}) of
+      case parseParas dflags str of
         Nothing -> throwE $ "failed to parse haddock prologue from file: " ++ filename
         Just doc -> return (Just doc)
     _otherwise -> throwE "multiple -p/--prologue options"
diff --git a/src/Haddock/Backends/LaTeX.hs b/src/Haddock/Backends/LaTeX.hs
index 0bc95719..64966ac2 100644
--- a/src/Haddock/Backends/LaTeX.hs
+++ b/src/Haddock/Backends/LaTeX.hs
@@ -33,6 +33,9 @@ import Data.Char
 import Control.Monad
 import Data.Maybe
 import Data.List
+
+import Haddock.Doc (combineDocumentation)
+
 -- import Debug.Trace
 
 {- SAMPLE OUTPUT
@@ -278,11 +281,11 @@ ppDecl :: LHsDecl DocName
 
 ppDecl (L loc decl) (doc, fnArgsDoc) instances subdocs = case decl of
   TyClD d@(FamDecl {})          -> ppTyFam False loc doc d unicode
-  TyClD d@(DataDecl {})   
+  TyClD d@(DataDecl {})
                                 -> ppDataDecl instances subdocs loc doc d unicode
   TyClD d@(SynDecl {})          -> ppTySyn loc (doc, fnArgsDoc) d unicode
 -- Family instances happen via FamInst now
---  TyClD d@(TySynonym {})         
+--  TyClD d@(TySynonym {})
 --    | Just _  <- tcdTyPats d    -> ppTyInst False loc doc d unicode
 -- Family instances happen via FamInst now
   TyClD d@(ClassDecl {})         -> ppClassDecl instances loc doc subdocs d unicode
@@ -465,7 +468,7 @@ ppClassDecl :: [DocInstance DocName] -> SrcSpan
             -> Documentation DocName -> [(DocName, DocForDecl DocName)]
             -> TyClDecl DocName -> Bool -> LaTeX
 ppClassDecl instances loc doc subdocs
-  (ClassDecl { tcdCtxt = lctxt, tcdLName = lname, tcdTyVars = ltyvars, tcdFDs = lfds 
+  (ClassDecl { tcdCtxt = lctxt, tcdLName = lname, tcdTyVars = ltyvars, tcdFDs = lfds
              , tcdSigs = lsigs, tcdATs = ats, tcdATDefs = at_defs }) unicode
   = declWithDoc classheader (if null body then Nothing else Just (vcat body)) $$
     instancesBit
@@ -504,7 +507,7 @@ ppDocInstances unicode (i : rest)
   | Just ihead <- isUndocdInstance i
   = declWithDoc (vcat (map (ppInstDecl unicode) (ihead:is))) Nothing $$
     ppDocInstances unicode rest'
-  | otherwise 
+  | otherwise
   = ppDocInstance unicode i $$ ppDocInstances unicode rest
   where
     (is, rest') = spanWith isUndocdInstance rest
@@ -650,9 +653,9 @@ ppSideBySideField subdocs unicode (ConDeclField (L _ name) ltype _) =
 
 -- {-
 -- ppHsFullConstr :: HsConDecl -> LaTeX
--- ppHsFullConstr (HsConDecl _ nm tvs ctxt typeList doc) = 
+-- ppHsFullConstr (HsConDecl _ nm tvs ctxt typeList doc) =
 --      declWithDoc False doc (
--- 	hsep ((ppHsConstrHdr tvs ctxt +++ 
+-- 	hsep ((ppHsConstrHdr tvs ctxt +++
 -- 		ppHsBinder False nm) : map ppHsBangType typeList)
 --       )
 -- ppHsFullConstr (HsRecDecl _ nm tvs ctxt fields doc) =
@@ -661,35 +664,35 @@ ppSideBySideField subdocs unicode (ConDeclField (L _ name) ltype _) =
 --        Nothing -> aboves [hdr, fields_html]
 --        Just _  -> aboves [hdr, constr_doc, fields_html]
 --    )
--- 
+--
 --   where hdr = declBox (ppHsConstrHdr tvs ctxt +++ ppHsBinder False nm)
--- 
--- 	constr_doc	
+--
+-- 	constr_doc
 -- 	  | isJust doc = docBox (docToLaTeX (fromJust doc))
 -- 	  | otherwise  = LaTeX.emptyTable
--- 
--- 	fields_html = 
--- 	   td << 
+--
+-- 	fields_html =
+-- 	   td <<
 -- 	      table ! [width "100%", cellpadding 0, cellspacing 8] << (
 -- 		   aboves (map ppFullField (concat (map expandField fields)))
 -- 		)
 -- -}
--- 
+--
 -- ppShortField :: Bool -> Bool -> ConDeclField DocName -> LaTeX
 -- ppShortField summary unicode (ConDeclField (L _ name) ltype _)
 --   = tda [theclass "recfield"] << (
 --       ppBinder summary (docNameOcc name)
 --       <+> dcolon unicode <+> ppLType unicode ltype
 --     )
--- 
+--
 -- {-
 -- ppFullField :: HsFieldDecl -> LaTeX
--- ppFullField (HsFieldDecl [n] ty doc) 
+-- ppFullField (HsFieldDecl [n] ty doc)
 --   = declWithDoc False doc (
 -- 	ppHsBinder False n <+> dcolon <+> ppHsBangType ty
 --     )
 -- ppFullField _ = error "ppFullField"
--- 
+--
 -- expandField :: HsFieldDecl -> [HsFieldDecl]
 -- expandField (HsFieldDecl ns ty doc) = [ HsFieldDecl [n] ty doc | n <- ns ]
 -- -}
@@ -718,7 +721,7 @@ ppAppNameTypes :: DocName -> [HsType DocName] -> Bool -> LaTeX
 ppAppNameTypes n ts unicode = ppTypeApp n ts ppDocName (ppParendType unicode)
 
 
--- | Print an application of a DocName and a list of Names 
+-- | Print an application of a DocName and a list of Names
 ppAppDocNameNames :: Bool -> DocName -> [Name] -> LaTeX
 ppAppDocNameNames _summ n ns =
   ppTypeApp n ns (ppBinder . nameOccName . getName) ppSymName
@@ -1002,7 +1005,7 @@ parLatexMarkup ppId = Markup {
   markupEmphasis             = \p v -> emph (p v),
   markupMonospaced           = \p _ -> tt (p Mono),
   markupUnorderedList        = \p v -> itemizedList (map ($v) p) $$ text "",
-  markupPic                  = \path _ -> parens (text "image: " <> text path),
+  markupPic                  = \p _ -> markupPic p,
   markupOrderedList          = \p v -> enumeratedList (map ($v) p) $$ text "",
   markupDefList              = \l v -> descriptionList (map (\(a,b) -> (a v, b v)) l),
   markupCodeBlock            = \p _ -> quote (verb (p Verb)) $$ text "",
@@ -1020,6 +1023,14 @@ parLatexMarkup ppId = Markup {
       Just label -> text "\\href" <> braces (text url) <> braces (text label)
       Nothing    -> text "\\url"  <> braces (text url)
 
+    -- Is there a better way of doing this? Just a space is an aribtrary choice.
+    markupPic (Picture uri title) = parens (imageText title)
+      where
+        imageText Nothing = beg
+        imageText (Just t) = beg <> text " " <> text t
+
+        beg = text "image: " <> text uri
+
     markupId ppId_ id v =
       case v of
         Verb  -> theid
diff --git a/src/Haddock/Backends/Xhtml/Decl.hs b/src/Haddock/Backends/Xhtml/Decl.hs
index 2ecc6464..91e6871d 100644
--- a/src/Haddock/Backends/Xhtml/Decl.hs
+++ b/src/Haddock/Backends/Xhtml/Decl.hs
@@ -25,6 +25,7 @@ import Haddock.Backends.Xhtml.Types
 import Haddock.Backends.Xhtml.Utils
 import Haddock.GhcUtils
 import Haddock.Types
+import Haddock.Doc (combineDocumentation)
 
 import           Data.List             ( intersperse )
 import qualified Data.Map as Map
@@ -115,7 +116,7 @@ ppFor _ _ _ _ _ _ _ = error "ppFor"
 ppTySyn :: Bool -> LinksInfo -> SrcSpan -> DocForDecl DocName -> TyClDecl DocName -> Bool
         -> Qualification -> Html
 ppTySyn summary links loc doc (SynDecl { tcdLName = L _ name, tcdTyVars = ltyvars
-                                       , tcdRhs = ltype }) 
+                                       , tcdRhs = ltype })
         unicode qual
   = ppTypeOrFunSig summary links loc [name] (unLoc ltype) doc
                    (full, hdr, spaceHtml +++ equals) unicode qual
@@ -204,12 +205,12 @@ ppAssocType summ links doc (L loc decl) unicode qual =
 -- * TyClDecl helpers
 --------------------------------------------------------------------------------
 
--- | Print a type family and its variables 
+-- | Print a type family and its variables
 ppFamDeclBinderWithVars :: Bool -> FamilyDecl DocName -> Html
 ppFamDeclBinderWithVars summ (FamilyDecl { fdLName = lname, fdTyVars = tvs }) =
   ppAppDocNameNames summ (unLoc lname) (tyvarNames tvs)
 
--- | Print a newtype / data binder and its variables 
+-- | Print a newtype / data binder and its variables
 ppDataBinderWithVars :: Bool -> TyClDecl DocName -> Html
 ppDataBinderWithVars summ decl =
   ppAppDocNameNames summ (tcdName decl) (tyvarNames $ tcdTyVars decl)
@@ -226,7 +227,7 @@ ppAppNameTypes n ts unicode qual =
     ppTypeApp n ts (ppDocName qual) (ppParendType unicode qual)
 
 
--- | Print an application of a DocName and a list of Names 
+-- | Print an application of a DocName and a list of Names
 ppAppDocNameNames :: Bool -> DocName -> [Name] -> Html
 ppAppDocNameNames summ n ns =
   ppTypeApp n ns (ppBinder summ . nameOccName . getName) ppTyName
@@ -305,7 +306,7 @@ ppShortClassDecl :: Bool -> LinksInfo -> TyClDecl DocName -> SrcSpan
                  -> Html
 ppShortClassDecl summary links (ClassDecl { tcdCtxt = lctxt, tcdLName = lname, tcdTyVars = tvs
                                           , tcdFDs = fds, tcdSigs = sigs, tcdATs = ats }) loc
-    subdocs unicode qual = 
+    subdocs unicode qual =
   if null sigs && null ats
     then (if summary then id else topDeclElem links loc [nm]) hdr
     else (if summary then id else topDeclElem links loc [nm]) (hdr <+> keyword "where")
@@ -580,7 +581,7 @@ ppShortField summary unicode qual (ConDeclField (L _ name) ltype _)
 -- Currently doesn't handle 'data instance' decls or kind signatures
 ppDataHeader :: Bool -> TyClDecl DocName -> Bool -> Qualification -> Html
 ppDataHeader summary decl@(DataDecl { tcdDataDefn = HsDataDefn { dd_ND = nd
-                                                               , dd_ctxt = ctxt } }) 
+                                                               , dd_ctxt = ctxt } })
              unicode qual
   = -- newtype or data
     (case nd of { NewType -> keyword "newtype"; DataType -> keyword "data" }) <+>
@@ -723,5 +724,3 @@ ppr_fun_ty ctxt_prec ty1 ty2 unicode qual
     in
     maybeParen ctxt_prec pREC_FUN $
     hsep [p1, arrow unicode <+> p2]
-
-
diff --git a/src/Haddock/Backends/Xhtml/DocMarkup.hs b/src/Haddock/Backends/Xhtml/DocMarkup.hs
index aa4ba377..31d653bc 100644
--- a/src/Haddock/Backends/Xhtml/DocMarkup.hs
+++ b/src/Haddock/Backends/Xhtml/DocMarkup.hs
@@ -18,18 +18,19 @@ module Haddock.Backends.Xhtml.DocMarkup (
   docElement, docSection, docSection_,
 ) where
 
+import Control.Applicative ((<$>))
 
 import Haddock.Backends.Xhtml.Names
 import Haddock.Backends.Xhtml.Utils
 import Haddock.Types
 import Haddock.Utils
+import Haddock.Doc (combineDocumentation)
 
-import Text.XHtml hiding ( name, title, p, quote )
+import Text.XHtml hiding ( name, p, quote )
 import Data.Maybe (fromMaybe)
 
 import GHC
 
-
 parHtmlMarkup :: Qualification -> (a -> Html) -> DocMarkup a Html
 parHtmlMarkup qual ppId = Markup {
   markupEmpty                = noHtml,
@@ -49,7 +50,7 @@ parHtmlMarkup qual ppId = Markup {
   markupCodeBlock            = pre,
   markupHyperlink            = \(Hyperlink url mLabel) -> anchor ! [href url] << fromMaybe url mLabel,
   markupAName                = \aname -> namedAnchor aname << "",
-  markupPic                  = \path -> image ! [src path],
+  markupPic                  = \(Picture uri t) -> image ! ([src uri] ++ fromMaybe [] (return . title <$> t)),
   markupProperty             = pre . toHtml,
   markupExample              = examplesToHtml
   }
diff --git a/src/Haddock/Doc.hs b/src/Haddock/Doc.hs
index 18555cfb..4d68c554 100644
--- a/src/Haddock/Doc.hs
+++ b/src/Haddock/Doc.hs
@@ -1,14 +1,27 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
 module Haddock.Doc (
   docAppend,
   docParagraph,
-  combineStringNodes
+  combineStringNodes,
+  combineDocumentation
   ) where
 
-
+import Data.Maybe
+import Data.Monoid
 import Haddock.Types
 import Data.Char (isSpace)
 import Control.Arrow ((***))
 
+-- We put it here so that we can avoid a circular import
+-- anything relevant imports this module anyway
+instance Monoid (Doc id) where
+  mempty  = DocEmpty
+  mappend = docAppend
+
+combineDocumentation :: Documentation name -> Maybe (Doc name)
+combineDocumentation (Documentation Nothing Nothing) = Nothing
+combineDocumentation (Documentation mDoc mWarning)   = Just (fromMaybe mempty mWarning `mappend` fromMaybe mempty mDoc)
+
 -- used to make parsing easier; we group the list items later
 docAppend :: Doc id -> Doc id -> Doc id
 docAppend (DocUnorderedList ds1) (DocUnorderedList ds2)
@@ -85,4 +98,7 @@ combineStringNodes x = x
 
 tryjoin :: Doc id -> Doc id
 tryjoin (DocAppend (DocString x) (DocString y)) = DocString (x ++ y)
+tryjoin (DocAppend (DocString x) (DocAppend (DocString y) z)) = DocAppend (DocString (x ++ y)) z
+tryjoin (DocAppend (DocAppend x (DocString y)) (DocString z))
+  = tryjoin (DocAppend (combineStringNodes x) (DocString $ y ++ z))
 tryjoin x = x
diff --git a/src/Haddock/Interface/LexParseRn.hs b/src/Haddock/Interface/LexParseRn.hs
index 9d775bcb..13563532 100644
--- a/src/Haddock/Interface/LexParseRn.hs
+++ b/src/Haddock/Interface/LexParseRn.hs
@@ -19,8 +19,7 @@ module Haddock.Interface.LexParseRn
 
 
 import Haddock.Types
-import Haddock.Lex
-import Haddock.Parse
+import Haddock.Parser
 import Haddock.Interface.ParseModuleHeader
 import Haddock.Doc
 
@@ -50,19 +49,19 @@ processDocStringParas = process parseParas
 processDocString :: DynFlags -> GlobalRdrEnv -> HsDocString -> ErrMsgM (Maybe (Doc Name))
 processDocString = process parseString
 
-process :: ([LToken] -> Maybe (Doc RdrName))
+process :: (DynFlags -> String -> Maybe (Doc RdrName))
         -> DynFlags
         -> GlobalRdrEnv
         -> HsDocString
         -> ErrMsgM (Maybe (Doc Name))
 process parse dflags gre (HsDocString fs) = do
    let str = unpackFS fs
-   let toks = tokenise dflags str (0,0)  -- TODO: real position
-   case parse toks of
+   case parse dflags str of
      Nothing -> do
        tell [ "doc comment parse failed: " ++ str ]
        return Nothing
-     Just doc -> return (Just (rename dflags gre doc))
+     Just doc -> do
+       return (Just (rename dflags gre doc))
 
 
 processModuleHeader :: DynFlags -> GlobalRdrEnv -> SafeHaskellMode -> Maybe LHsDocString
diff --git a/src/Haddock/Interface/ParseModuleHeader.hs b/src/Haddock/Interface/ParseModuleHeader.hs
index 5087affe..2e4fe73b 100644
--- a/src/Haddock/Interface/ParseModuleHeader.hs
+++ b/src/Haddock/Interface/ParseModuleHeader.hs
@@ -12,8 +12,7 @@
 module Haddock.Interface.ParseModuleHeader (parseModuleHeader) where
 
 import Haddock.Types
-import Haddock.Lex
-import Haddock.Parse
+import Haddock.Parser
 
 import RdrName
 import DynFlags
@@ -47,15 +46,13 @@ parseModuleHeader dflags str0 =
       description1 :: Either String (Maybe (Doc RdrName))
       description1 = case descriptionOpt of
          Nothing -> Right Nothing
-         -- TODO: pass real file position
-         Just description -> case parseString $ tokenise dflags description (0,0) of
+         Just description -> case parseString dflags description of
             Nothing -> Left ("Cannot parse Description: " ++ description)
             Just doc -> Right (Just doc)
    in
       case description1 of
          Left mess -> Left mess
-         -- TODO: pass real file position
-         Right docOpt -> case parseParas $ tokenise dflags str8 (0,0) of
+         Right docOpt -> case parseParas dflags str8 of
            Nothing -> Left "Cannot parse header documentation paragraphs"
            Just doc -> Right (HaddockModInfo {
             hmi_description = docOpt,
diff --git a/src/Haddock/InterfaceFile.hs b/src/Haddock/InterfaceFile.hs
index b993dd67..189550b8 100644
--- a/src/Haddock/InterfaceFile.hs
+++ b/src/Haddock/InterfaceFile.hs
@@ -428,6 +428,16 @@ instance Binary Hyperlink where
         label <- get bh
         return (Hyperlink url label)
 
+instance Binary Picture where
+    put_ bh (Picture uri title) = do
+        put_ bh uri
+        put_ bh title
+    get bh = do
+        uri <- get bh
+        title <- get bh
+        return (Picture uri title)
+
+
 
 {-* Generated by DrIFT : Look, but Don't Touch. *-}
 instance (Binary id) => Binary (Doc id) where
diff --git a/src/Haddock/Lex.x b/src/Haddock/Lex.x
deleted file mode 100644
index 9e59fa4c..00000000
--- a/src/Haddock/Lex.x
+++ /dev/null
@@ -1,255 +0,0 @@
---
--- Haddock - A Haskell Documentation Tool
---
--- (c) Simon Marlow 2002
---
--- This file was modified and integrated into GHC by David Waern 2006.
--- Then moved back into Haddock by Isaac Dupree in 2009 :-)
---
-
-{
-{-# LANGUAGE BangPatterns #-}   -- Generated by Alex
-{-# OPTIONS -Wwarn -w #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and fix
--- any warnings in the module. See
---     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
--- for details
-
-module Haddock.Lex (
-	Token(..),
-	LToken,
-	tokenise
- ) where
-
-import Lexer hiding (Token)
-import Parser ( parseIdentifier )
-import StringBuffer
-import RdrName
-import SrcLoc
-import DynFlags
-import FastString
-
-import qualified Data.Bits
-import Data.Char
-import Data.Word (Word8)
-import Numeric
-import System.IO.Unsafe
-import Debug.Trace
-}
-
-$ws    = $white # \n
-$digit = [0-9]
-$hexdigit = [0-9a-fA-F]
-$special =  [\"\@]
-$alphanum = [A-Za-z0-9]
-$ident    = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:]
-
-:-
-
--- beginning of a paragraph
-<0,para> {
- $ws* \n		;
- $ws* \>		{ begin birdtrack }
- $ws* prop \> .* \n	{ strtoken TokProperty `andBegin` property}
- $ws* \>\>\>            { strtoken TokExamplePrompt `andBegin` exampleexpr }
- $ws* [\*\-]		{ token TokBullet `andBegin` string }
- $ws* \[		{ token TokDefStart `andBegin` def }
- $ws* \( $digit+ \) 	{ token TokNumber `andBegin` string }
- $ws* $digit+ \. 	{ token TokNumber `andBegin` string }
- $ws*			{ begin string }		
-}
-
--- beginning of a line
-<line> {
-  $ws* \>		{ begin birdtrack }
-  $ws* \>\>\>		{ strtoken TokExamplePrompt `andBegin` exampleexpr }
-  $ws* \n		{ token TokPara `andBegin` para }
-  -- Here, we really want to be able to say
-  -- $ws* (\n | <eof>) 	{ token TokPara `andBegin` para}
-  -- because otherwise a trailing line of whitespace will result in 
-  -- a spurious TokString at the end of a docstring.  We don't have <eof>,
-  -- though (NOW I realise what it was for :-).  To get around this, we always
-  -- append \n to the end of a docstring.
-  () 			{ begin string }
-}
-
-<birdtrack> .*	\n?	{ strtokenNL TokBirdTrack `andBegin` line }
-
-<property> ()           { token TokPara `andBegin` para }
-
-<example> {
-  $ws*	\n		{ token TokPara `andBegin` para }
-  $ws* \>\>\>	        { strtoken TokExamplePrompt `andBegin` exampleexpr }
-  ()			{ begin exampleresult }
-}
-
-<exampleexpr> .* \n	{ strtokenNL TokExampleExpression `andBegin` example }
-
-<exampleresult> .* \n	{ strtokenNL TokExampleResult `andBegin` example }
-
-<string,def> {
-  $special			{ strtoken $ \s -> TokSpecial (head s) }
-  \<\< [^\>]* \>\>              { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
-  \< [^\>]* \>			{ strtoken $ \s -> TokURL (init (tail s)) }
-  \# [^\#]* \#			{ strtoken $ \s -> TokAName (init (tail s)) }
-  \/ [^\/]* \/                  { strtoken $ \s -> TokEmphasis (init (tail s)) }
-  [\'\`] $ident+ [\'\`]		{ ident }
-  \\ .				{ strtoken (TokString . tail) }
-  "&#" $digit+ \;		{ strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
-  "&#" [xX] $hexdigit+ \;	{ strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
-  -- allow special characters through if they don't fit one of the previous
-  -- patterns.
-  [\/\'\`\<\#\&\\]			{ strtoken TokString }
-  [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
-  [^ $special \/ \< \# \n \'\` \& \\ \]]+    { strtoken TokString }
-}
-
-<def> {
-  \]				{ token TokDefEnd `andBegin` string }
-}
-
--- ']' doesn't have any special meaning outside of the [...] at the beginning
--- of a definition paragraph.
-<string> {
-  \]				{ strtoken TokString }
-}
-
-{
--- | A located token
-type LToken = (Token, AlexPosn)
-
-data Token
-  = TokPara
-  | TokNumber
-  | TokBullet
-  | TokDefStart
-  | TokDefEnd
-  | TokSpecial Char
-  | TokIdent RdrName
-  | TokString String
-  | TokURL String
-  | TokPic String
-  | TokEmphasis String
-  | TokAName String
-  | TokBirdTrack String
-  | TokProperty String
-  | TokExamplePrompt String
-  | TokExampleExpression String
-  | TokExampleResult String
---  deriving Show
-
-tokenPos :: LToken -> (Int, Int)
-tokenPos t = let AlexPn _ line col = snd t in (line, col)
-
--- -----------------------------------------------------------------------------
--- Alex support stuff
-
--- XXX: copied the posn wrapper code from Alex to make this lexer work
--- with both Alex 2.x and Alex 3.x.  However, we are not using the
--- Unicode/UTF-8 support in Alex 3.x, and Unicode documentation will
--- probably get mangled.
-
--- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
-utf8Encode :: Char -> [Word8]
-utf8Encode = map fromIntegral . go . ord
- where
-  go oc
-   | oc <= 0x7f       = [oc]
-
-   | oc <= 0x7ff      = [ 0xc0 + (oc `Data.Bits.shiftR` 6)
-                        , 0x80 + oc Data.Bits..&. 0x3f
-                        ]
-
-   | oc <= 0xffff     = [ 0xe0 + (oc `Data.Bits.shiftR` 12)
-                        , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
-                        , 0x80 + oc Data.Bits..&. 0x3f
-                        ]
-   | otherwise        = [ 0xf0 + (oc `Data.Bits.shiftR` 18)
-                        , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f)
-                        , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
-                        , 0x80 + oc Data.Bits..&. 0x3f
-                        ]
-
-type Byte = Word8
-
-type AlexInput = (AlexPosn,     -- current position,
-                  Char,         -- previous char
-                  [Byte],       -- pending bytes on current char
-                  String)       -- current input string
-
-alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (p,c,bs,s) = c
-
-alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)
-alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s))
-alexGetByte (p,c,[],[]) = Nothing
-alexGetByte (p,_,[],(c:s))  = let p' = alexMove p c
-                                  (b:bs) = utf8Encode c
-                              in p' `seq`  Just (b, (p', c, bs, s))
-
-data AlexPosn = AlexPn !Int !Int !Int
-        deriving (Eq,Show)
-
-alexMove :: AlexPosn -> Char -> AlexPosn
-alexMove (AlexPn a l c) '\t' = AlexPn (a+1)  l     (((c+7) `div` 8)*8+1)
-alexMove (AlexPn a l c) '\n' = AlexPn (a+1) (l+1)   1
-alexMove (AlexPn a l c) _    = AlexPn (a+1)  l     (c+1)
-
-type StartCode = Int
-type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> DynFlags -> [LToken]
-
-tokenise :: DynFlags -> String -> (Int, Int) -> [LToken]
-tokenise dflags str (line, col) = go (posn,'\n',[],eofHack str) para
-  where posn = AlexPn 0 line col
-        go inp@(pos,_,_,str) sc =
-          case alexScan inp sc of
-                AlexEOF -> []
-                AlexError _ -> []
-                AlexSkip  inp' len     -> go inp' sc
-                AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) dflags
-
--- NB. we add a final \n to the string, (see comment in the beginning of line
--- production above).
-eofHack str = str++"\n"
-
-andBegin  :: Action -> StartCode -> Action
-andBegin act new_sc = \pos str _ cont dflags -> act pos str new_sc cont dflags
-
-token :: Token -> Action
-token t = \pos _ sc cont _ -> (t, pos) : cont sc
-
-strtoken, strtokenNL :: (String -> Token) -> Action
-strtoken t = \pos str sc cont _ -> (t str, pos) : cont sc
-strtokenNL t = \pos str sc cont _ -> (t (filter (/= '\r') str), pos) : cont sc
--- ^ We only want LF line endings in our internal doc string format, so we
--- filter out all CRs.
-
-begin :: StartCode -> Action
-begin sc = \_ _ _ cont _ -> cont sc
-
--- -----------------------------------------------------------------------------
--- Lex a string as a Haskell identifier
-
-ident :: Action
-ident pos str sc cont dflags = 
-  case parseIdent dflags loc id of
-	Just names -> (TokIdent names, pos) : cont sc
-	Nothing -> (TokString str, pos) : cont sc
- where id = init (tail str)
-       -- TODO: Get the real filename here. Maybe we should just be
-       --       using GHC SrcLoc's ourself?
-       filename = mkFastString "<unknown file>"
-       loc = case pos of
-             AlexPn _ line col ->
-                 mkRealSrcLoc filename line col
-
-parseIdent :: DynFlags -> RealSrcLoc -> String -> Maybe RdrName
-parseIdent dflags loc str0 = 
-  let buffer = stringToStringBuffer str0
-      pstate = mkPState dflags buffer loc
-      result = unP parseIdentifier pstate 
-  in case result of 
-       POk _ name -> Just (unLoc name)
-       _ -> Nothing
-}
diff --git a/src/Haddock/Parse.y b/src/Haddock/Parse.y
deleted file mode 100644
index f40ff521..00000000
--- a/src/Haddock/Parse.y
+++ /dev/null
@@ -1,171 +0,0 @@
-{
-{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6
-{-# OPTIONS -Wwarn -w #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and fix
--- any warnings in the module. See
---     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
--- for details
-
-module Haddock.Parse (parseString, parseParas) where
-
-import Haddock.Lex
-import Haddock.Types (Doc(..), Example(Example), Hyperlink(..))
-import Haddock.Doc
-import HsSyn
-import RdrName
-import Data.Char  (isSpace)
-import Data.Maybe (fromMaybe)
-import Data.List  (stripPrefix)
-}
-
-%expect 0
-
-%tokentype { LToken }
-
-%token	'/'	{ (TokSpecial '/',_) }
-	'@'	{ (TokSpecial '@',_) }
-	'['     { (TokDefStart,_) }
-	']'     { (TokDefEnd,_) }
-	DQUO 	{ (TokSpecial '\"',_) }
-	URL	{ (TokURL $$,_) }
-	PIC     { (TokPic $$,_) }
-	ANAME	{ (TokAName $$,_) }
-	'/../'  { (TokEmphasis $$,_) }
-	'-'	{ (TokBullet,_) }
-	'(n)'	{ (TokNumber,_) }
-	'>..'	{ (TokBirdTrack $$,_) }
-	PROP	{ (TokProperty $$,_) }
-	PROMPT	{ (TokExamplePrompt $$,_) }
-	RESULT	{ (TokExampleResult $$,_) }
-	EXP	{ (TokExampleExpression $$,_) }
-	IDENT   { (TokIdent $$,_) }
-	PARA    { (TokPara,_) }
-	STRING	{ (TokString $$,_) }
-
-%monad { Maybe }
-
-%name parseParas doc
-%name parseString seq
-
-%%
-
-doc	:: { Doc RdrName }
-	: apara PARA doc	{ docAppend $1 $3 }
-	| PARA doc 		{ $2 }
-	| apara			{ $1 }
-	| {- empty -}		{ DocEmpty }
-
-apara	:: { Doc RdrName }
-	: ulpara		{ DocUnorderedList [$1] }
-	| olpara		{ DocOrderedList [$1] }
-        | defpara               { DocDefList [$1] }
-	| para			{ $1 }
-
-ulpara  :: { Doc RdrName }
-	: '-' para		{ $2 }
-
-olpara  :: { Doc RdrName } 
-	: '(n)' para		{ $2 }
-
-defpara :: { (Doc RdrName, Doc RdrName) }
-	: '[' seq ']' seq	{ ($2, $4) }
-
-para    :: { Doc RdrName }
-	: seq			{ docParagraph $1 }
-	| codepara		{ DocCodeBlock $1 }
-	| property		{ $1 }
-	| examples		{ DocExamples $1 }
-
-codepara :: { Doc RdrName }
-	: '>..' codepara	{ docAppend (DocString $1) $2 }
-	| '>..'			{ DocString $1 }
-
-property :: { Doc RdrName }
-	: PROP			{ makeProperty $1 }
-
-examples :: { [Example] }
-	: example examples	{ $1 : $2 }
-	| example		{ [$1] }
-
-example :: { Example }
-	: PROMPT EXP result	{ makeExample $1 $2 (lines $3) }
-	| PROMPT EXP		{ makeExample $1 $2 [] }
-
-result :: { String }
-	: RESULT result		{ $1 ++ $2 }
-	| RESULT		{ $1 }
-
-seq	:: { Doc RdrName }
-	: elem seq		{ docAppend $1 $2 }
-	| elem			{ $1 }
-
-elem	:: { Doc RdrName }
-	: elem1			{ $1 }
-	| '@' seq1 '@'		{ DocMonospaced $2 }
-
-seq1	:: { Doc RdrName }
-	: PARA seq1             { docAppend (DocString "\n") $2 }
-	| elem1 seq1            { docAppend $1 $2 }
-	| elem1			{ $1 }
-
-elem1	:: { Doc RdrName }
-	: STRING		{ DocString $1 }
-	| '/../'                { DocEmphasis (DocString $1) }
-	| URL			{ DocHyperlink (makeHyperlink $1) }
-	| PIC                   { DocPic $1 }
-	| ANAME			{ DocAName $1 }
-	| IDENT			{ DocIdentifier $1 }
-	| DQUO strings DQUO	{ DocModule $2 }
-
-strings  :: { String }
-	: STRING		{ $1 }
-	| STRING strings	{ $1 ++ $2 }
-
-{
-happyError :: [LToken] -> Maybe a
-happyError toks = Nothing
-
--- | Create a `Hyperlink` from given string.
---
--- A hyperlink consists of a URL and an optional label.  The label is separated
--- from the url by one or more whitespace characters.
-makeHyperlink :: String -> Hyperlink
-makeHyperlink input = case break isSpace $ strip input of
-  (url, "")    -> Hyperlink url Nothing
-  (url, label) -> Hyperlink url (Just . dropWhile isSpace $ label)
-
-makeProperty :: String -> Doc RdrName
-makeProperty s = case strip s of
-  'p':'r':'o':'p':'>':xs ->
-	DocProperty (dropWhile isSpace xs)
-  xs ->
-	error $ "makeProperty: invalid input " ++ show xs
-
--- | Create an 'Example', stripping superfluous characters as appropriate
-makeExample :: String -> String -> [String] -> Example
-makeExample prompt expression result =
-  Example
-	(strip expression)	-- we do not care about leading and trailing
-				-- whitespace in expressions, so drop them
-	result'
-  where
-	-- 1. drop trailing whitespace from the prompt, remember the prefix
-	(prefix, _) = span isSpace prompt
-
-	-- 2. drop, if possible, the exact same sequence of whitespace
-	-- characters from each result line
-	--
-	-- 3. interpret lines that only contain the string "<BLANKLINE>" as an
-	-- empty line
-	result' = map (substituteBlankLine . tryStripPrefix prefix) result
-	  where
-		tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys
-
-		substituteBlankLine "<BLANKLINE>" = ""
-		substituteBlankLine line          = line
-
--- | Remove all leading and trailing whitespace
-strip :: String -> String
-strip = dropWhile isSpace . reverse . dropWhile isSpace . reverse
-}
diff --git a/src/Haddock/Parser.hs b/src/Haddock/Parser.hs
new file mode 100644
index 00000000..81e274ed
--- /dev/null
+++ b/src/Haddock/Parser.hs
@@ -0,0 +1,435 @@
+{-# LANGUAGE OverloadedStrings #-}
+-- |
+-- Module      :  Haddock.Parser
+-- Copyright   :  (c) Mateusz Kowalczyk 2013,
+--                    Simon Hengel      2013
+-- License     :  BSD-like
+--
+-- Maintainer  :  haddock@projects.haskell.org
+-- Stability   :  experimental
+-- Portability :  portable
+
+module Haddock.Parser (parseString, parseParas) where
+
+import           Control.Applicative
+import           Data.Attoparsec.ByteString   hiding (takeWhile1, take, inClass)
+import qualified Data.Attoparsec.ByteString.Char8 as A8
+import           Data.Attoparsec.ByteString.Char8 hiding (take, string)
+import qualified Data.ByteString as BS
+import           Data.Char (chr)
+import           Data.List (stripPrefix)
+import           Data.Maybe (fromMaybe)
+import           Data.Monoid
+import           DynFlags
+import           FastString (mkFastString)
+import           Haddock.Doc
+import           Haddock.Types
+import           Lexer (mkPState, unP, ParseResult(POk))
+import           Parser (parseIdentifier)
+import           RdrName
+import           SrcLoc (mkRealSrcLoc, unLoc)
+import           StringBuffer (stringToStringBuffer)
+import           Haddock.Utf8
+
+default (Int)
+
+-- | Main entry point to the parser. Appends the newline character
+-- to the input string.
+parseParas :: DynFlags
+              -> String -- ^ String to parse
+              -> Maybe (Doc RdrName)
+parseParas d s = case parseOnly (p <* skipSpace) (encodeUtf8 $ s ++ "\n") of
+  Right r -> Just $ combineStringNodes r
+  _ -> Nothing
+  where
+    p :: Parser (Doc RdrName)
+    -- make sure that we don't swallow up whitespace belonging to next paragraph
+    p = mconcat <$> paragraph d `sepBy` some (optWs *> "\n")
+
+-- | A parser that parsers separate lines of the comments. Eventually
+-- called by 'parseParas'. Appends a newline character to the input string.
+-- Drops any whitespace in front of the input string. It's dropped for the sake of
+-- section headings.
+parseString :: DynFlags -> String -> Maybe (Doc RdrName)
+parseString d = parseString'' d . dropWhile isSpace
+
+-- | A parser that parsers separate lines of the comments. Eventually
+-- called by 'parseParas'. Appends a newline character to the input string.
+-- Unlike 'parseString', doesn't drop the preceding whitespace. Internal use.
+parseString'' :: DynFlags -> String -> Maybe (Doc RdrName)
+parseString'' d = parseString' d . (++ "\n")
+
+-- | An internal use function. Split from the 'parseString' is useful
+-- as we can specify separately when we want the newline to be appended.
+parseString' :: DynFlags -> String -> Maybe (Doc RdrName)
+parseString' d s = case parseOnly p (encodeUtf8 s) of
+  Right r -> Just $ combineStringNodes r
+  _ -> Nothing
+  where
+    p :: Parser (Doc RdrName)
+    p = mconcat <$> some (charEscape <|> monospace d <|> anchor <|> identifier d
+                          <|> moduleName <|> picture <|> url
+                          <|> emphasis d <|> encodedChar <|> string' <|> skipChar)
+
+-- | Parses and processes
+-- <https://en.wikipedia.org/wiki/Numeric_character_reference Numeric character references>
+--
+-- >>> parseOnly encodedChar "&#65;&#66;&#67;"
+-- Right (DocString "ABC")
+encodedChar :: Parser (Doc RdrName)
+encodedChar = "&#" *> c <* ";"
+  where
+    c = DocString . return . chr <$> num
+    num = hex <|> decimal
+    hex = ("x" <|> "X") *> hexadecimal
+
+-- | Plain, regular parser for text. Called as one of the last parsers
+-- to ensure that we have already given a chance to more meaningful parsers
+-- before capturing their characers.
+string' :: Parser (Doc RdrName)
+string' = DocString . decodeUtf8 <$> takeWhile1 (`notElem` "/<@\" &'`\\")
+
+-- | Emphasis parser.
+--
+-- >>> parseOnly emphasis "/Hello world/"
+-- Right (DocEmphasis (DocString "Hello world"))
+emphasis :: DynFlags -> Parser (Doc RdrName)
+emphasis d = stringBlock d id DocEmphasis "/" "/" "\n"
+
+-- | Skips a single character and treats it as a plain string.
+-- This is done to skip over any special characters belonging to other
+-- elements but which were not deemed meaningful at their positions.
+-- Note that this can only be used in places where we're absolutely certain
+-- no unicode is present, such as to skip a 100% certain ASCII delimeter.
+skipChar :: Parser (Doc RdrName)
+skipChar = DocString . return <$> anyChar
+
+-- | Treats the next character as a regular string, even if it's normally
+-- used for markup.
+charEscape :: Parser (Doc RdrName)
+charEscape = "\\" *> (DocString . return <$> A8.satisfy (/= '\n'))
+
+-- | Text anchors to allow for jumping around the generated documentation.
+--
+-- >>> parseOnly anchor "#Hello world#"
+-- Right (DocAName "Hello world")
+anchor :: Parser (Doc RdrName)
+anchor = DocAName . decodeUtf8 <$> ("#" *> takeWhile1 (`notElem` "#\n") <* "#")
+
+-- | Helper for markup structures surrounded with delimiters.
+stringBlock
+  :: DynFlags
+     -> (String -> String) -- ^ Function used to transform parsed out text
+                           -- before we send it to 'parseString''
+     -> (Doc RdrName -> Doc RdrName) -- ^ 'Doc' to wrap around the result
+     -> String -- ^ Opening delimiter
+     -> String -- ^ Closing delimiter
+     -> String -- ^ Additional characters to terminate parsing on
+     -> Parser (Doc RdrName)
+stringBlock d f doc op ed n = do
+  inner <- block op ed n
+  case parseString' d (f inner) of
+    Just r -> return $ doc r
+    _ -> fail $ "inner parse fail with op: ‘" ++ op ++ "’, ed: ‘" ++ ed ++ "’"
+
+-- | Returns sections of text delimited by specified text.
+block :: String -> String -> String -> Parser String
+block op ed n = reverse . drop (length ed) . reverse <$> block' op ed
+  where
+    block' op' ed' = string (encodeUtf8 op') *> mid
+      where
+        mid :: Parser String
+        mid = decodeUtf8 <$> string (encodeUtf8 ed')
+              <|> do
+                inner <- takeWithSkip (head ed') n
+                more <- decodeUtf8 <$> string (encodeUtf8 $ tail ed')
+                        <|> block' "" ed'  -- not full ending, take more
+                return $ inner ++ more
+
+
+-- | Takes all characters until the specified one. Unconditionally
+-- takes a character if it's escaped. Fails if it doesn't find the character or
+-- when the input string is empty.
+takeWithSkip :: Char -> String -> Parser String
+takeWithSkip s n = do
+  content <- decodeUtf8 <$> A8.scan (False, False) p >>= gotSome
+  if or (map (`elem` content) n) || last content /= s
+    then fail "failed in takeWithSkip"
+    else return content
+  where
+    gotSome [] = fail "EOF in takeWithSkip"
+    gotSome xs = return xs
+    -- Apparently ‘scan’ is so magical that it doesn't mangle unicode.
+    p (escaped, terminate) c
+      | terminate = Nothing -- swallows up that extra character
+      | escaped = Just (False, False)
+      | c == s = Just (False, True)
+      | otherwise = Just (c == '\\', False)
+
+-- | Monospaced strings.
+--
+-- >>> parseOnly (monospace dynflags) "@cruel@"
+-- Right (DocMonospaced (DocString "cruel"))
+monospace :: DynFlags -> Parser (Doc RdrName)
+monospace d = stringBlock d id DocMonospaced "@" "@" ""
+
+-- | Module name parser, surrounded by double quotes. This does a very primitive and
+-- purely syntactic checking so that obviously invalid names are not treated as valid
+-- and blindly hyperlinked (not starting with a capital letter or including spaces).
+moduleName :: Parser (Doc RdrName)
+moduleName = DocModule <$> ("\"" *> legalModule <* "\"")
+  where legalModule = do
+          n <- (:) <$> A8.satisfy (`elem` ['A' .. 'Z'])
+               <*> (decodeUtf8 <$> A8.takeWhile (`notElem` "\"\n"))
+
+          if any (`elem` n) " &[{}(=*)+]!#|@/;,^?"
+            then fail "invalid characters in module name"
+            else case n of
+            [] -> return []
+            _ -> if last n == '.' then fail "trailing dot in module name" else return n
+
+
+-- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify
+-- a title for the picture.
+--
+-- >>> parseOnly picture "<<hello.png>>"
+-- Right (DocPic (Picture "hello.png" Nothing))
+-- >>> parseOnly picture "<<hello.png world>>"
+-- Right (DocPic (Picture "hello.png" (Just "world")))
+picture :: Parser (Doc RdrName)
+picture = DocPic . makePicture . decodeUtf8 <$> ("<<" *> takeWhile1 (`notElem` ">\n") <* ">>")
+
+-- | Paragraph parser, called by 'parseParas'.
+paragraph :: DynFlags -> Parser (Doc RdrName)
+paragraph d = examples <|> skipSpace *> (list d <|> birdtracks <|> codeblock d
+                                         <|> property <|> textParagraph d)
+
+-- | List parser, called by 'paragraph'.
+list :: DynFlags -> Parser (Doc RdrName)
+list d = DocUnorderedList <$> unorderedList d
+         <|> DocOrderedList <$> orderedList d
+         <|> DocDefList <$> definitionList d
+
+-- | Parse given text with a provided parser, casting
+-- Nothing to a failure
+parseLine :: (String -> Maybe (Doc RdrName)) -- ^ Parser to use
+             -> (Doc RdrName -> a) -- ^ Doc function to wrap around the result
+             -> BS.ByteString -- ^ Text to parse
+             -> Parser a
+parseLine f doc str = maybe (fail "invalid string") (return . doc) (f $ decodeUtf8 str)
+
+-- | Parses unordered (bullet) lists.
+unorderedList :: DynFlags -> Parser [Doc RdrName]
+unorderedList d = ("*" <|> "-") *> innerList unorderedList d
+
+-- | Parses ordered lists (numbered or dashed).
+orderedList :: DynFlags -> Parser [Doc RdrName]
+orderedList d = skipSpace *> (paren <|> dot) *> innerList orderedList d
+  where
+    dot = decimal <* "."
+    paren = "(" *> (decimal :: Parser Int) <* ")"
+
+-- | Generic function collecting any further lines belonging to the
+-- list entry and recursively collecting any further lists in the
+-- same paragraph. Usually used as
+--
+-- > someListFunction dynflags = listBeginning *> innerList someListFunction dynflags
+innerList :: (DynFlags -> Parser [Doc RdrName]) -- ^ parser calling this function
+             -> DynFlags
+             -> Parser [Doc RdrName]
+innerList p d = do
+  cl <- do
+    content <- A8.takeWhile (/= '\n') <* "\n" -- allow empty
+    parseLine (parseString'' d) id content
+  ulcs <- many ulc
+  let contents = docParagraph $ mconcat $ cl : [x | Right x <- ulcs]
+      unLists = mconcat [x | Left x <- ulcs]
+  return $ contents : unLists
+  where
+    ulc :: Parser (Either [Doc RdrName] (Doc RdrName))
+    ulc = Left <$> (optWs *> p d)
+          <|> Right <$> nonEmptyLine d
+
+-- | Takes the remained of the line until the newline character
+-- and calls 'parseLine' using 'parseString'. Fails if it's made
+-- up strictly of whitespace.
+nonEmptyLine :: DynFlags -> Parser (Doc RdrName)
+nonEmptyLine d = do
+  s <- (takeWhile1 (/= '\n') >>= nonSpace) <* "\n"
+  parseLine (parseString'' d) id s
+  where
+    nonSpace xs
+      | not (any (not . isSpace) (decodeUtf8 xs)) = fail "empty line"
+      | otherwise = return xs
+
+-- | Parses definition lists.
+definitionList :: DynFlags -> Parser [(Doc RdrName, Doc RdrName)]
+definitionList d = do
+  _ <- "["
+  inner <- parseLine (parseString' d) id =<< takeWhile1 (`notElem` "]\n")
+  _ <- "]"
+  outer <- parseLine (parseString'' d) id =<< (A8.takeWhile (/= '\n') <* "\n")
+  ulcs <- many ulc
+  let contents = mconcat $ outer : [x | Right x <- ulcs]
+      unLists = map mconcat [x | Left x <- ulcs]
+  return $ (inner, contents) : unLists
+  where
+    ulc :: Parser (Either [(Doc RdrName, Doc RdrName)] (Doc RdrName))
+    ulc = Left <$> (optWs *> definitionList d)
+          <|> Right <$> nonEmptyLine d
+
+-- | Parses birdtracks. No further markup is parsed after the birdtrack.
+-- Consecutive birdtracks are allowed.
+birdtracks :: Parser (Doc RdrName)
+birdtracks = DocCodeBlock . mconcat . map (DocString . (++ "\n") . decodeUtf8) <$> line `sepBy1` "\n"
+  where
+    line = optWs *> ">" *> A8.takeWhile (/= '\n')
+
+-- | Parses examples. Examples are a paragraph level entitity (separated by an empty line).
+-- Consecutive examples are accepted.
+examples :: Parser (Doc RdrName)
+examples = DocExamples <$> example
+
+-- | Collects consecutive examples and their results.
+example :: Parser [Example]
+example = do
+  ws <- optWs
+  prompt <- decodeUtf8 <$> string ">>>"
+  expr <- (++ "\n") . decodeUtf8 <$> (A8.takeWhile (/= '\n') <* "\n")
+  results <- many result
+  let exs = concat [ e | Left e <- results ]
+      res = filter (not . null) [ r | Right r <- results ]
+  return $ makeExample (decodeUtf8 ws ++ prompt) expr res : exs
+  where
+    result = Left <$> example
+             <|> Right . decodeUtf8 <$> takeWhile1 (/= '\n') <* "\n"
+
+-- | Propery parser.
+--
+-- >>> parseOnly property "prop> hello world"
+-- Right (DocProperty "hello world")
+property :: Parser (Doc RdrName)
+property = do
+    _ <- skipSpace
+    s <- decodeUtf8 <$> (string "prop>" *> takeWhile1 (/= '\n'))
+    return $ makeProperty ("prop>" ++ s)
+
+-- | Paragraph level codeblock. Anything between the two delimiting @
+-- is parsed for markup.
+codeblock :: DynFlags -> Parser (Doc RdrName)
+codeblock d = do
+  -- Note that we don't need to use optWs here because in cases where
+  -- we don't see a \n immediatelly after the opening @, this parser
+  -- fails but we still have a chance to get a codeblock by getting
+  -- a monospaced doc on its own in the paragraph. With that, the cases
+  -- are covered. This should be updated if the implementation ever changes.
+  s <- parseString' d . ('\n':) . decodeUtf8 <$> ("@\n" *> block' <* "@")
+  maybe (fail "codeblock") (return . DocCodeBlock) s
+  where
+    block' = A8.scan False p
+      where
+        p isNewline c
+          | isNewline && c == '@' = Nothing
+          | otherwise = Just $ c == '\n'
+
+-- | Calls 'parseString'' on each line of a paragraph
+textParagraph :: DynFlags -> Parser (Doc RdrName)
+textParagraph d = do
+  s <- parseString' d . concatMap ((++ "\n") . decodeUtf8) <$> line `sepBy1` "\n"
+  maybe (fail "textParagraph") (return . docParagraph) s
+  where
+    line = takeWhile1 (/= '\n')
+
+-- | See 'picture' for adding a page title.
+url :: Parser (Doc RdrName)
+url = DocHyperlink . makeHyperlink . decodeUtf8 <$> ("<" *> takeWhile1 (`notElem` ">\n") <* ">")
+      <|> autoUrl
+
+-- | Naive implementation of auto-linking. Will link everything after
+-- @http://@, @https://@, @ftp://@, @ssh://@, @gopher://@ until a space.
+-- Single trailing punctuation character (.!?,) is split off.
+autoUrl :: Parser (Doc RdrName)
+autoUrl = do
+  link <- decodeUtf8 <$> urlLone
+  return $ formatLink link
+  where
+    urlLone = mappend <$> choice prefixes <*> takeWhile1 (not . isSpace)
+    prefixes = [ "http://", "https://", "ftp://"
+               , "ssh://", "gopher://" ]
+    formatLink :: String -> Doc RdrName
+    formatLink s = if last s `elem` ".!?,"
+                   then docAppend (DocHyperlink $ Hyperlink (init s) Nothing) (DocString [last s])
+                   else DocHyperlink $ Hyperlink s Nothing
+
+-- | Parses strings between identifier delimiters. Consumes all input that it
+-- deems to be valid in an identifier. Note that it simply blindly consumes
+-- characters and does no actual validation itself.
+parseValid :: Parser String
+parseValid = do
+  vs <- many' (A8.satisfy (`elem` "_.!#$%&*+/<=>?@\\?|-~:") <|> digit <|> letter_ascii)
+  c <- peekChar
+  case c of
+    Just '`' -> return vs
+    Just '\'' -> (do {c'' <- char '\''; y'' <- parseValid; return $ vs ++ [c''] ++ y''}) <|> return vs
+    _ -> fail "outofvalid"
+
+-- | Parses identifiers with help of 'parseValid'. Asks GHC for 'RdrName' from the
+-- string it deems valid.
+identifier :: DynFlags -> Parser (Doc RdrName)
+identifier dflags = do
+  o <- idDelim
+  vid <- parseValid
+  e <- idDelim
+  return $ validIdentifier $ o : (vid ++ [e])
+  where idDelim = char '\'' <|> char '`'
+        validIdentifier str = case parseIdent (tail $ init str) of
+          Just identName -> DocIdentifier identName
+          Nothing -> DocString str
+        parseIdent :: String -> Maybe RdrName
+        parseIdent str0 =
+          let buffer = stringToStringBuffer str0
+              realSrcLc = mkRealSrcLoc (mkFastString "<unknown file>") 0 0
+              pstate = mkPState dflags buffer realSrcLc
+          in case unP parseIdentifier pstate of
+            POk _ name -> Just (unLoc name)
+            _ -> Nothing
+
+-- | Remove all leading and trailing whitespace
+strip :: String -> String
+strip = (\f -> f . f) $ dropWhile isSpace . reverse
+
+-- | Consumes whitespace, excluding a newline.
+optWs :: Parser BS.ByteString
+optWs = A8.takeWhile (`elem` " \t\f\v\r")
+
+-- | Create an 'Example', stripping superfluous characters as appropriate.
+-- Remembers the amount of indentation used for the prompt.
+makeExample :: String -> String -> [String] -> Example
+makeExample prompt expression res =
+  Example (strip expression) result'       -- drop whitespace in expressions
+  where (prefix, _) = span isSpace prompt
+        result' = map substituteBlankLine $ filter (not . null) $ map (tryStripPrefix prefix) res
+          where tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys
+                substituteBlankLine "<BLANKLINE>" = ""
+                substituteBlankLine line          = line
+
+-- | Creates a 'Picture' with an optional title. Called by 'picture'.
+makePicture :: String -> Picture
+makePicture input = case break isSpace $ strip input of
+  (uri, "")    -> Picture uri Nothing
+  (uri, label) -> Picture uri (Just $ dropWhile isSpace label)
+
+-- | Creates a 'Hyperlink' with an optional title. Called by 'example'.
+makeHyperlink :: String -> Hyperlink
+makeHyperlink input = case break isSpace $ strip input of
+  (u, "")    -> Hyperlink u Nothing
+  (u, label) -> Hyperlink u (Just $ dropWhile isSpace label)
+
+-- | Makes a property that can be used by other programs for assertions.
+-- Drops whitespace around the property. Called by 'property'
+makeProperty :: String -> Doc RdrName
+makeProperty s = case strip s of
+  'p':'r':'o':'p':'>':xs ->
+    DocProperty (dropWhile isSpace xs)
+  xs ->
+    error $ "makeProperty: invalid input " ++ show xs
diff --git a/src/Haddock/Types.hs b/src/Haddock/Types.hs
index 73fafd6b..00a8b68f 100644
--- a/src/Haddock/Types.hs
+++ b/src/Haddock/Types.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE DeriveDataTypeable, DeriveFunctor, DeriveFoldable, DeriveTraversable #-}
+{-# LANGUAGE DeriveDataTypeable, DeriveFunctor, DeriveFoldable, DeriveTraversable, GADTs #-}
 {-# OPTIONS_GHC -fno-warn-orphans #-}
 -----------------------------------------------------------------------------
 -- |
@@ -19,7 +19,6 @@ module Haddock.Types (
   , HsDocString, LHsDocString
  ) where
 
-
 import Data.Foldable
 import Data.Traversable
 import Control.Exception
@@ -27,15 +26,12 @@ import Control.Arrow
 import Control.DeepSeq
 import Data.Typeable
 import Data.Map (Map)
-import Data.Maybe
 import qualified Data.Map as Map
-import Data.Monoid
 import GHC hiding (NoLink)
 import OccName
 import Control.Applicative (Applicative(..))
 import Control.Monad (ap)
 
-
 -----------------------------------------------------------------------------
 -- * Convenient synonyms
 -----------------------------------------------------------------------------
@@ -205,7 +201,7 @@ data ExportItem name
       , expItemSubs :: ![name]
       }
 
-  -- | A section heading. 
+  -- | A section heading.
   | ExportGroup
       {
         -- | Section level (1, 2, 3, ...).
@@ -230,11 +226,6 @@ data Documentation name = Documentation
   } deriving Functor
 
 
-combineDocumentation :: Documentation name -> Maybe (Doc name)
-combineDocumentation (Documentation Nothing Nothing) = Nothing
-combineDocumentation (Documentation mDoc mWarning)   = Just (fromMaybe mempty mWarning `mappend` fromMaybe mempty mDoc)
-
-
 -- | Arguments and result are indexed by Int, zero-based from the left,
 -- because that's the easiest to use when recursing over types.
 type FnArgsDoc name = Map Int (Doc name)
@@ -289,7 +280,6 @@ type DocInstance name = (InstHead name, Maybe (Doc name))
 -- of instance types.
 type InstHead name = ([HsType name], name, [HsType name])
 
-
 -----------------------------------------------------------------------------
 -- * Documentation comments
 -----------------------------------------------------------------------------
@@ -314,18 +304,12 @@ data Doc id
   | DocDefList [(Doc id, Doc id)]
   | DocCodeBlock (Doc id)
   | DocHyperlink Hyperlink
-  | DocPic String
+  | DocPic Picture
   | DocAName String
   | DocProperty String
   | DocExamples [Example]
   deriving (Functor, Foldable, Traversable)
 
-
-instance Monoid (Doc id) where
-  mempty  = DocEmpty
-  mappend = DocAppend
-
-
 instance NFData a => NFData (Doc a) where
   rnf doc = case doc of
     DocEmpty                  -> ()
@@ -360,9 +344,18 @@ data Hyperlink = Hyperlink
   } deriving (Eq, Show)
 
 
+data Picture = Picture
+  { pictureUri   :: String
+  , pictureTitle :: Maybe String
+  } deriving (Eq, Show)
+
+
 instance NFData Hyperlink where
   rnf (Hyperlink a b) = a `deepseq` b `deepseq` ()
 
+instance NFData Picture where
+  rnf (Picture a b) = a `deepseq` b `deepseq` ()
+
 
 data Example = Example
   { exampleExpression :: String
@@ -396,7 +389,7 @@ data DocMarkup id a = Markup
   , markupCodeBlock            :: a -> a
   , markupHyperlink            :: Hyperlink -> a
   , markupAName                :: String -> a
-  , markupPic                  :: String -> a
+  , markupPic                  :: Picture -> a
   , markupProperty             :: String -> a
   , markupExample              :: [Example] -> a
   }
diff --git a/src/Haddock/Utf8.hs b/src/Haddock/Utf8.hs
new file mode 100644
index 00000000..1fb0e818
--- /dev/null
+++ b/src/Haddock/Utf8.hs
@@ -0,0 +1,74 @@
+module Haddock.Utf8 (encodeUtf8, decodeUtf8) where
+import           Data.Bits ((.|.), (.&.), shiftL, shiftR)
+import qualified Data.ByteString as BS
+import           Data.Char (chr, ord)
+import           Data.Word (Word8)
+
+-- | Helper that encodes and packs a 'String' into a 'BS.ByteString'
+encodeUtf8 :: String -> BS.ByteString
+encodeUtf8 = BS.pack . encode
+
+-- | Helper that unpacks and decodes a 'BS.ByteString' into a 'String'
+decodeUtf8 :: BS.ByteString -> String
+decodeUtf8 = decode . BS.unpack
+
+-- Copy/pasted functions from Codec.Binary.UTF8.String for encoding/decoding
+-- | Character to use when 'encode' or 'decode' fail for a byte.
+replacementCharacter :: Char
+replacementCharacter = '\xfffd'
+
+-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
+encode :: String -> [Word8]
+encode = concatMap (map fromIntegral . go . ord)
+ where
+  go oc
+   | oc <= 0x7f       = [oc]
+
+   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+
+   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
+                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+
+-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
+decode :: [Word8] -> String
+decode [    ] = ""
+decode (c:cs)
+  | c < 0x80  = chr (fromEnum c) : decode cs
+  | c < 0xc0  = replacementCharacter : decode cs
+  | c < 0xe0  = multi1
+  | c < 0xf0  = multi_byte 2 0xf  0x800
+  | c < 0xf8  = multi_byte 3 0x7  0x10000
+  | c < 0xfc  = multi_byte 4 0x3  0x200000
+  | c < 0xfe  = multi_byte 5 0x1  0x4000000
+  | otherwise = replacementCharacter : decode cs
+  where
+    multi1 = case cs of
+      c1 : ds | c1 .&. 0xc0 == 0x80 ->
+        let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
+        in if d >= 0x000080 then toEnum d : decode ds
+                            else replacementCharacter : decode ds
+      _ -> replacementCharacter : decode cs
+
+    multi_byte :: Int -> Word8 -> Int -> String
+    multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
+      where
+        aux 0 rs acc
+          | overlong <= acc && acc <= 0x10ffff &&
+            (acc < 0xd800 || 0xdfff < acc)     &&
+            (acc < 0xfffe || 0xffff < acc)      = chr acc : decode rs
+          | otherwise = replacementCharacter : decode rs
+
+        aux n (r:rs) acc
+          | r .&. 0xc0 == 0x80 = aux (n-1) rs
+                               $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
+
+        aux _ rs     _ = replacementCharacter : decode rs
author	Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk>	2013-07-09 14:24:10 +0100
committer	Austin Seipp <austin@well-typed.com>	2014-01-12 14:48:35 -0600
commit	bb6cef20b82ef7a7f2d49f3ef6dc1a7ce880b5f0 (patch)
tree	ea07b1d4ab43169bc8d7074ff05bf1792c93feb0 /src
parent	c1228df0339d041b455bb993786a9ed6322c5e01 (diff)