1 files changed, 435 insertions, 0 deletions
diff --git a/src/Haddock/Parser.hs b/src/Haddock/Parser.hs
new file mode 100644
index 00000000..81e274ed
--- /dev/null
+++ b/src/Haddock/Parser.hs
@@ -0,0 +1,435 @@
+{-# LANGUAGE OverloadedStrings #-}
+-- |
+-- Module      :  Haddock.Parser
+-- Copyright   :  (c) Mateusz Kowalczyk 2013,
+--                    Simon Hengel      2013
+-- License     :  BSD-like
+--
+-- Maintainer  :  haddock@projects.haskell.org
+-- Stability   :  experimental
+-- Portability :  portable
+
+module Haddock.Parser (parseString, parseParas) where
+
+import           Control.Applicative
+import           Data.Attoparsec.ByteString   hiding (takeWhile1, take, inClass)
+import qualified Data.Attoparsec.ByteString.Char8 as A8
+import           Data.Attoparsec.ByteString.Char8 hiding (take, string)
+import qualified Data.ByteString as BS
+import           Data.Char (chr)
+import           Data.List (stripPrefix)
+import           Data.Maybe (fromMaybe)
+import           Data.Monoid
+import           DynFlags
+import           FastString (mkFastString)
+import           Haddock.Doc
+import           Haddock.Types
+import           Lexer (mkPState, unP, ParseResult(POk))
+import           Parser (parseIdentifier)
+import           RdrName
+import           SrcLoc (mkRealSrcLoc, unLoc)
+import           StringBuffer (stringToStringBuffer)
+import           Haddock.Utf8
+
+default (Int)
+
+-- | Main entry point to the parser. Appends the newline character
+-- to the input string.
+parseParas :: DynFlags
+              -> String -- ^ String to parse
+              -> Maybe (Doc RdrName)
+parseParas d s = case parseOnly (p <* skipSpace) (encodeUtf8 $ s ++ "\n") of
+  Right r -> Just $ combineStringNodes r
+  _ -> Nothing
+  where
+    p :: Parser (Doc RdrName)
+    -- make sure that we don't swallow up whitespace belonging to next paragraph
+    p = mconcat <$> paragraph d `sepBy` some (optWs *> "\n")
+
+-- | A parser that parsers separate lines of the comments. Eventually
+-- called by 'parseParas'. Appends a newline character to the input string.
+-- Drops any whitespace in front of the input string. It's dropped for the sake of
+-- section headings.
+parseString :: DynFlags -> String -> Maybe (Doc RdrName)
+parseString d = parseString'' d . dropWhile isSpace
+
+-- | A parser that parsers separate lines of the comments. Eventually
+-- called by 'parseParas'. Appends a newline character to the input string.
+-- Unlike 'parseString', doesn't drop the preceding whitespace. Internal use.
+parseString'' :: DynFlags -> String -> Maybe (Doc RdrName)
+parseString'' d = parseString' d . (++ "\n")
+
+-- | An internal use function. Split from the 'parseString' is useful
+-- as we can specify separately when we want the newline to be appended.
+parseString' :: DynFlags -> String -> Maybe (Doc RdrName)
+parseString' d s = case parseOnly p (encodeUtf8 s) of
+  Right r -> Just $ combineStringNodes r
+  _ -> Nothing
+  where
+    p :: Parser (Doc RdrName)
+    p = mconcat <$> some (charEscape <|> monospace d <|> anchor <|> identifier d
+                          <|> moduleName <|> picture <|> url
+                          <|> emphasis d <|> encodedChar <|> string' <|> skipChar)
+
+-- | Parses and processes
+-- <https://en.wikipedia.org/wiki/Numeric_character_reference Numeric character references>
+--
+-- >>> parseOnly encodedChar "&#65;&#66;&#67;"
+-- Right (DocString "ABC")
+encodedChar :: Parser (Doc RdrName)
+encodedChar = "&#" *> c <* ";"
+  where
+    c = DocString . return . chr <$> num
+    num = hex <|> decimal
+    hex = ("x" <|> "X") *> hexadecimal
+
+-- | Plain, regular parser for text. Called as one of the last parsers
+-- to ensure that we have already given a chance to more meaningful parsers
+-- before capturing their characers.
+string' :: Parser (Doc RdrName)
+string' = DocString . decodeUtf8 <$> takeWhile1 (`notElem` "/<@\" &'`\\")
+
+-- | Emphasis parser.
+--
+-- >>> parseOnly emphasis "/Hello world/"
+-- Right (DocEmphasis (DocString "Hello world"))
+emphasis :: DynFlags -> Parser (Doc RdrName)
+emphasis d = stringBlock d id DocEmphasis "/" "/" "\n"
+
+-- | Skips a single character and treats it as a plain string.
+-- This is done to skip over any special characters belonging to other
+-- elements but which were not deemed meaningful at their positions.
+-- Note that this can only be used in places where we're absolutely certain
+-- no unicode is present, such as to skip a 100% certain ASCII delimeter.
+skipChar :: Parser (Doc RdrName)
+skipChar = DocString . return <$> anyChar
+
+-- | Treats the next character as a regular string, even if it's normally
+-- used for markup.
+charEscape :: Parser (Doc RdrName)
+charEscape = "\\" *> (DocString . return <$> A8.satisfy (/= '\n'))
+
+-- | Text anchors to allow for jumping around the generated documentation.
+--
+-- >>> parseOnly anchor "#Hello world#"
+-- Right (DocAName "Hello world")
+anchor :: Parser (Doc RdrName)
+anchor = DocAName . decodeUtf8 <$> ("#" *> takeWhile1 (`notElem` "#\n") <* "#")
+
+-- | Helper for markup structures surrounded with delimiters.
+stringBlock
+  :: DynFlags
+     -> (String -> String) -- ^ Function used to transform parsed out text
+                           -- before we send it to 'parseString''
+     -> (Doc RdrName -> Doc RdrName) -- ^ 'Doc' to wrap around the result
+     -> String -- ^ Opening delimiter
+     -> String -- ^ Closing delimiter
+     -> String -- ^ Additional characters to terminate parsing on
+     -> Parser (Doc RdrName)
+stringBlock d f doc op ed n = do
+  inner <- block op ed n
+  case parseString' d (f inner) of
+    Just r -> return $ doc r
+    _ -> fail $ "inner parse fail with op: ‘" ++ op ++ "’, ed: ‘" ++ ed ++ "’"
+
+-- | Returns sections of text delimited by specified text.
+block :: String -> String -> String -> Parser String
+block op ed n = reverse . drop (length ed) . reverse <$> block' op ed
+  where
+    block' op' ed' = string (encodeUtf8 op') *> mid
+      where
+        mid :: Parser String
+        mid = decodeUtf8 <$> string (encodeUtf8 ed')
+              <|> do
+                inner <- takeWithSkip (head ed') n
+                more <- decodeUtf8 <$> string (encodeUtf8 $ tail ed')
+                        <|> block' "" ed'  -- not full ending, take more
+                return $ inner ++ more
+
+
+-- | Takes all characters until the specified one. Unconditionally
+-- takes a character if it's escaped. Fails if it doesn't find the character or
+-- when the input string is empty.
+takeWithSkip :: Char -> String -> Parser String
+takeWithSkip s n = do
+  content <- decodeUtf8 <$> A8.scan (False, False) p >>= gotSome
+  if or (map (`elem` content) n) || last content /= s
+    then fail "failed in takeWithSkip"
+    else return content
+  where
+    gotSome [] = fail "EOF in takeWithSkip"
+    gotSome xs = return xs
+    -- Apparently ‘scan’ is so magical that it doesn't mangle unicode.
+    p (escaped, terminate) c
+      | terminate = Nothing -- swallows up that extra character
+      | escaped = Just (False, False)
+      | c == s = Just (False, True)
+      | otherwise = Just (c == '\\', False)
+
+-- | Monospaced strings.
+--
+-- >>> parseOnly (monospace dynflags) "@cruel@"
+-- Right (DocMonospaced (DocString "cruel"))
+monospace :: DynFlags -> Parser (Doc RdrName)
+monospace d = stringBlock d id DocMonospaced "@" "@" ""
+
+-- | Module name parser, surrounded by double quotes. This does a very primitive and
+-- purely syntactic checking so that obviously invalid names are not treated as valid
+-- and blindly hyperlinked (not starting with a capital letter or including spaces).
+moduleName :: Parser (Doc RdrName)
+moduleName = DocModule <$> ("\"" *> legalModule <* "\"")
+  where legalModule = do
+          n <- (:) <$> A8.satisfy (`elem` ['A' .. 'Z'])
+               <*> (decodeUtf8 <$> A8.takeWhile (`notElem` "\"\n"))
+
+          if any (`elem` n) " &[{}(=*)+]!#|@/;,^?"
+            then fail "invalid characters in module name"
+            else case n of
+            [] -> return []
+            _ -> if last n == '.' then fail "trailing dot in module name" else return n
+
+
+-- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify
+-- a title for the picture.
+--
+-- >>> parseOnly picture "<<hello.png>>"
+-- Right (DocPic (Picture "hello.png" Nothing))
+-- >>> parseOnly picture "<<hello.png world>>"
+-- Right (DocPic (Picture "hello.png" (Just "world")))
+picture :: Parser (Doc RdrName)
+picture = DocPic . makePicture . decodeUtf8 <$> ("<<" *> takeWhile1 (`notElem` ">\n") <* ">>")
+
+-- | Paragraph parser, called by 'parseParas'.
+paragraph :: DynFlags -> Parser (Doc RdrName)
+paragraph d = examples <|> skipSpace *> (list d <|> birdtracks <|> codeblock d
+                                         <|> property <|> textParagraph d)
+
+-- | List parser, called by 'paragraph'.
+list :: DynFlags -> Parser (Doc RdrName)
+list d = DocUnorderedList <$> unorderedList d
+         <|> DocOrderedList <$> orderedList d
+         <|> DocDefList <$> definitionList d
+
+-- | Parse given text with a provided parser, casting
+-- Nothing to a failure
+parseLine :: (String -> Maybe (Doc RdrName)) -- ^ Parser to use
+             -> (Doc RdrName -> a) -- ^ Doc function to wrap around the result
+             -> BS.ByteString -- ^ Text to parse
+             -> Parser a
+parseLine f doc str = maybe (fail "invalid string") (return . doc) (f $ decodeUtf8 str)
+
+-- | Parses unordered (bullet) lists.
+unorderedList :: DynFlags -> Parser [Doc RdrName]
+unorderedList d = ("*" <|> "-") *> innerList unorderedList d
+
+-- | Parses ordered lists (numbered or dashed).
+orderedList :: DynFlags -> Parser [Doc RdrName]
+orderedList d = skipSpace *> (paren <|> dot) *> innerList orderedList d
+  where
+    dot = decimal <* "."
+    paren = "(" *> (decimal :: Parser Int) <* ")"
+
+-- | Generic function collecting any further lines belonging to the
+-- list entry and recursively collecting any further lists in the
+-- same paragraph. Usually used as
+--
+-- > someListFunction dynflags = listBeginning *> innerList someListFunction dynflags
+innerList :: (DynFlags -> Parser [Doc RdrName]) -- ^ parser calling this function
+             -> DynFlags
+             -> Parser [Doc RdrName]
+innerList p d = do
+  cl <- do
+    content <- A8.takeWhile (/= '\n') <* "\n" -- allow empty
+    parseLine (parseString'' d) id content
+  ulcs <- many ulc
+  let contents = docParagraph $ mconcat $ cl : [x | Right x <- ulcs]
+      unLists = mconcat [x | Left x <- ulcs]
+  return $ contents : unLists
+  where
+    ulc :: Parser (Either [Doc RdrName] (Doc RdrName))
+    ulc = Left <$> (optWs *> p d)
+          <|> Right <$> nonEmptyLine d
+
+-- | Takes the remained of the line until the newline character
+-- and calls 'parseLine' using 'parseString'. Fails if it's made
+-- up strictly of whitespace.
+nonEmptyLine :: DynFlags -> Parser (Doc RdrName)
+nonEmptyLine d = do
+  s <- (takeWhile1 (/= '\n') >>= nonSpace) <* "\n"
+  parseLine (parseString'' d) id s
+  where
+    nonSpace xs
+      | not (any (not . isSpace) (decodeUtf8 xs)) = fail "empty line"
+      | otherwise = return xs
+
+-- | Parses definition lists.
+definitionList :: DynFlags -> Parser [(Doc RdrName, Doc RdrName)]
+definitionList d = do
+  _ <- "["
+  inner <- parseLine (parseString' d) id =<< takeWhile1 (`notElem` "]\n")
+  _ <- "]"
+  outer <- parseLine (parseString'' d) id =<< (A8.takeWhile (/= '\n') <* "\n")
+  ulcs <- many ulc
+  let contents = mconcat $ outer : [x | Right x <- ulcs]
+      unLists = map mconcat [x | Left x <- ulcs]
+  return $ (inner, contents) : unLists
+  where
+    ulc :: Parser (Either [(Doc RdrName, Doc RdrName)] (Doc RdrName))
+    ulc = Left <$> (optWs *> definitionList d)
+          <|> Right <$> nonEmptyLine d
+
+-- | Parses birdtracks. No further markup is parsed after the birdtrack.
+-- Consecutive birdtracks are allowed.
+birdtracks :: Parser (Doc RdrName)
+birdtracks = DocCodeBlock . mconcat . map (DocString . (++ "\n") . decodeUtf8) <$> line `sepBy1` "\n"
+  where
+    line = optWs *> ">" *> A8.takeWhile (/= '\n')
+
+-- | Parses examples. Examples are a paragraph level entitity (separated by an empty line).
+-- Consecutive examples are accepted.
+examples :: Parser (Doc RdrName)
+examples = DocExamples <$> example
+
+-- | Collects consecutive examples and their results.
+example :: Parser [Example]
+example = do
+  ws <- optWs
+  prompt <- decodeUtf8 <$> string ">>>"
+  expr <- (++ "\n") . decodeUtf8 <$> (A8.takeWhile (/= '\n') <* "\n")
+  results <- many result
+  let exs = concat [ e | Left e <- results ]
+      res = filter (not . null) [ r | Right r <- results ]
+  return $ makeExample (decodeUtf8 ws ++ prompt) expr res : exs
+  where
+    result = Left <$> example
+             <|> Right . decodeUtf8 <$> takeWhile1 (/= '\n') <* "\n"
+
+-- | Propery parser.
+--
+-- >>> parseOnly property "prop> hello world"
+-- Right (DocProperty "hello world")
+property :: Parser (Doc RdrName)
+property = do
+    _ <- skipSpace
+    s <- decodeUtf8 <$> (string "prop>" *> takeWhile1 (/= '\n'))
+    return $ makeProperty ("prop>" ++ s)
+
+-- | Paragraph level codeblock. Anything between the two delimiting @
+-- is parsed for markup.
+codeblock :: DynFlags -> Parser (Doc RdrName)
+codeblock d = do
+  -- Note that we don't need to use optWs here because in cases where
+  -- we don't see a \n immediatelly after the opening @, this parser
+  -- fails but we still have a chance to get a codeblock by getting
+  -- a monospaced doc on its own in the paragraph. With that, the cases
+  -- are covered. This should be updated if the implementation ever changes.
+  s <- parseString' d . ('\n':) . decodeUtf8 <$> ("@\n" *> block' <* "@")
+  maybe (fail "codeblock") (return . DocCodeBlock) s
+  where
+    block' = A8.scan False p
+      where
+        p isNewline c
+          | isNewline && c == '@' = Nothing
+          | otherwise = Just $ c == '\n'
+
+-- | Calls 'parseString'' on each line of a paragraph
+textParagraph :: DynFlags -> Parser (Doc RdrName)
+textParagraph d = do
+  s <- parseString' d . concatMap ((++ "\n") . decodeUtf8) <$> line `sepBy1` "\n"
+  maybe (fail "textParagraph") (return . docParagraph) s
+  where
+    line = takeWhile1 (/= '\n')
+
+-- | See 'picture' for adding a page title.
+url :: Parser (Doc RdrName)
+url = DocHyperlink . makeHyperlink . decodeUtf8 <$> ("<" *> takeWhile1 (`notElem` ">\n") <* ">")
+      <|> autoUrl
+
+-- | Naive implementation of auto-linking. Will link everything after
+-- @http://@, @https://@, @ftp://@, @ssh://@, @gopher://@ until a space.
+-- Single trailing punctuation character (.!?,) is split off.
+autoUrl :: Parser (Doc RdrName)
+autoUrl = do
+  link <- decodeUtf8 <$> urlLone
+  return $ formatLink link
+  where
+    urlLone = mappend <$> choice prefixes <*> takeWhile1 (not . isSpace)
+    prefixes = [ "http://", "https://", "ftp://"
+               , "ssh://", "gopher://" ]
+    formatLink :: String -> Doc RdrName
+    formatLink s = if last s `elem` ".!?,"
+                   then docAppend (DocHyperlink $ Hyperlink (init s) Nothing) (DocString [last s])
+                   else DocHyperlink $ Hyperlink s Nothing
+
+-- | Parses strings between identifier delimiters. Consumes all input that it
+-- deems to be valid in an identifier. Note that it simply blindly consumes
+-- characters and does no actual validation itself.
+parseValid :: Parser String
+parseValid = do
+  vs <- many' (A8.satisfy (`elem` "_.!#$%&*+/<=>?@\\?|-~:") <|> digit <|> letter_ascii)
+  c <- peekChar
+  case c of
+    Just '`' -> return vs
+    Just '\'' -> (do {c'' <- char '\''; y'' <- parseValid; return $ vs ++ [c''] ++ y''}) <|> return vs
+    _ -> fail "outofvalid"
+
+-- | Parses identifiers with help of 'parseValid'. Asks GHC for 'RdrName' from the
+-- string it deems valid.
+identifier :: DynFlags -> Parser (Doc RdrName)
+identifier dflags = do
+  o <- idDelim
+  vid <- parseValid
+  e <- idDelim
+  return $ validIdentifier $ o : (vid ++ [e])
+  where idDelim = char '\'' <|> char '`'
+        validIdentifier str = case parseIdent (tail $ init str) of
+          Just identName -> DocIdentifier identName
+          Nothing -> DocString str
+        parseIdent :: String -> Maybe RdrName
+        parseIdent str0 =
+          let buffer = stringToStringBuffer str0
+              realSrcLc = mkRealSrcLoc (mkFastString "<unknown file>") 0 0
+              pstate = mkPState dflags buffer realSrcLc
+          in case unP parseIdentifier pstate of
+            POk _ name -> Just (unLoc name)
+            _ -> Nothing
+
+-- | Remove all leading and trailing whitespace
+strip :: String -> String
+strip = (\f -> f . f) $ dropWhile isSpace . reverse
+
+-- | Consumes whitespace, excluding a newline.
+optWs :: Parser BS.ByteString
+optWs = A8.takeWhile (`elem` " \t\f\v\r")
+
+-- | Create an 'Example', stripping superfluous characters as appropriate.
+-- Remembers the amount of indentation used for the prompt.
+makeExample :: String -> String -> [String] -> Example
+makeExample prompt expression res =
+  Example (strip expression) result'       -- drop whitespace in expressions
+  where (prefix, _) = span isSpace prompt
+        result' = map substituteBlankLine $ filter (not . null) $ map (tryStripPrefix prefix) res
+          where tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys
+                substituteBlankLine "<BLANKLINE>" = ""
+                substituteBlankLine line          = line
+
+-- | Creates a 'Picture' with an optional title. Called by 'picture'.
+makePicture :: String -> Picture
+makePicture input = case break isSpace $ strip input of
+  (uri, "")    -> Picture uri Nothing
+  (uri, label) -> Picture uri (Just $ dropWhile isSpace label)
+
+-- | Creates a 'Hyperlink' with an optional title. Called by 'example'.
+makeHyperlink :: String -> Hyperlink
+makeHyperlink input = case break isSpace $ strip input of
+  (u, "")    -> Hyperlink u Nothing
+  (u, label) -> Hyperlink u (Just $ dropWhile isSpace label)
+
+-- | Makes a property that can be used by other programs for assertions.
+-- Drops whitespace around the property. Called by 'property'
+makeProperty :: String -> Doc RdrName
+makeProperty s = case strip s of
+  'p':'r':'o':'p':'>':xs ->
+    DocProperty (dropWhile isSpace xs)
+  xs ->
+    error $ "makeProperty: invalid input " ++ show xs