5 files changed, 739 insertions, 0 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Doc.hs b/haddock-library/src/Documentation/Haddock/Doc.hs
new file mode 100644
index 00000000..4d6c10a4
--- /dev/null
+++ b/haddock-library/src/Documentation/Haddock/Doc.hs
@@ -0,0 +1,60 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module Documentation.Haddock.Doc (docParagraph) where
+
+import Data.Monoid
+import Documentation.Haddock.Types
+import Data.Char (isSpace)
+
+-- We put it here so that we can avoid a circular import
+-- anything relevant imports this module anyway
+instance Monoid (DocH mod id) where
+  mempty  = DocEmpty
+  mappend = docAppend
+
+docAppend :: DocH mod id -> DocH mod id -> DocH mod id
+docAppend (DocDefList ds1) (DocDefList ds2) = DocDefList (ds1++ds2)
+docAppend (DocDefList ds1) (DocAppend (DocDefList ds2) d) = DocAppend (DocDefList (ds1++ds2)) d
+docAppend (DocOrderedList ds1) (DocOrderedList ds2) = DocOrderedList (ds1 ++ ds2)
+docAppend (DocUnorderedList ds1) (DocUnorderedList ds2) = DocUnorderedList (ds1 ++ ds2)
+docAppend DocEmpty d = d
+docAppend d DocEmpty = d
+docAppend (DocString s1) (DocString s2) = DocString (s1 ++ s2)
+docAppend (DocAppend d (DocString s1)) (DocString s2) = DocAppend d (DocString (s1 ++ s2))
+docAppend (DocString s1) (DocAppend (DocString s2) d) = DocAppend (DocString (s1 ++ s2)) d
+docAppend d1 d2 = DocAppend d1 d2
+
+-- again to make parsing easier - we spot a paragraph whose only item
+-- is a DocMonospaced and make it into a DocCodeBlock
+docParagraph :: DocH mod id -> DocH mod id
+docParagraph (DocMonospaced p)
+  = DocCodeBlock (docCodeBlock p)
+docParagraph (DocAppend (DocString s1) (DocMonospaced p))
+  | all isSpace s1
+  = DocCodeBlock (docCodeBlock p)
+docParagraph (DocAppend (DocString s1)
+    (DocAppend (DocMonospaced p) (DocString s2)))
+  | all isSpace s1 && all isSpace s2
+  = DocCodeBlock (docCodeBlock p)
+docParagraph (DocAppend (DocMonospaced p) (DocString s2))
+  | all isSpace s2
+  = DocCodeBlock (docCodeBlock p)
+docParagraph p
+  = DocParagraph p
+
+
+-- Drop trailing whitespace from @..@ code blocks.  Otherwise this:
+--
+--    -- @
+--    -- foo
+--    -- @
+--
+-- turns into (DocCodeBlock "\nfoo\n ") which when rendered in HTML
+-- gives an extra vertical space after the code block.  The single space
+-- on the final line seems to trigger the extra vertical space.
+--
+docCodeBlock :: DocH mod id -> DocH mod id
+docCodeBlock (DocString s)
+  = DocString (reverse $ dropWhile (`elem` " \t") $ reverse s)
+docCodeBlock (DocAppend l r)
+  = DocAppend l (docCodeBlock r)
+docCodeBlock d = d
diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs
new file mode 100644
index 00000000..3d146d33
--- /dev/null
+++ b/haddock-library/src/Documentation/Haddock/Parser.hs
@@ -0,0 +1,474 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE StandaloneDeriving #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE UndecidableInstances #-}
+{-# LANGUAGE IncoherentInstances #-}
+{-# LANGUAGE LambdaCase #-}
+-- |
+-- Module      :  Documentation.Haddock.Parser
+-- Copyright   :  (c) Mateusz Kowalczyk 2013-2014,
+--                    Simon Hengel      2013
+-- License     :  BSD-like
+--
+-- Maintainer  :  haddock@projects.haskell.org
+-- Stability   :  experimental
+-- Portability :  portable
+
+module Documentation.Haddock.Parser ( parseString, parseParas
+                                    , overIdentifier, toRegular, Identifier
+                                    ) where
+
+import           Control.Applicative
+import           Control.Arrow (first)
+import           Control.Monad (void, mfilter)
+import           Data.Attoparsec.ByteString.Char8 hiding (parse, take, endOfLine)
+import qualified Data.ByteString.Char8 as BS
+import           Data.Char (chr, isAsciiUpper)
+import           Data.List (stripPrefix, intercalate, unfoldr)
+import           Data.Maybe (fromMaybe)
+import           Data.Monoid
+import           Documentation.Haddock.Doc
+import           Documentation.Haddock.Parser.Util
+import           Documentation.Haddock.Types
+import           Documentation.Haddock.Utf8
+import           Prelude hiding (takeWhile)
+
+-- | Identifier string surrounded with opening and closing quotes/backticks.
+type Identifier = (Char, String, Char)
+
+-- | Drops the quotes/backticks around all identifiers, as if they
+-- were valid but still 'String's.
+toRegular :: DocH mod Identifier -> DocH mod String
+toRegular = fmap (\(_, x, _) -> x)
+
+-- | Maps over 'DocIdentifier's over 'String' with potentially failing
+-- conversion using user-supplied function. If the conversion fails,
+-- the identifier is deemed to not be valid and is treated as a
+-- regular string.
+overIdentifier :: (String -> Maybe a)
+               -> DocH mod Identifier
+               -> DocH mod a
+overIdentifier f d = g d
+  where
+    g (DocIdentifier (o, x, e)) = case f x of
+      Nothing -> DocString $ o : x ++ [e]
+      Just x' -> DocIdentifier x'
+    g DocEmpty = DocEmpty
+    g (DocAppend x x') = DocAppend (g x) (g x')
+    g (DocString x) = DocString x
+    g (DocParagraph x) = DocParagraph $ g x
+    g (DocIdentifierUnchecked x) = DocIdentifierUnchecked x
+    g (DocModule x) = DocModule x
+    g (DocWarning x) = DocWarning $ g x
+    g (DocEmphasis x) = DocEmphasis $ g x
+    g (DocMonospaced x) = DocMonospaced $ g x
+    g (DocBold x) = DocBold $ g x
+    g (DocUnorderedList x) = DocUnorderedList $ fmap g x
+    g (DocOrderedList x) = DocOrderedList $ fmap g x
+    g (DocDefList x) = DocDefList $ fmap (\(y, z) -> (g y, g z)) x
+    g (DocCodeBlock x) = DocCodeBlock $ g x
+    g (DocHyperlink x) = DocHyperlink x
+    g (DocPic x) = DocPic x
+    g (DocAName x) = DocAName x
+    g (DocProperty x) = DocProperty x
+    g (DocExamples x) = DocExamples x
+    g (DocHeader (Header l x)) = DocHeader . Header l $ g x
+
+parse :: Parser a -> BS.ByteString -> a
+parse p = either err id . parseOnly (p <* endOfInput)
+  where
+    err = error . ("Haddock.Parser.parse: " ++)
+
+-- | Main entry point to the parser. Appends the newline character
+-- to the input string.
+parseParas :: String -- ^ String to parse
+           -> DocH mod Identifier
+parseParas = parse (p <* skipSpace) . encodeUtf8 . (++ "\n")
+  where
+    p :: Parser (DocH mod Identifier)
+    p = mconcat <$> paragraph `sepBy` many (skipHorizontalSpace *> "\n")
+
+-- | Parse a text paragraph. Actually just a wrapper over 'parseStringBS' which
+-- drops leading whitespace and encodes the string to UTF8 first.
+parseString :: String -> DocH mod Identifier
+parseString = parseStringBS . encodeUtf8 . dropWhile isSpace
+
+parseStringBS :: BS.ByteString -> DocH mod Identifier
+parseStringBS = parse p
+  where
+    p :: Parser (DocH mod Identifier)
+    p = mconcat <$> many (monospace <|> anchor <|> identifier
+                          <|> moduleName <|> picture <|> hyperlink <|> autoUrl <|> bold
+                          <|> emphasis <|> encodedChar <|> string' <|> skipSpecialChar)
+
+-- | Parses and processes
+-- <https://en.wikipedia.org/wiki/Numeric_character_reference Numeric character references>
+--
+-- >>> parseOnly encodedChar "&#65;&#66;&#67;"
+-- Right (DocString "ABC")
+encodedChar :: Parser (DocH mod a)
+encodedChar = "&#" *> c <* ";"
+  where
+    c = DocString . return . chr <$> num
+    num = hex <|> decimal
+    hex = ("x" <|> "X") *> hexadecimal
+
+specialChar :: [Char]
+specialChar = "_/<@\"&'`"
+
+-- | Plain, regular parser for text. Called as one of the last parsers
+-- to ensure that we have already given a chance to more meaningful parsers
+-- before capturing their characers.
+string' :: Parser (DocH mod a)
+string' = DocString . unescape . decodeUtf8 <$> takeWhile1_ (`notElem` specialChar)
+  where
+    unescape "" = ""
+    unescape ('\\':x:xs) = x : unescape xs
+    unescape (x:xs) = x : unescape xs
+
+-- | Skips a single special character and treats it as a plain string.
+-- This is done to skip over any special characters belonging to other
+-- elements but which were not deemed meaningful at their positions.
+skipSpecialChar :: Parser (DocH mod a)
+skipSpecialChar = DocString . return <$> satisfy (`elem` specialChar)
+
+-- | Emphasis parser.
+--
+-- >>> parseOnly emphasis "/Hello world/"
+-- Right (DocEmphasis (DocString "Hello world"))
+emphasis :: Parser (DocH mod Identifier)
+emphasis = DocEmphasis . parseStringBS <$>
+  mfilter ('\n' `BS.notElem`) ("/" *> takeWhile1_ (/= '/') <* "/")
+
+-- | Bold parser.
+--
+-- >>> parseOnly bold "__Hello world__"
+-- Right (DocBold (DocString "Hello world"))
+bold :: Parser (DocH mod Identifier)
+bold = DocBold . parseStringBS <$> disallowNewline ("__" *> takeUntil "__")
+
+disallowNewline :: Parser BS.ByteString -> Parser BS.ByteString
+disallowNewline = mfilter ('\n' `BS.notElem`)
+
+-- | Like `takeWhile`, but unconditionally take escaped characters.
+takeWhile_ :: (Char -> Bool) -> Parser BS.ByteString
+takeWhile_ p = scan False p_
+  where
+    p_ escaped c
+      | escaped = Just False
+      | not $ p c = Nothing
+      | otherwise = Just (c == '\\')
+
+-- | Like `takeWhile1`, but unconditionally take escaped characters.
+takeWhile1_ :: (Char -> Bool) -> Parser BS.ByteString
+takeWhile1_ = mfilter (not . BS.null) . takeWhile_
+
+-- | Text anchors to allow for jumping around the generated documentation.
+--
+-- >>> parseOnly anchor "#Hello world#"
+-- Right (DocAName "Hello world")
+anchor :: Parser (DocH mod a)
+anchor = DocAName . decodeUtf8 <$> ("#" *> takeWhile1 (`notElem` "#\n") <* "#")
+
+-- | Monospaced strings.
+--
+-- >>> parseOnly monospace "@cruel@"
+-- Right (DocMonospaced (DocString "cruel"))
+monospace :: Parser (DocH mod Identifier)
+monospace = DocMonospaced . parseStringBS <$> ("@" *> takeWhile1_ (/= '@') <* "@")
+
+moduleName :: Parser (DocH mod a)
+moduleName = DocModule <$> (char '"' *> modid <* char '"')
+  where
+    modid = intercalate "." <$> conid `sepBy1` "."
+    conid = (:)
+      <$> satisfy isAsciiUpper
+      -- NOTE: According to Haskell 2010 we shouldd actually only
+      -- accept {small | large | digit | ' } here.  But as we can't
+      -- match on unicode characters, this is currently not possible.
+      <*> (decodeUtf8 <$> takeWhile (`notElem` " .&[{}(=*)+]!#|@/;,^?\"\n"))
+
+-- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify
+-- a title for the picture.
+--
+-- >>> parseOnly picture "<<hello.png>>"
+-- Right (DocPic (Picture "hello.png" Nothing))
+-- >>> parseOnly picture "<<hello.png world>>"
+-- Right (DocPic (Picture "hello.png" (Just "world")))
+picture :: Parser (DocH mod a)
+picture = DocPic . makeLabeled Picture . decodeUtf8
+          <$> disallowNewline ("<<" *> takeUntil ">>")
+
+-- | Paragraph parser, called by 'parseParas'.
+paragraph :: Parser (DocH mod Identifier)
+paragraph = examples <|> skipSpace *> (list <|> birdtracks <|> codeblock
+                                       <|> property <|> header
+                                       <|> textParagraph)
+
+header :: Parser (DocH mod Identifier)
+header = do
+  let psers = map (string . encodeUtf8 . concat . flip replicate "=") [6, 5 .. 1]
+      pser = foldl1 (<|>) psers
+  delim <- decodeUtf8 <$> pser
+  line <- skipHorizontalSpace *> nonEmptyLine >>= return . parseString
+  rest <- paragraph <|> return mempty
+  return $ DocParagraph (DocHeader (Header (length delim) line)) <> rest
+
+textParagraph :: Parser (DocH mod Identifier)
+textParagraph = docParagraph . parseString . intercalate "\n" <$> many1 nonEmptyLine
+
+-- | List parser, called by 'paragraph'.
+list :: Parser (DocH mod Identifier)
+list = DocUnorderedList <$> unorderedList
+       <|> DocOrderedList <$> orderedList
+       <|> DocDefList <$> definitionList
+
+-- | Parses unordered (bullet) lists.
+unorderedList :: Parser [DocH mod Identifier]
+unorderedList = ("*" <|> "-") *> innerList unorderedList
+
+-- | Parses ordered lists (numbered or dashed).
+orderedList :: Parser [DocH mod Identifier]
+orderedList = (paren <|> dot) *> innerList orderedList
+  where
+    dot = (decimal :: Parser Int) <* "."
+    paren = "(" *> decimal <* ")"
+
+-- | Generic function collecting any further lines belonging to the
+-- list entry and recursively collecting any further lists in the
+-- same paragraph. Usually used as
+--
+-- > someListFunction = listBeginning *> innerList someListFunction
+innerList :: Parser [DocH mod Identifier] -> Parser [DocH mod Identifier]
+innerList item = do
+  c <- takeLine
+  (cs, items) <- more item
+  let contents = docParagraph . parseString . dropNLs . unlines $ c : cs
+  return $ case items of
+    Left p -> [contents `joinPara` p]
+    Right i -> contents : i
+
+-- | Parses definition lists.
+definitionList :: Parser [(DocH mod Identifier, DocH mod Identifier)]
+definitionList = do
+  label <- "[" *> (parseStringBS <$> takeWhile1 (`notElem` "]\n")) <* "]"
+  c <- takeLine
+  (cs, items) <- more definitionList
+  let contents = parseString . dropNLs . unlines $ c : cs
+  return $ case items of
+    Left p -> [(label, contents `joinPara` p)]
+    Right i -> (label, contents) : i
+
+-- | If possible, appends two 'Doc's under a 'DocParagraph' rather than
+-- outside of it. This allows to get structures like
+--
+-- @DocParagraph (DocAppend … …)@
+--
+-- rather than
+--
+-- @DocAppend (DocParagraph …) …@
+joinPara :: DocH mod id -> DocH mod id -> DocH mod id
+joinPara (DocParagraph p) c = docParagraph $ p <> c
+joinPara d p = d <> p
+
+-- | Drops all trailing newlines.
+dropNLs :: String -> String
+dropNLs = reverse . dropWhile (== '\n') . reverse
+
+-- | Main worker for 'innerList' and 'definitionList'.
+-- We need the 'Either' here to be able to tell in the respective functions
+-- whether we're dealing with the next list or a nested paragraph.
+more :: Monoid a => Parser a
+     -> Parser ([String], Either (DocH mod Identifier) a)
+more item = innerParagraphs <|> moreListItems item
+            <|> moreContent item <|> pure ([], Right mempty)
+
+-- | Use by 'innerList' and 'definitionList' to parse any nested paragraphs.
+innerParagraphs :: Parser ([String], Either (DocH mod Identifier) a)
+innerParagraphs = (,) [] . Left <$> ("\n" *> indentedParagraphs)
+
+-- | Attemps to fetch the next list if possibly. Used by 'innerList' and
+-- 'definitionList' to recursivly grab lists that aren't separated by a whole
+-- paragraph.
+moreListItems :: Parser a
+              -> Parser ([String], Either (DocH mod Identifier) a)
+moreListItems item = (,) [] . Right <$> (skipSpace *> item)
+
+-- | Helper for 'innerList' and 'definitionList' which simply takes
+-- a line of text and attempts to parse more list content with 'more'.
+moreContent :: Monoid a => Parser a
+            -> Parser ([String], Either (DocH mod Identifier) a)
+moreContent item = first . (:) <$> nonEmptyLine <*> more item
+
+-- | Runs the 'parseParas' parser on an indented paragraph.
+-- The indentation is 4 spaces.
+indentedParagraphs :: Parser (DocH mod Identifier)
+indentedParagraphs = parseParas . concat <$> dropFrontOfPara "    "
+
+-- | Grab as many fully indented paragraphs as we can.
+dropFrontOfPara :: Parser BS.ByteString -> Parser [String]
+dropFrontOfPara sp = do
+  currentParagraph <- some (sp *> takeNonEmptyLine)
+  followingParagraphs <-
+    skipHorizontalSpace *> nextPar -- we have more paragraphs to take
+    <|> skipHorizontalSpace *> nlList -- end of the ride, remember the newline
+    <|> endOfInput *> return [] -- nothing more to take at all
+  return (currentParagraph ++ followingParagraphs)
+  where
+    nextPar = (++) <$> nlList <*> dropFrontOfPara sp
+    nlList = "\n" *> return ["\n"]
+
+nonSpace :: BS.ByteString -> Parser BS.ByteString
+nonSpace xs
+  | not $ any (not . isSpace) $ decodeUtf8 xs = fail "empty line"
+  | otherwise = return xs
+
+-- | Takes a non-empty, not fully whitespace line.
+--
+--  Doesn't discard the trailing newline.
+takeNonEmptyLine :: Parser String
+takeNonEmptyLine = do
+    (++ "\n") . decodeUtf8 <$> (takeWhile1 (/= '\n') >>= nonSpace) <* "\n"
+
+birdtracks :: Parser (DocH mod a)
+birdtracks = DocCodeBlock . DocString . intercalate "\n" . stripSpace <$> many1 line
+  where
+    line = skipHorizontalSpace *> ">" *> takeLine
+
+stripSpace :: [String] -> [String]
+stripSpace = fromMaybe <*> mapM strip'
+  where
+    strip' (' ':xs') = Just xs'
+    strip' "" = Just ""
+    strip' _  = Nothing
+
+-- | Parses examples. Examples are a paragraph level entitity (separated by an empty line).
+-- Consecutive examples are accepted.
+examples :: Parser (DocH mod a)
+examples = DocExamples <$> (many (skipHorizontalSpace *> "\n") *> go)
+  where
+    go :: Parser [Example]
+    go = do
+      prefix <- decodeUtf8 <$> takeHorizontalSpace <* ">>>"
+      expr <- takeLine
+      (rs, es) <- resultAndMoreExamples
+      return (makeExample prefix expr rs : es)
+      where
+        resultAndMoreExamples :: Parser ([String], [Example])
+        resultAndMoreExamples = moreExamples <|> result <|> pure ([], [])
+          where
+            moreExamples :: Parser ([String], [Example])
+            moreExamples = (,) [] <$> go
+
+            result :: Parser ([String], [Example])
+            result = first . (:) <$> nonEmptyLine <*> resultAndMoreExamples
+
+    makeExample :: String -> String -> [String] -> Example
+    makeExample prefix expression res =
+      Example (strip expression) result
+      where
+        result = map (substituteBlankLine . tryStripPrefix) res
+
+        tryStripPrefix xs = fromMaybe xs (stripPrefix prefix xs)
+
+        substituteBlankLine "<BLANKLINE>" = ""
+        substituteBlankLine xs = xs
+
+nonEmptyLine :: Parser String
+nonEmptyLine = mfilter (any (not . isSpace)) takeLine
+
+takeLine :: Parser String
+takeLine = decodeUtf8 <$> takeWhile (/= '\n') <* endOfLine
+
+endOfLine :: Parser ()
+endOfLine = void "\n" <|> endOfInput
+
+-- | Property parser.
+--
+-- >>> parseOnly property "prop> hello world"
+-- Right (DocProperty "hello world")
+property :: Parser (DocH mod a)
+property = DocProperty . strip . decodeUtf8 <$> ("prop>" *> takeWhile1 (/= '\n'))
+
+-- |
+-- Paragraph level codeblock. Anything between the two delimiting @ is parsed
+-- for markup.
+codeblock :: Parser (DocH mod Identifier)
+codeblock =
+  DocCodeBlock . parseStringBS . dropSpaces
+  <$> ("@" *> skipHorizontalSpace *> "\n" *> block' <* "@")
+  where
+    dropSpaces xs =
+      let rs = decodeUtf8 xs
+      in case splitByNl rs of
+        [] -> xs
+        ys -> case last ys of
+          ' ':_ -> case mapM dropSpace ys of
+            Nothing -> xs
+            Just zs -> encodeUtf8 $ intercalate "\n" zs
+          _ -> xs
+
+    -- This is necessary because ‘lines’ swallows up a trailing newline
+    -- and we lose information about whether the last line belongs to @ or to
+    -- text which we need to decide whether we actually want to be dropping
+    -- anything at all.
+    splitByNl = unfoldr (\case '\n':s -> Just (span (/= '\n') s)
+                               _      -> Nothing)
+                . ('\n' :)
+
+    dropSpace "" = Just ""
+    dropSpace (' ':xs) = Just xs
+    dropSpace _ = Nothing
+
+    block' = scan False p
+      where
+        p isNewline c
+          | isNewline && c == '@' = Nothing
+          | isNewline && isSpace c = Just isNewline
+          | otherwise = Just $ c == '\n'
+
+hyperlink :: Parser (DocH mod a)
+hyperlink = DocHyperlink . makeLabeled Hyperlink . decodeUtf8
+              <$> disallowNewline ("<" *> takeUntil ">")
+            <|> autoUrl
+
+autoUrl :: Parser (DocH mod a)
+autoUrl = mkLink <$> url
+  where
+    url = mappend <$> ("http://" <|> "https://" <|> "ftp://") <*> takeWhile1 (not . isSpace)
+    mkLink :: BS.ByteString -> DocH mod a
+    mkLink s = case BS.unsnoc s of
+      Just (xs, x) | x `elem` ",.!?" -> DocHyperlink (Hyperlink (decodeUtf8 xs) Nothing) <> DocString [x]
+      _ -> DocHyperlink (Hyperlink (decodeUtf8 s) Nothing)
+
+-- | Parses strings between identifier delimiters. Consumes all input that it
+-- deems to be valid in an identifier. Note that it simply blindly consumes
+-- characters and does no actual validation itself.
+parseValid :: Parser String
+parseValid = do
+  vs' <- many' $ utf8String "⋆" <|> return <$> idChar
+  let vs = concat vs'
+  c <- peekChar
+  case c of
+    Just '`' -> return vs
+    Just '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> parseValid)
+                 <|> return vs
+    _ -> fail "outofvalid"
+  where
+    idChar = satisfy (`elem` "_.!#$%&*+/<=>?@\\|-~:^")
+             <|> digit <|> letter_ascii
+
+-- | Parses UTF8 strings from ByteString streams.
+utf8String :: String -> Parser String
+utf8String x = decodeUtf8 <$> string (encodeUtf8 x)
+
+-- | Parses identifiers with help of 'parseValid'. Asks GHC for 'String' from the
+-- string it deems valid.
+identifier :: Parser (DocH mod Identifier)
+identifier = do
+  o <- idDelim
+  vid <- parseValid
+  e <- idDelim
+  return $ DocIdentifier (o, vid, e)
+  where
+    idDelim = char '\'' <|> char '`'
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Util.hs b/haddock-library/src/Documentation/Haddock/Parser/Util.hs
new file mode 100644
index 00000000..25dba2d5
--- /dev/null
+++ b/haddock-library/src/Documentation/Haddock/Parser/Util.hs
@@ -0,0 +1,61 @@
+-- |
+-- Module      :  Documentation.Haddock.Parser.Util
+-- Copyright   :  (c) Mateusz Kowalczyk 2013-2014,
+--                    Simon Hengel      2013
+-- License     :  BSD-like
+--
+-- Maintainer  :  haddock@projects.haskell.org
+-- Stability   :  experimental
+-- Portability :  portable
+--
+-- Various utility functions used by the parser.
+module Documentation.Haddock.Parser.Util where
+
+import           Control.Applicative
+import           Control.Monad (mfilter)
+import           Data.Attoparsec.ByteString.Char8 hiding (parse, take, endOfLine)
+import           Data.ByteString.Char8 (ByteString)
+import qualified Data.ByteString.Char8 as BS
+import           Prelude hiding (takeWhile)
+
+-- | Remove all leading and trailing whitespace
+strip :: String -> String
+strip = (\f -> f . f) $ dropWhile isSpace . reverse
+
+skipHorizontalSpace :: Parser ()
+skipHorizontalSpace = skipWhile (`elem` " \t\f\v\r")
+
+takeHorizontalSpace :: Parser BS.ByteString
+takeHorizontalSpace = takeWhile (`elem` " \t\f\v\r")
+
+makeLabeled :: (String -> Maybe String -> a) -> String -> a
+makeLabeled f input = case break isSpace $ removeEscapes $ strip input of
+  (uri, "")    -> f uri Nothing
+  (uri, label) -> f uri (Just $ dropWhile isSpace label)
+  where
+    -- As we don't parse these any further, we don't do any processing to the
+    -- string so we at least remove escape character here. Perhaps we should
+    -- actually be parsing the label at the very least?
+    removeEscapes "" = ""
+    removeEscapes ('\\':'\\':xs) = '\\' : removeEscapes xs
+    removeEscapes ('\\':xs) = removeEscapes xs
+    removeEscapes (x:xs) = x : removeEscapes xs
+
+takeUntil :: ByteString -> Parser ByteString
+takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome
+  where
+    end = BS.unpack end_
+
+    p :: (Bool, String) -> Char -> Maybe (Bool, String)
+    p acc c = case acc of
+      (True, _) -> Just (False, end)
+      (_, []) -> Nothing
+      (_, x:xs) | x == c -> Just (False, xs)
+      _ -> Just (c == '\\', end)
+
+    dropEnd = BS.reverse . BS.drop (length end) . BS.reverse
+    requireEnd = mfilter (BS.isSuffixOf end_)
+
+    gotSome xs
+      | BS.null xs = fail "didn't get any content"
+      | otherwise = return xs
diff --git a/haddock-library/src/Documentation/Haddock/Types.hs b/haddock-library/src/Documentation/Haddock/Types.hs
new file mode 100644
index 00000000..3e25d230
--- /dev/null
+++ b/haddock-library/src/Documentation/Haddock/Types.hs
@@ -0,0 +1,70 @@
+{-# LANGUAGE DeriveDataTypeable, DeriveFunctor, DeriveFoldable #-}
+{-# LANGUAGE DeriveTraversable, StandaloneDeriving #-}
+
+-- |
+-- Module      :  Documentation.Haddock.Types
+-- Copyright   :  (c) Simon Marlow      2003-2006,
+--                    David Waern       2006-2009,
+--                    Mateusz Kowalczyk 2013
+-- License     :  BSD-like
+--
+-- Maintainer  :  haddock@projects.haskellorg
+-- Stability   :  experimental
+-- Portability :  portable
+--
+-- Exposes documentation data types used for (some) of Haddock.
+module Documentation.Haddock.Types where
+
+import Data.Foldable
+import Data.Traversable
+
+instance Foldable Header where
+  foldMap f (Header _ a) = f a
+
+instance Traversable Header where
+  traverse f (Header l a) = Header l `fmap` f a
+
+data Hyperlink = Hyperlink
+  { hyperlinkUrl   :: String
+  , hyperlinkLabel :: Maybe String
+  } deriving (Eq, Show)
+
+
+data Picture = Picture
+  { pictureUri   :: String
+  , pictureTitle :: Maybe String
+  } deriving (Eq, Show)
+
+data Header id = Header
+  { headerLevel :: Int
+  , headerTitle :: id
+  } deriving Functor
+
+data Example = Example
+  { exampleExpression :: String
+  , exampleResult     :: [String]
+  } deriving (Eq, Show)
+
+data DocH mod id
+  = DocEmpty
+  | DocAppend (DocH mod id) (DocH mod id)
+  | DocString String
+  | DocParagraph (DocH mod id)
+  | DocIdentifier id
+  | DocIdentifierUnchecked mod
+  | DocModule String
+  | DocWarning (DocH mod id)
+  | DocEmphasis (DocH mod id)
+  | DocMonospaced (DocH mod id)
+  | DocBold (DocH mod id)
+  | DocUnorderedList [DocH mod id]
+  | DocOrderedList [DocH mod id]
+  | DocDefList [(DocH mod id, DocH mod id)]
+  | DocCodeBlock (DocH mod id)
+  | DocHyperlink Hyperlink
+  | DocPic Picture
+  | DocAName String
+  | DocProperty String
+  | DocExamples [Example]
+  | DocHeader (Header (DocH mod id))
+  deriving (Functor, Foldable, Traversable)
diff --git a/haddock-library/src/Documentation/Haddock/Utf8.hs b/haddock-library/src/Documentation/Haddock/Utf8.hs
new file mode 100644
index 00000000..3f75e53b
--- /dev/null
+++ b/haddock-library/src/Documentation/Haddock/Utf8.hs
@@ -0,0 +1,74 @@
+module Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) where
+import           Data.Bits ((.|.), (.&.), shiftL, shiftR)
+import qualified Data.ByteString as BS
+import           Data.Char (chr, ord)
+import           Data.Word (Word8)
+
+-- | Helper that encodes and packs a 'String' into a 'BS.ByteString'
+encodeUtf8 :: String -> BS.ByteString
+encodeUtf8 = BS.pack . encode
+
+-- | Helper that unpacks and decodes a 'BS.ByteString' into a 'String'
+decodeUtf8 :: BS.ByteString -> String
+decodeUtf8 = decode . BS.unpack
+
+-- Copy/pasted functions from Codec.Binary.UTF8.String for encoding/decoding
+-- | Character to use when 'encode' or 'decode' fail for a byte.
+replacementCharacter :: Char
+replacementCharacter = '\xfffd'
+
+-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
+encode :: String -> [Word8]
+encode = concatMap (map fromIntegral . go . ord)
+ where
+  go oc
+   | oc <= 0x7f       = [oc]
+
+   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+
+   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
+                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+
+-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
+decode :: [Word8] -> String
+decode [    ] = ""
+decode (c:cs)
+  | c < 0x80  = chr (fromEnum c) : decode cs
+  | c < 0xc0  = replacementCharacter : decode cs
+  | c < 0xe0  = multi1
+  | c < 0xf0  = multi_byte 2 0xf  0x800
+  | c < 0xf8  = multi_byte 3 0x7  0x10000
+  | c < 0xfc  = multi_byte 4 0x3  0x200000
+  | c < 0xfe  = multi_byte 5 0x1  0x4000000
+  | otherwise = replacementCharacter : decode cs
+  where
+    multi1 = case cs of
+      c1 : ds | c1 .&. 0xc0 == 0x80 ->
+        let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
+        in if d >= 0x000080 then toEnum d : decode ds
+                            else replacementCharacter : decode ds
+      _ -> replacementCharacter : decode cs
+
+    multi_byte :: Int -> Word8 -> Int -> String
+    multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
+      where
+        aux 0 rs acc
+          | overlong <= acc && acc <= 0x10ffff &&
+            (acc < 0xd800 || 0xdfff < acc)     &&
+            (acc < 0xfffe || 0xffff < acc)      = chr acc : decode rs
+          | otherwise = replacementCharacter : decode rs
+
+        aux n (r:rs) acc
+          | r .&. 0xc0 == 0x80 = aux (n-1) rs
+                               $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
+
+        aux _ rs     _ = replacementCharacter : decode rs