diff options
Diffstat (limited to 'haddock-library/src/Documentation/Haddock/Parser.hs')
| -rw-r--r-- | haddock-library/src/Documentation/Haddock/Parser.hs | 216 | 
1 files changed, 140 insertions, 76 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs index e8bc2761..b7ab85b0 100644 --- a/haddock-library/src/Documentation/Haddock/Parser.hs +++ b/haddock-library/src/Documentation/Haddock/Parser.hs @@ -1,8 +1,5 @@  {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE StandaloneDeriving #-} -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE UndecidableInstances #-} -{-# LANGUAGE IncoherentInstances #-} +{-# LANGUAGE ViewPatterns #-}  -- |  -- Module      :  Documentation.Haddock.Parser  -- Copyright   :  (c) Mateusz Kowalczyk 2013-2014, @@ -24,14 +21,14 @@ module Documentation.Haddock.Parser ( parseString, parseParas  import           Control.Applicative  import           Control.Arrow (first) -import           Control.Monad (void, mfilter) -import           Data.Attoparsec.ByteString.Char8 hiding (parse, take, endOfLine) +import           Control.Monad  import qualified Data.ByteString.Char8 as BS  import           Data.Char (chr, isAsciiUpper)  import           Data.List (stripPrefix, intercalate, unfoldr)  import           Data.Maybe (fromMaybe)  import           Data.Monoid  import           Documentation.Haddock.Doc +import           Documentation.Haddock.Parser.Monad hiding (take, endOfLine)  import           Documentation.Haddock.Parser.Util  import           Documentation.Haddock.Types  import           Documentation.Haddock.Utf8 @@ -81,7 +78,7 @@ overIdentifier f d = g d      g (DocExamples x) = DocExamples x      g (DocHeader (Header l x)) = DocHeader . Header l $ g x -parse :: Parser a -> BS.ByteString -> a +parse :: Parser a -> BS.ByteString -> (ParserState, a)  parse p = either err id . parseOnly (p <* endOfInput)    where      err = error . ("Haddock.Parser.parse: " ++) @@ -89,11 +86,21 @@ parse p = either err id . parseOnly (p <* endOfInput)  -- | Main entry point to the parser. Appends the newline character  -- to the input string.  parseParas :: String -- ^ String to parse -           -> DocH mod Identifier -parseParas = parse (p <* skipSpace) . encodeUtf8 . (++ "\n") +           -> MetaDoc mod Identifier +parseParas input = case parseParasState input of +  (state, a) -> MetaDoc { _meta = Meta { _version = parserStateSince state } +                        , _doc = a +                        } + +parseParasState :: String -> (ParserState, DocH mod Identifier) +parseParasState = parse (p <* skipSpace) . encodeUtf8 . (++ "\n")    where      p :: Parser (DocH mod Identifier) -    p = mconcat <$> paragraph `sepBy` many (skipHorizontalSpace *> "\n") +    p = docConcat <$> paragraph `sepBy` many (skipHorizontalSpace *> "\n") + +parseParagraphs :: String -> Parser (DocH mod Identifier) +parseParagraphs input = case parseParasState input of +  (state, a) -> setParserState state >> return a  -- | Parse a text paragraph. Actually just a wrapper over 'parseStringBS' which  -- drops leading whitespace and encodes the string to UTF8 first. @@ -101,19 +108,19 @@ parseString :: String -> DocH mod Identifier  parseString = parseStringBS . encodeUtf8 . dropWhile isSpace  parseStringBS :: BS.ByteString -> DocH mod Identifier -parseStringBS = parse p +parseStringBS = snd . parse p    where      p :: Parser (DocH mod Identifier) -    p = mconcat <$> many (monospace <|> anchor <|> identifier <|> moduleName -                          <|> picture <|> hyperlink <|> bold -                          <|> emphasis <|> encodedChar <|> string' -                          <|> skipSpecialChar) +    p = docConcat <$> many (monospace <|> anchor <|> identifier <|> moduleName +                            <|> picture <|> markdownImage <|> hyperlink <|> bold +                            <|> emphasis <|> encodedChar <|> string' +                            <|> skipSpecialChar)  -- | Parses and processes  -- <https://en.wikipedia.org/wiki/Numeric_character_reference Numeric character references>  -- --- >>> parseOnly encodedChar "A" --- Right (DocString "A") +-- >>> parseString "A" +-- DocString "A"  encodedChar :: Parser (DocH mod a)  encodedChar = "&#" *> c <* ";"    where @@ -145,16 +152,16 @@ skipSpecialChar = DocString . return <$> satisfy (`elem` specialChar)  -- | Emphasis parser.  -- --- >>> parseOnly emphasis "/Hello world/" --- Right (DocEmphasis (DocString "Hello world")) +-- >>> parseString "/Hello world/" +-- DocEmphasis (DocString "Hello world")  emphasis :: Parser (DocH mod Identifier)  emphasis = DocEmphasis . parseStringBS <$>    mfilter ('\n' `BS.notElem`) ("/" *> takeWhile1_ (/= '/') <* "/")  -- | Bold parser.  -- --- >>> parseOnly bold "__Hello world__" --- Right (DocBold (DocString "Hello world")) +-- >>> parseString "__Hello world__" +-- DocBold (DocString "Hello world")  bold :: Parser (DocH mod Identifier)  bold = DocBold . parseStringBS <$> disallowNewline ("__" *> takeUntil "__") @@ -176,19 +183,23 @@ takeWhile1_ = mfilter (not . BS.null) . takeWhile_  -- | Text anchors to allow for jumping around the generated documentation.  -- --- >>> parseOnly anchor "#Hello world#" --- Right (DocAName "Hello world") +-- >>> parseString "#Hello world#" +-- DocAName "Hello world"  anchor :: Parser (DocH mod a)  anchor = DocAName . decodeUtf8 <$>           disallowNewline ("#" *> takeWhile1_ (/= '#') <* "#")  -- | Monospaced strings.  -- --- >>> parseOnly monospace "@cruel@" --- Right (DocMonospaced (DocString "cruel")) +-- >>> parseString "@cruel@" +-- DocMonospaced (DocString "cruel")  monospace :: Parser (DocH mod Identifier) -monospace = DocMonospaced . parseStringBS <$> ("@" *> takeWhile1_ (/= '@') <* "@") +monospace = DocMonospaced . parseStringBS +            <$> ("@" *> takeWhile1_ (/= '@') <* "@") +-- | Module names: we try our reasonable best to only allow valid +-- Haskell module names, with caveat about not matching on technically +-- valid unicode symbols.  moduleName :: Parser (DocH mod a)  moduleName = DocModule <$> (char '"' *> modid <* char '"')    where @@ -204,26 +215,45 @@ moduleName = DocModule <$> (char '"' *> modid <* char '"')  -- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify  -- a title for the picture.  -- --- >>> parseOnly picture "<<hello.png>>" --- Right (DocPic (Picture {pictureUri = "hello.png", pictureTitle = Nothing})) --- >>> parseOnly picture "<<hello.png world>>" --- Right (DocPic (Picture {pictureUri = "hello.png", pictureTitle = Just "world"})) +-- >>> parseString "<<hello.png>>" +-- DocPic (Picture {pictureUri = "hello.png", pictureTitle = Nothing}) +-- >>> parseString "<<hello.png world>>" +-- DocPic (Picture {pictureUri = "hello.png", pictureTitle = Just "world"})  picture :: Parser (DocH mod a)  picture = DocPic . makeLabeled Picture . decodeUtf8            <$> disallowNewline ("<<" *> takeUntil ">>") +markdownImage :: Parser (DocH mod a) +markdownImage = fromHyperlink <$> ("!" *> linkParser) +  where +    fromHyperlink (Hyperlink url label) = DocPic (Picture url label) +  -- | Paragraph parser, called by 'parseParas'.  paragraph :: Parser (DocH mod Identifier) -paragraph = examples <|> skipSpace *> (list <|> birdtracks <|> codeblock -                                       <|> property <|> header -                                       <|> textParagraph) +paragraph = examples <|> skipSpace *> ( +      since +  <|> unorderedList +  <|> orderedList +  <|> birdtracks +  <|> codeblock +  <|> property +  <|> header +  <|> textParagraphThatStartsWithMarkdownLink +  <|> definitionList +  <|> docParagraph <$> textParagraph +  ) + +since :: Parser (DocH mod a) +since = ("@since " *> version <* skipHorizontalSpace <* endOfLine) >>= setSince >> return DocEmpty +  where +    version = decimal `sepBy1'` "."  -- | Headers inside the comment denoted with @=@ signs, up to 6 levels  -- deep.  -- --- >>> parseOnly header "= Hello" +-- >>> snd <$> parseOnly header "= Hello"  -- Right (DocHeader (Header {headerLevel = 1, headerTitle = DocString "Hello"})) --- >>> parseOnly header "== World" +-- >>> snd <$> parseOnly header "== World"  -- Right (DocHeader (Header {headerLevel = 2, headerTitle = DocString "World"}))  header :: Parser (DocH mod Identifier)  header = do @@ -231,26 +261,37 @@ header = do        pser = foldl1 (<|>) psers    delim <- decodeUtf8 <$> pser    line <- skipHorizontalSpace *> nonEmptyLine >>= return . parseString -  rest <- paragraph <|> return mempty -  return $ DocHeader (Header (length delim) line) <> rest +  rest <- paragraph <|> return DocEmpty +  return $ DocHeader (Header (length delim) line) `docAppend` rest  textParagraph :: Parser (DocH mod Identifier) -textParagraph = docParagraph . parseString . intercalate "\n" <$> many1 nonEmptyLine +textParagraph = parseString . intercalate "\n" <$> many1 nonEmptyLine --- | List parser, called by 'paragraph'. -list :: Parser (DocH mod Identifier) -list = DocUnorderedList <$> unorderedList -       <|> DocOrderedList <$> orderedList -       <|> DocDefList <$> definitionList +textParagraphThatStartsWithMarkdownLink :: Parser (DocH mod Identifier) +textParagraphThatStartsWithMarkdownLink = docParagraph <$> (docAppend <$> markdownLink <*> optionalTextParagraph) +  where +    optionalTextParagraph :: Parser (DocH mod Identifier) +    optionalTextParagraph = (docAppend <$> whitespace <*> textParagraph) <|> pure DocEmpty + +    whitespace :: Parser (DocH mod a) +    whitespace = DocString <$> (f <$> takeHorizontalSpace <*> optional "\n") +      where +        f :: BS.ByteString -> Maybe BS.ByteString -> String +        f xs (fromMaybe "" -> x) +          | BS.null (xs <> x) = "" +          | otherwise = " "  -- | Parses unordered (bullet) lists. -unorderedList :: Parser [DocH mod Identifier] -unorderedList = ("*" <|> "-") *> innerList unorderedList +unorderedList :: Parser (DocH mod Identifier) +unorderedList = DocUnorderedList <$> p +  where +    p = ("*" <|> "-") *> innerList p  -- | Parses ordered lists (numbered or dashed). -orderedList :: Parser [DocH mod Identifier] -orderedList = (paren <|> dot) *> innerList orderedList +orderedList :: Parser (DocH mod Identifier) +orderedList = DocOrderedList <$> p    where +    p = (paren <|> dot) *> innerList p      dot = (decimal :: Parser Int) <* "."      paren = "(" *> decimal <* ")" @@ -265,19 +306,21 @@ innerList item = do    (cs, items) <- more item    let contents = docParagraph . parseString . dropNLs . unlines $ c : cs    return $ case items of -    Left p -> [contents <> p] +    Left p -> [contents `docAppend` p]      Right i -> contents : i  -- | Parses definition lists. -definitionList :: Parser [(DocH mod Identifier, DocH mod Identifier)] -definitionList = do -  label <- "[" *> (parseStringBS <$> takeWhile1 (`notElem` ("]\n"::String))) <* "]" -  c <- takeLine -  (cs, items) <- more definitionList -  let contents = parseString . dropNLs . unlines $ c : cs -  return $ case items of -    Left p -> [(label, contents <> p)] -    Right i -> (label, contents) : i +definitionList :: Parser (DocH mod Identifier) +definitionList = DocDefList <$> p +  where +    p = do +      label <- "[" *> (parseStringBS <$> takeWhile1 (`notElem` ("]\n" :: String))) <* ("]" <* optional ":") +      c <- takeLine +      (cs, items) <- more p +      let contents = parseString . dropNLs . unlines $ c : cs +      return $ case items of +        Left x -> [(label, contents `docAppend` x)] +        Right i -> (label, contents) : i  -- | Drops all trailing newlines.  dropNLs :: String -> String @@ -291,12 +334,12 @@ more :: Monoid a => Parser a  more item = innerParagraphs <|> moreListItems item              <|> moreContent item <|> pure ([], Right mempty) --- | Use by 'innerList' and 'definitionList' to parse any nested paragraphs. +-- | Used by 'innerList' and 'definitionList' to parse any nested paragraphs.  innerParagraphs :: Parser ([String], Either (DocH mod Identifier) a)  innerParagraphs = (,) [] . Left <$> ("\n" *> indentedParagraphs) --- | Attemps to fetch the next list if possibly. Used by 'innerList' and --- 'definitionList' to recursivly grab lists that aren't separated by a whole +-- | Attempts to fetch the next list if possibly. Used by 'innerList' and +-- 'definitionList' to recursively grab lists that aren't separated by a whole  -- paragraph.  moreListItems :: Parser a                -> Parser ([String], Either (DocH mod Identifier) a) @@ -308,10 +351,10 @@ moreContent :: Monoid a => Parser a              -> Parser ([String], Either (DocH mod Identifier) a)  moreContent item = first . (:) <$> nonEmptyLine <*> more item --- | Runs the 'parseParas' parser on an indented paragraph. +-- | Parses an indented paragraph.  -- The indentation is 4 spaces.  indentedParagraphs :: Parser (DocH mod Identifier) -indentedParagraphs = parseParas . concat <$> dropFrontOfPara "    " +indentedParagraphs = (concat <$> dropFrontOfPara "    ") >>= parseParagraphs  -- | Grab as many fully indented paragraphs as we can.  dropFrontOfPara :: Parser BS.ByteString -> Parser [String] @@ -399,7 +442,7 @@ endOfLine = void "\n" <|> endOfInput  -- | Property parser.  -- --- >>> parseOnly property "prop> hello world" +-- >>> snd <$> parseOnly property "prop> hello world"  -- Right (DocProperty "hello world")  property :: Parser (DocH mod a)  property = DocProperty . strip . decodeUtf8 <$> ("prop>" *> takeWhile1 (/= '\n')) @@ -442,11 +485,32 @@ codeblock =            | isNewline && isSpace c = Just isNewline            | otherwise = Just $ c == '\n' --- | Parses links that were specifically marked as such.  hyperlink :: Parser (DocH mod a)  hyperlink = DocHyperlink . makeLabeled Hyperlink . decodeUtf8                <$> disallowNewline ("<" *> takeUntil ">")              <|> autoUrl +            <|> markdownLink + +markdownLink :: Parser (DocH mod a) +markdownLink = DocHyperlink <$> linkParser + +linkParser :: Parser Hyperlink +linkParser = flip Hyperlink <$> label <*> (whitespace *> url) +  where +    label :: Parser (Maybe String) +    label = Just . strip . decode <$> ("[" *> takeUntil "]") + +    whitespace :: Parser () +    whitespace = skipHorizontalSpace <* optional ("\n" *> skipHorizontalSpace) + +    url :: Parser String +    url = rejectWhitespace (decode <$> ("(" *> takeUntil ")")) + +    rejectWhitespace :: MonadPlus m => m String -> m String +    rejectWhitespace = mfilter (all (not . isSpace)) + +    decode :: BS.ByteString -> String +    decode = removeEscapes . decodeUtf8  -- | Looks for URL-like things to automatically hyperlink even if they  -- weren't marked as links. @@ -456,32 +520,32 @@ autoUrl = mkLink <$> url      url = mappend <$> ("http://" <|> "https://" <|> "ftp://") <*> takeWhile1 (not . isSpace)      mkLink :: BS.ByteString -> DocH mod a      mkLink s = case unsnoc s of -      Just (xs, x) | x `elem` (",.!?"::String) -> DocHyperlink (Hyperlink (decodeUtf8 xs) Nothing) <> DocString [x] +      Just (xs, x) | x `elem` (",.!?" :: String) -> DocHyperlink (Hyperlink (decodeUtf8 xs) Nothing) `docAppend` DocString [x]        _ -> DocHyperlink (Hyperlink (decodeUtf8 s) Nothing)  -- | Parses strings between identifier delimiters. Consumes all input that it  -- deems to be valid in an identifier. Note that it simply blindly consumes  -- characters and does no actual validation itself.  parseValid :: Parser String -parseValid = do -  vs' <- many' $ utf8String "⋆" <|> return <$> idChar -  let vs = concat vs' -  c <- peekChar -  case c of -    Just '`' -> return vs -    Just '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> parseValid) -                 <|> return vs -    _ -> fail "outofvalid" +parseValid = p some    where      idChar = satisfy (`elem` ("_.!#$%&*+/<=>?@\\|-~:^"::String))               <|> digit <|> letter_ascii +    p p' = do +      vs' <- p' $ utf8String "⋆" <|> return <$> idChar +      let vs = concat vs' +      c <- peekChar' +      case c of +        '`' -> return vs +        '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> p many') <|> return vs +        _ -> fail "outofvalid"  -- | Parses UTF8 strings from ByteString streams.  utf8String :: String -> Parser String  utf8String x = decodeUtf8 <$> string (encodeUtf8 x) --- | Parses identifiers with help of 'parseValid'. Asks GHC for 'String' from the --- string it deems valid. +-- | Parses identifiers with help of 'parseValid'. Asks GHC for +-- 'String' from the string it deems valid.  identifier :: Parser (DocH mod Identifier)  identifier = do    o <- idDelim  | 
