diff options
Diffstat (limited to 'src/Haddock/Parser.hs')
-rw-r--r-- | src/Haddock/Parser.hs | 260 |
1 files changed, 147 insertions, 113 deletions
diff --git a/src/Haddock/Parser.hs b/src/Haddock/Parser.hs index ece9291f..1489ae84 100644 --- a/src/Haddock/Parser.hs +++ b/src/Haddock/Parser.hs @@ -13,7 +13,10 @@ -- Stability : experimental -- Portability : portable -module Haddock.Parser (parseString, parseParas, parseStringMaybe, parseParasMaybe) where +module Haddock.Parser ( parseString, parseParas + , overIdentifier, toRegular + , takeUntil + ) where import Prelude hiding (takeWhile) import Control.Arrow (first) @@ -25,25 +28,51 @@ import Data.Char (chr, isAsciiUpper) import Data.List (stripPrefix, intercalate, unfoldr) import Data.Maybe (fromMaybe) import Data.Monoid -import DynFlags -import FastString (mkFastString) import Haddock.Doc -import Haddock.Types -import Lexer (mkPState, unP, ParseResult(POk)) -import Parser (parseIdentifier) -import RdrName -import SrcLoc (mkRealSrcLoc, unLoc) -import StringBuffer (stringToStringBuffer) +import Haddock.Types hiding (Doc) import Haddock.Utf8 -import Haddock.Parser.Util - -{-# DEPRECATED parseParasMaybe "use `parseParas` instead" #-} -parseParasMaybe :: DynFlags -> String -> Maybe (Doc RdrName) -parseParasMaybe d = Just . parseParas d - -{-# DEPRECATED parseStringMaybe "use `parseString` instead" #-} -parseStringMaybe :: DynFlags -> String -> Maybe (Doc RdrName) -parseStringMaybe d = Just . parseString d +import Data.ByteString.Char8 (ByteString) + +-- | Identifier string surrounded with opening and closing quotes/backticks. +type IString = (Char, String, Char) + +-- | Drops the quotes/backticks around all identifiers, as if they +-- were valid but still 'String's. +toRegular :: DocH mod IString -> DocH mod String +toRegular = fmap (\(_, x, _) -> x) + +-- | Maps over 'DocIdentifier's over 'String' with potentially failing +-- conversion using user-supplied function. If the conversion fails, +-- the identifier is deemed to not be valid and is treated as a +-- regular string. +overIdentifier :: (String -> Maybe a) + -> DocH mod IString + -> DocH mod a +overIdentifier f d = g d + where + g (DocIdentifier (o, x, e)) = case f x of + Nothing -> DocString $ o : x ++ [e] + Just x' -> DocIdentifier x' + g DocEmpty = DocEmpty + g (DocAppend x x') = DocAppend (g x) (g x') + g (DocString x) = DocString x + g (DocParagraph x) = DocParagraph $ g x + g (DocIdentifierUnchecked x) = DocIdentifierUnchecked x + g (DocModule x) = DocModule x + g (DocWarning x) = DocWarning $ g x + g (DocEmphasis x) = DocEmphasis $ g x + g (DocMonospaced x) = DocMonospaced $ g x + g (DocBold x) = DocBold $ g x + g (DocUnorderedList x) = DocUnorderedList $ fmap g x + g (DocOrderedList x) = DocOrderedList $ fmap g x + g (DocDefList x) = DocDefList $ fmap (\(y, z) -> (g y, g z)) x + g (DocCodeBlock x) = DocCodeBlock $ g x + g (DocHyperlink x) = DocHyperlink x + g (DocPic x) = DocPic x + g (DocAName x) = DocAName x + g (DocProperty x) = DocProperty x + g (DocExamples x) = DocExamples x + g (DocHeader (Header l x)) = DocHeader . Header l $ g x parse :: Parser a -> BS.ByteString -> a parse p = either err id . parseOnly (p <* endOfInput) @@ -52,33 +81,32 @@ parse p = either err id . parseOnly (p <* endOfInput) -- | Main entry point to the parser. Appends the newline character -- to the input string. -parseParas :: DynFlags - -> String -- ^ String to parse - -> Doc RdrName -parseParas d = parse (p <* skipSpace) . encodeUtf8 . (++ "\n") +parseParas :: String -- ^ String to parse + -> DocH mod IString +parseParas = parse (p <* skipSpace) . encodeUtf8 . (++ "\n") where - p :: Parser (Doc RdrName) - p = mconcat <$> paragraph d `sepBy` many (skipHorizontalSpace *> "\n") + p :: Parser (DocH mod IString) + p = mconcat <$> paragraph `sepBy` many (skipHorizontalSpace *> "\n") -- | Parse a text paragraph. Actually just a wrapper over 'parseStringBS' which -- drops leading whitespace and encodes the string to UTF8 first. -parseString :: DynFlags -> String -> Doc RdrName -parseString d = parseStringBS d . encodeUtf8 . dropWhile isSpace +parseString :: String -> DocH mod IString +parseString = parseStringBS . encodeUtf8 . dropWhile isSpace -parseStringBS :: DynFlags -> BS.ByteString -> Doc RdrName -parseStringBS d = parse p +parseStringBS :: BS.ByteString -> DocH mod IString +parseStringBS = parse p where - p :: Parser (Doc RdrName) - p = mconcat <$> many (monospace d <|> anchor <|> identifier d - <|> moduleName <|> picture <|> hyperlink <|> autoUrl <|> bold d - <|> emphasis d <|> encodedChar <|> string' <|> skipSpecialChar) + p :: Parser (DocH mod IString) + p = mconcat <$> many (monospace <|> anchor <|> identifier + <|> moduleName <|> picture <|> hyperlink <|> autoUrl <|> bold + <|> emphasis <|> encodedChar <|> string' <|> skipSpecialChar) -- | Parses and processes -- <https://en.wikipedia.org/wiki/Numeric_character_reference Numeric character references> -- -- >>> parseOnly encodedChar "ABC" -- Right (DocString "ABC") -encodedChar :: Parser (Doc a) +encodedChar :: Parser (DocH mod a) encodedChar = "&#" *> c <* ";" where c = DocString . return . chr <$> num @@ -91,7 +119,7 @@ specialChar = "_/<@\"&'`" -- | Plain, regular parser for text. Called as one of the last parsers -- to ensure that we have already given a chance to more meaningful parsers -- before capturing their characers. -string' :: Parser (Doc a) +string' :: Parser (DocH mod a) string' = DocString . unescape . decodeUtf8 <$> takeWhile1_ (`notElem` specialChar) where unescape "" = "" @@ -101,23 +129,23 @@ string' = DocString . unescape . decodeUtf8 <$> takeWhile1_ (`notElem` specialCh -- | Skips a single special character and treats it as a plain string. -- This is done to skip over any special characters belonging to other -- elements but which were not deemed meaningful at their positions. -skipSpecialChar :: Parser (Doc a) +skipSpecialChar :: Parser (DocH mod a) skipSpecialChar = DocString . return <$> satisfy (`elem` specialChar) -- | Emphasis parser. -- -- >>> parseOnly emphasis "/Hello world/" -- Right (DocEmphasis (DocString "Hello world")) -emphasis :: DynFlags -> Parser (Doc RdrName) -emphasis d = DocEmphasis . parseStringBS d <$> +emphasis :: Parser (DocH mod IString) +emphasis = DocEmphasis . parseStringBS <$> mfilter ('\n' `BS.notElem`) ("/" *> takeWhile1_ (/= '/') <* "/") -- | Bold parser. -- -- >>> parseOnly bold "__Hello world__" -- Right (DocBold (DocString "Hello world")) -bold :: DynFlags -> Parser (Doc RdrName) -bold d = DocBold . parseStringBS d <$> disallowNewline ("__" *> takeUntil "__") +bold :: Parser (DocH mod IString) +bold = DocBold . parseStringBS <$> disallowNewline ("__" *> takeUntil "__") disallowNewline :: Parser BS.ByteString -> Parser BS.ByteString disallowNewline = mfilter ('\n' `BS.notElem`) @@ -139,17 +167,17 @@ takeWhile1_ = mfilter (not . BS.null) . takeWhile_ -- -- >>> parseOnly anchor "#Hello world#" -- Right (DocAName "Hello world") -anchor :: Parser (Doc a) +anchor :: Parser (DocH mod a) anchor = DocAName . decodeUtf8 <$> ("#" *> takeWhile1 (`notElem` "#\n") <* "#") -- | Monospaced strings. -- --- >>> parseOnly (monospace dynflags) "@cruel@" +-- >>> parseOnly monospace "@cruel@" -- Right (DocMonospaced (DocString "cruel")) -monospace :: DynFlags -> Parser (Doc RdrName) -monospace d = DocMonospaced . parseStringBS d <$> ("@" *> takeWhile1_ (/= '@') <* "@") +monospace :: Parser (DocH mod IString) +monospace = DocMonospaced . parseStringBS <$> ("@" *> takeWhile1_ (/= '@') <* "@") -moduleName :: Parser (Doc a) +moduleName :: Parser (DocH mod a) moduleName = DocModule <$> (char '"' *> modid <* char '"') where modid = intercalate "." <$> conid `sepBy1` "." @@ -167,41 +195,41 @@ moduleName = DocModule <$> (char '"' *> modid <* char '"') -- Right (DocPic (Picture "hello.png" Nothing)) -- >>> parseOnly picture "<<hello.png world>>" -- Right (DocPic (Picture "hello.png" (Just "world"))) -picture :: Parser (Doc a) +picture :: Parser (DocH mod a) picture = DocPic . makeLabeled Picture . decodeUtf8 <$> disallowNewline ("<<" *> takeUntil ">>") -- | Paragraph parser, called by 'parseParas'. -paragraph :: DynFlags -> Parser (Doc RdrName) -paragraph d = examples <|> skipSpace *> (list d <|> birdtracks <|> codeblock d - <|> property <|> header d - <|> textParagraph d) +paragraph :: Parser (DocH mod IString) +paragraph = examples <|> skipSpace *> (list <|> birdtracks <|> codeblock + <|> property <|> header + <|> textParagraph) -header :: DynFlags -> Parser (Doc RdrName) -header d = do +header :: Parser (DocH mod IString) +header = do let psers = map (string . encodeUtf8 . concat . flip replicate "=") [6, 5 .. 1] pser = foldl1 (<|>) psers delim <- decodeUtf8 <$> pser - line <- skipHorizontalSpace *> nonEmptyLine >>= return . parseString d - rest <- paragraph d <|> return mempty + line <- skipHorizontalSpace *> nonEmptyLine >>= return . parseString + rest <- paragraph <|> return mempty return $ docAppend (DocParagraph (DocHeader (Header (length delim) line))) rest -textParagraph :: DynFlags -> Parser (Doc RdrName) -textParagraph d = docParagraph . parseString d . intercalate "\n" <$> many1 nonEmptyLine +textParagraph :: Parser (DocH mod IString) +textParagraph = docParagraph . parseString . intercalate "\n" <$> many1 nonEmptyLine -- | List parser, called by 'paragraph'. -list :: DynFlags -> Parser (Doc RdrName) -list d = DocUnorderedList <$> unorderedList d - <|> DocOrderedList <$> orderedList d - <|> DocDefList <$> definitionList d +list :: Parser (DocH mod IString) +list = DocUnorderedList <$> unorderedList + <|> DocOrderedList <$> orderedList + <|> DocDefList <$> definitionList -- | Parses unordered (bullet) lists. -unorderedList :: DynFlags -> Parser [Doc RdrName] -unorderedList d = ("*" <|> "-") *> innerList (unorderedList d) d +unorderedList :: Parser [DocH mod IString] +unorderedList = ("*" <|> "-") *> innerList unorderedList -- | Parses ordered lists (numbered or dashed). -orderedList :: DynFlags -> Parser [Doc RdrName] -orderedList d = (paren <|> dot) *> innerList (orderedList d) d +orderedList :: Parser [DocH mod IString] +orderedList = (paren <|> dot) *> innerList orderedList where dot = (decimal :: Parser Int) <* "." paren = "(" *> decimal <* ")" @@ -210,23 +238,23 @@ orderedList d = (paren <|> dot) *> innerList (orderedList d) d -- list entry and recursively collecting any further lists in the -- same paragraph. Usually used as -- --- > someListFunction dynflags = listBeginning *> innerList someListFunction dynflags -innerList :: Parser [Doc RdrName] -> DynFlags -> Parser [Doc RdrName] -innerList item d = do +-- > someListFunction = listBeginning *> innerList someListFunction +innerList :: Parser [DocH mod IString] -> Parser [DocH mod IString] +innerList item = do c <- takeLine - (cs, items) <- more item d - let contents = docParagraph . parseString d . dropNLs . unlines $ c : cs + (cs, items) <- more item + let contents = docParagraph . parseString . dropNLs . unlines $ c : cs return $ case items of Left p -> [contents `joinPara` p] Right i -> contents : i -- | Parses definition lists. -definitionList :: DynFlags -> Parser [(Doc RdrName, Doc RdrName)] -definitionList d = do - label <- "[" *> (parseStringBS d <$> takeWhile1 (`notElem` "]\n")) <* "]" +definitionList :: Parser [(DocH mod IString, DocH mod IString)] +definitionList = do + label <- "[" *> (parseStringBS <$> takeWhile1 (`notElem` "]\n")) <* "]" c <- takeLine - (cs, items) <- more (definitionList d) d - let contents = parseString d . dropNLs . unlines $ c : cs + (cs, items) <- more definitionList + let contents = parseString . dropNLs . unlines $ c : cs return $ case items of Left p -> [(label, contents `joinPara` p)] Right i -> (label, contents) : i @@ -239,7 +267,7 @@ definitionList d = do -- rather than -- -- @DocAppend (DocParagraph …) …@ -joinPara :: Doc id -> Doc id -> Doc id +joinPara :: DocH mod id -> DocH mod id -> DocH mod id joinPara (DocParagraph p) c = docParagraph $ docAppend p c joinPara d p = docAppend d p @@ -250,33 +278,32 @@ dropNLs = reverse . dropWhile (== '\n') . reverse -- | Main worker for 'innerList' and 'definitionList'. -- We need the 'Either' here to be able to tell in the respective functions -- whether we're dealing with the next list or a nested paragraph. -more :: Monoid a => Parser a -> DynFlags - -> Parser ([String], Either (Doc RdrName) a) -more item d = innerParagraphs d <|> moreListItems item - <|> moreContent item d <|> pure ([], Right mempty) +more :: Monoid a => Parser a + -> Parser ([String], Either (DocH mod IString) a) +more item = innerParagraphs <|> moreListItems item + <|> moreContent item <|> pure ([], Right mempty) -- | Use by 'innerList' and 'definitionList' to parse any nested paragraphs. -innerParagraphs :: DynFlags - -> Parser ([String], Either (Doc RdrName) a) -innerParagraphs d = (,) [] . Left <$> ("\n" *> indentedParagraphs d) +innerParagraphs :: Parser ([String], Either (DocH mod IString) a) +innerParagraphs = (,) [] . Left <$> ("\n" *> indentedParagraphs) -- | Attemps to fetch the next list if possibly. Used by 'innerList' and -- 'definitionList' to recursivly grab lists that aren't separated by a whole -- paragraph. moreListItems :: Parser a - -> Parser ([String], Either (Doc RdrName) a) + -> Parser ([String], Either (DocH mod IString) a) moreListItems item = (,) [] . Right <$> (skipSpace *> item) -- | Helper for 'innerList' and 'definitionList' which simply takes -- a line of text and attempts to parse more list content with 'more'. -moreContent :: Monoid a => Parser a -> DynFlags - -> Parser ([String], Either (Doc RdrName) a) -moreContent item d = first . (:) <$> nonEmptyLine <*> more item d +moreContent :: Monoid a => Parser a + -> Parser ([String], Either (DocH mod IString) a) +moreContent item = first . (:) <$> nonEmptyLine <*> more item -- | Runs the 'parseParas' parser on an indented paragraph. -- The indentation is 4 spaces. -indentedParagraphs :: DynFlags -> Parser (Doc RdrName) -indentedParagraphs d = parseParas d . concat <$> dropFrontOfPara " " +indentedParagraphs :: Parser (DocH mod IString) +indentedParagraphs = parseParas . concat <$> dropFrontOfPara " " -- | Grab as many fully indented paragraphs as we can. dropFrontOfPara :: Parser BS.ByteString -> Parser [String] @@ -303,7 +330,7 @@ takeNonEmptyLine :: Parser String takeNonEmptyLine = do (++ "\n") . decodeUtf8 <$> (takeWhile1 (/= '\n') >>= nonSpace) <* "\n" -birdtracks :: Parser (Doc a) +birdtracks :: Parser (DocH mod a) birdtracks = DocCodeBlock . DocString . intercalate "\n" . stripSpace <$> many1 line where line = skipHorizontalSpace *> ">" *> takeLine @@ -317,7 +344,7 @@ stripSpace = fromMaybe <*> mapM strip' -- | Parses examples. Examples are a paragraph level entitity (separated by an empty line). -- Consecutive examples are accepted. -examples :: Parser (Doc a) +examples :: Parser (DocH mod a) examples = DocExamples <$> (many (skipHorizontalSpace *> "\n") *> go) where go :: Parser [Example] @@ -360,15 +387,15 @@ endOfLine = void "\n" <|> endOfInput -- -- >>> parseOnly property "prop> hello world" -- Right (DocProperty "hello world") -property :: Parser (Doc a) +property :: Parser (DocH mod a) property = DocProperty . strip . decodeUtf8 <$> ("prop>" *> takeWhile1 (/= '\n')) -- | -- Paragraph level codeblock. Anything between the two delimiting @ is parsed -- for markup. -codeblock :: DynFlags -> Parser (Doc RdrName) -codeblock d = - DocCodeBlock . parseStringBS d . dropSpaces +codeblock :: Parser (DocH mod IString) +codeblock = + DocCodeBlock . parseStringBS . dropSpaces <$> ("@" *> skipHorizontalSpace *> "\n" *> block' <* "@") where dropSpaces xs = @@ -386,7 +413,7 @@ codeblock d = -- text which we need to decide whether we actually want to be dropping -- anything at all. splitByNl = unfoldr (\case '\n':s -> Just (span (/= '\n') s) - _ -> Nothing) + _ -> Nothing) . ('\n' :) dropSpace "" = Just "" @@ -400,16 +427,16 @@ codeblock d = | isNewline && isSpace c = Just isNewline | otherwise = Just $ c == '\n' -hyperlink :: Parser (Doc a) +hyperlink :: Parser (DocH mod a) hyperlink = DocHyperlink . makeLabeled Hyperlink . decodeUtf8 <$> disallowNewline ("<" *> takeUntil ">") <|> autoUrl -autoUrl :: Parser (Doc a) +autoUrl :: Parser (DocH mod a) autoUrl = mkLink <$> url where url = mappend <$> ("http://" <|> "https://" <|> "ftp://") <*> takeWhile1 (not . isSpace) - mkLink :: BS.ByteString -> Doc a + mkLink :: BS.ByteString -> DocH mod a mkLink s = case BS.unsnoc s of Just (xs, x) | x `elem` ",.!?" -> DocHyperlink (Hyperlink (decodeUtf8 xs) Nothing) <> DocString [x] _ -> DocHyperlink (Hyperlink (decodeUtf8 s) Nothing) @@ -435,28 +462,16 @@ parseValid = do utf8String :: String -> Parser String utf8String x = decodeUtf8 <$> string (encodeUtf8 x) --- | Parses identifiers with help of 'parseValid'. Asks GHC for 'RdrName' from the +-- | Parses identifiers with help of 'parseValid'. Asks GHC for 'String' from the -- string it deems valid. -identifier :: DynFlags -> Parser (Doc RdrName) -identifier dflags = do +identifier :: Parser (DocH mod IString) +identifier = do o <- idDelim vid <- parseValid e <- idDelim - return $ validIdentifier o vid e + return $ DocIdentifier (o, vid, e) where idDelim = char '\'' <|> char '`' - validIdentifier o ident e = case parseIdent ident of - Just identName -> DocIdentifier identName - Nothing -> DocString $ o : ident ++ [e] - - parseIdent :: String -> Maybe RdrName - parseIdent str0 = - let buffer = stringToStringBuffer str0 - realSrcLc = mkRealSrcLoc (mkFastString "<unknown file>") 0 0 - pstate = mkPState dflags buffer realSrcLc - in case unP parseIdentifier pstate of - POk _ name -> Just (unLoc name) - _ -> Nothing -- | Remove all leading and trailing whitespace strip :: String -> String @@ -480,3 +495,22 @@ makeLabeled f input = case break isSpace $ removeEscapes $ strip input of removeEscapes ('\\':'\\':xs) = '\\' : removeEscapes xs removeEscapes ('\\':xs) = removeEscapes xs removeEscapes (x:xs) = x : removeEscapes xs + +takeUntil :: ByteString -> Parser ByteString +takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome + where + end = BS.unpack end_ + + p :: (Bool, String) -> Char -> Maybe (Bool, String) + p acc c = case acc of + (True, _) -> Just (False, end) + (_, []) -> Nothing + (_, x:xs) | x == c -> Just (False, xs) + _ -> Just (c == '\\', end) + + dropEnd = BS.reverse . BS.drop (length end) . BS.reverse + requireEnd = mfilter (BS.isSuffixOf end_) + + gotSome xs + | BS.null xs = fail "didn't get any content" + | otherwise = return xs |