diff options
author | Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk> | 2014-05-05 09:01:03 +0200 |
---|---|---|
committer | Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk> | 2014-05-05 11:00:41 +0200 |
commit | cc269e6b0b615e9e237c35a985e4ace7b9ab0dd9 (patch) | |
tree | f0264138c81909151f9724c1f02f7bf8b30803cb /src/Haddock/Parser.hs | |
parent | 7ac2d0f2d31c2e1c7ede09828f3d5ba5626bd0d4 (diff) |
Move parser + parser tests out to own package.
We move some types out that are necessary as well and then
re-export and specialise them in the core Haddock.
Reason for moving out spec tests is that if we're working on the parser,
we can simply work on that and we can ignore the rest of Haddock. The
downside is that it's a little inconvenient if at the end of the day we
want to see that everything passes.
Diffstat (limited to 'src/Haddock/Parser.hs')
-rw-r--r-- | src/Haddock/Parser.hs | 529 |
1 files changed, 30 insertions, 499 deletions
diff --git a/src/Haddock/Parser.hs b/src/Haddock/Parser.hs index 1489ae84..720f442b 100644 --- a/src/Haddock/Parser.hs +++ b/src/Haddock/Parser.hs @@ -13,504 +13,35 @@ -- Stability : experimental -- Portability : portable -module Haddock.Parser ( parseString, parseParas - , overIdentifier, toRegular - , takeUntil +module Haddock.Parser ( module Documentation.Haddock.Parser + , parseParasMaybe + , parseStringMaybe + , parseIdent ) where -import Prelude hiding (takeWhile) -import Control.Arrow (first) -import Control.Monad (void, mfilter) -import Control.Applicative -import Data.Attoparsec.ByteString.Char8 hiding (parse, take, endOfLine) -import qualified Data.ByteString.Char8 as BS -import Data.Char (chr, isAsciiUpper) -import Data.List (stripPrefix, intercalate, unfoldr) -import Data.Maybe (fromMaybe) -import Data.Monoid -import Haddock.Doc -import Haddock.Types hiding (Doc) -import Haddock.Utf8 -import Data.ByteString.Char8 (ByteString) - --- | Identifier string surrounded with opening and closing quotes/backticks. -type IString = (Char, String, Char) - --- | Drops the quotes/backticks around all identifiers, as if they --- were valid but still 'String's. -toRegular :: DocH mod IString -> DocH mod String -toRegular = fmap (\(_, x, _) -> x) - --- | Maps over 'DocIdentifier's over 'String' with potentially failing --- conversion using user-supplied function. If the conversion fails, --- the identifier is deemed to not be valid and is treated as a --- regular string. -overIdentifier :: (String -> Maybe a) - -> DocH mod IString - -> DocH mod a -overIdentifier f d = g d - where - g (DocIdentifier (o, x, e)) = case f x of - Nothing -> DocString $ o : x ++ [e] - Just x' -> DocIdentifier x' - g DocEmpty = DocEmpty - g (DocAppend x x') = DocAppend (g x) (g x') - g (DocString x) = DocString x - g (DocParagraph x) = DocParagraph $ g x - g (DocIdentifierUnchecked x) = DocIdentifierUnchecked x - g (DocModule x) = DocModule x - g (DocWarning x) = DocWarning $ g x - g (DocEmphasis x) = DocEmphasis $ g x - g (DocMonospaced x) = DocMonospaced $ g x - g (DocBold x) = DocBold $ g x - g (DocUnorderedList x) = DocUnorderedList $ fmap g x - g (DocOrderedList x) = DocOrderedList $ fmap g x - g (DocDefList x) = DocDefList $ fmap (\(y, z) -> (g y, g z)) x - g (DocCodeBlock x) = DocCodeBlock $ g x - g (DocHyperlink x) = DocHyperlink x - g (DocPic x) = DocPic x - g (DocAName x) = DocAName x - g (DocProperty x) = DocProperty x - g (DocExamples x) = DocExamples x - g (DocHeader (Header l x)) = DocHeader . Header l $ g x - -parse :: Parser a -> BS.ByteString -> a -parse p = either err id . parseOnly (p <* endOfInput) - where - err = error . ("Haddock.Parser.parse: " ++) - --- | Main entry point to the parser. Appends the newline character --- to the input string. -parseParas :: String -- ^ String to parse - -> DocH mod IString -parseParas = parse (p <* skipSpace) . encodeUtf8 . (++ "\n") - where - p :: Parser (DocH mod IString) - p = mconcat <$> paragraph `sepBy` many (skipHorizontalSpace *> "\n") - --- | Parse a text paragraph. Actually just a wrapper over 'parseStringBS' which --- drops leading whitespace and encodes the string to UTF8 first. -parseString :: String -> DocH mod IString -parseString = parseStringBS . encodeUtf8 . dropWhile isSpace - -parseStringBS :: BS.ByteString -> DocH mod IString -parseStringBS = parse p - where - p :: Parser (DocH mod IString) - p = mconcat <$> many (monospace <|> anchor <|> identifier - <|> moduleName <|> picture <|> hyperlink <|> autoUrl <|> bold - <|> emphasis <|> encodedChar <|> string' <|> skipSpecialChar) - --- | Parses and processes --- <https://en.wikipedia.org/wiki/Numeric_character_reference Numeric character references> --- --- >>> parseOnly encodedChar "ABC" --- Right (DocString "ABC") -encodedChar :: Parser (DocH mod a) -encodedChar = "&#" *> c <* ";" - where - c = DocString . return . chr <$> num - num = hex <|> decimal - hex = ("x" <|> "X") *> hexadecimal - -specialChar :: [Char] -specialChar = "_/<@\"&'`" - --- | Plain, regular parser for text. Called as one of the last parsers --- to ensure that we have already given a chance to more meaningful parsers --- before capturing their characers. -string' :: Parser (DocH mod a) -string' = DocString . unescape . decodeUtf8 <$> takeWhile1_ (`notElem` specialChar) - where - unescape "" = "" - unescape ('\\':x:xs) = x : unescape xs - unescape (x:xs) = x : unescape xs - --- | Skips a single special character and treats it as a plain string. --- This is done to skip over any special characters belonging to other --- elements but which were not deemed meaningful at their positions. -skipSpecialChar :: Parser (DocH mod a) -skipSpecialChar = DocString . return <$> satisfy (`elem` specialChar) - --- | Emphasis parser. --- --- >>> parseOnly emphasis "/Hello world/" --- Right (DocEmphasis (DocString "Hello world")) -emphasis :: Parser (DocH mod IString) -emphasis = DocEmphasis . parseStringBS <$> - mfilter ('\n' `BS.notElem`) ("/" *> takeWhile1_ (/= '/') <* "/") - --- | Bold parser. --- --- >>> parseOnly bold "__Hello world__" --- Right (DocBold (DocString "Hello world")) -bold :: Parser (DocH mod IString) -bold = DocBold . parseStringBS <$> disallowNewline ("__" *> takeUntil "__") - -disallowNewline :: Parser BS.ByteString -> Parser BS.ByteString -disallowNewline = mfilter ('\n' `BS.notElem`) - --- | Like `takeWhile`, but unconditionally take escaped characters. -takeWhile_ :: (Char -> Bool) -> Parser BS.ByteString -takeWhile_ p = scan False p_ - where - p_ escaped c - | escaped = Just False - | not $ p c = Nothing - | otherwise = Just (c == '\\') - --- | Like `takeWhile1`, but unconditionally take escaped characters. -takeWhile1_ :: (Char -> Bool) -> Parser BS.ByteString -takeWhile1_ = mfilter (not . BS.null) . takeWhile_ - --- | Text anchors to allow for jumping around the generated documentation. --- --- >>> parseOnly anchor "#Hello world#" --- Right (DocAName "Hello world") -anchor :: Parser (DocH mod a) -anchor = DocAName . decodeUtf8 <$> ("#" *> takeWhile1 (`notElem` "#\n") <* "#") - --- | Monospaced strings. --- --- >>> parseOnly monospace "@cruel@" --- Right (DocMonospaced (DocString "cruel")) -monospace :: Parser (DocH mod IString) -monospace = DocMonospaced . parseStringBS <$> ("@" *> takeWhile1_ (/= '@') <* "@") - -moduleName :: Parser (DocH mod a) -moduleName = DocModule <$> (char '"' *> modid <* char '"') - where - modid = intercalate "." <$> conid `sepBy1` "." - conid = (:) - <$> satisfy isAsciiUpper - -- NOTE: According to Haskell 2010 we shouldd actually only - -- accept {small | large | digit | ' } here. But as we can't - -- match on unicode characters, this is currently not possible. - <*> (decodeUtf8 <$> takeWhile (`notElem` " .&[{}(=*)+]!#|@/;,^?\"\n")) - --- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify --- a title for the picture. --- --- >>> parseOnly picture "<<hello.png>>" --- Right (DocPic (Picture "hello.png" Nothing)) --- >>> parseOnly picture "<<hello.png world>>" --- Right (DocPic (Picture "hello.png" (Just "world"))) -picture :: Parser (DocH mod a) -picture = DocPic . makeLabeled Picture . decodeUtf8 - <$> disallowNewline ("<<" *> takeUntil ">>") - --- | Paragraph parser, called by 'parseParas'. -paragraph :: Parser (DocH mod IString) -paragraph = examples <|> skipSpace *> (list <|> birdtracks <|> codeblock - <|> property <|> header - <|> textParagraph) - -header :: Parser (DocH mod IString) -header = do - let psers = map (string . encodeUtf8 . concat . flip replicate "=") [6, 5 .. 1] - pser = foldl1 (<|>) psers - delim <- decodeUtf8 <$> pser - line <- skipHorizontalSpace *> nonEmptyLine >>= return . parseString - rest <- paragraph <|> return mempty - return $ docAppend (DocParagraph (DocHeader (Header (length delim) line))) rest - -textParagraph :: Parser (DocH mod IString) -textParagraph = docParagraph . parseString . intercalate "\n" <$> many1 nonEmptyLine - --- | List parser, called by 'paragraph'. -list :: Parser (DocH mod IString) -list = DocUnorderedList <$> unorderedList - <|> DocOrderedList <$> orderedList - <|> DocDefList <$> definitionList - --- | Parses unordered (bullet) lists. -unorderedList :: Parser [DocH mod IString] -unorderedList = ("*" <|> "-") *> innerList unorderedList - --- | Parses ordered lists (numbered or dashed). -orderedList :: Parser [DocH mod IString] -orderedList = (paren <|> dot) *> innerList orderedList - where - dot = (decimal :: Parser Int) <* "." - paren = "(" *> decimal <* ")" - --- | Generic function collecting any further lines belonging to the --- list entry and recursively collecting any further lists in the --- same paragraph. Usually used as --- --- > someListFunction = listBeginning *> innerList someListFunction -innerList :: Parser [DocH mod IString] -> Parser [DocH mod IString] -innerList item = do - c <- takeLine - (cs, items) <- more item - let contents = docParagraph . parseString . dropNLs . unlines $ c : cs - return $ case items of - Left p -> [contents `joinPara` p] - Right i -> contents : i - --- | Parses definition lists. -definitionList :: Parser [(DocH mod IString, DocH mod IString)] -definitionList = do - label <- "[" *> (parseStringBS <$> takeWhile1 (`notElem` "]\n")) <* "]" - c <- takeLine - (cs, items) <- more definitionList - let contents = parseString . dropNLs . unlines $ c : cs - return $ case items of - Left p -> [(label, contents `joinPara` p)] - Right i -> (label, contents) : i - --- | If possible, appends two 'Doc's under a 'DocParagraph' rather than --- outside of it. This allows to get structures like --- --- @DocParagraph (DocAppend … …)@ --- --- rather than --- --- @DocAppend (DocParagraph …) …@ -joinPara :: DocH mod id -> DocH mod id -> DocH mod id -joinPara (DocParagraph p) c = docParagraph $ docAppend p c -joinPara d p = docAppend d p - --- | Drops all trailing newlines. -dropNLs :: String -> String -dropNLs = reverse . dropWhile (== '\n') . reverse - --- | Main worker for 'innerList' and 'definitionList'. --- We need the 'Either' here to be able to tell in the respective functions --- whether we're dealing with the next list or a nested paragraph. -more :: Monoid a => Parser a - -> Parser ([String], Either (DocH mod IString) a) -more item = innerParagraphs <|> moreListItems item - <|> moreContent item <|> pure ([], Right mempty) - --- | Use by 'innerList' and 'definitionList' to parse any nested paragraphs. -innerParagraphs :: Parser ([String], Either (DocH mod IString) a) -innerParagraphs = (,) [] . Left <$> ("\n" *> indentedParagraphs) - --- | Attemps to fetch the next list if possibly. Used by 'innerList' and --- 'definitionList' to recursivly grab lists that aren't separated by a whole --- paragraph. -moreListItems :: Parser a - -> Parser ([String], Either (DocH mod IString) a) -moreListItems item = (,) [] . Right <$> (skipSpace *> item) - --- | Helper for 'innerList' and 'definitionList' which simply takes --- a line of text and attempts to parse more list content with 'more'. -moreContent :: Monoid a => Parser a - -> Parser ([String], Either (DocH mod IString) a) -moreContent item = first . (:) <$> nonEmptyLine <*> more item - --- | Runs the 'parseParas' parser on an indented paragraph. --- The indentation is 4 spaces. -indentedParagraphs :: Parser (DocH mod IString) -indentedParagraphs = parseParas . concat <$> dropFrontOfPara " " - --- | Grab as many fully indented paragraphs as we can. -dropFrontOfPara :: Parser BS.ByteString -> Parser [String] -dropFrontOfPara sp = do - currentParagraph <- some (sp *> takeNonEmptyLine) - followingParagraphs <- - skipHorizontalSpace *> nextPar -- we have more paragraphs to take - <|> skipHorizontalSpace *> nlList -- end of the ride, remember the newline - <|> endOfInput *> return [] -- nothing more to take at all - return (currentParagraph ++ followingParagraphs) - where - nextPar = (++) <$> nlList <*> dropFrontOfPara sp - nlList = "\n" *> return ["\n"] - -nonSpace :: BS.ByteString -> Parser BS.ByteString -nonSpace xs - | not $ any (not . isSpace) $ decodeUtf8 xs = fail "empty line" - | otherwise = return xs - --- | Takes a non-empty, not fully whitespace line. --- --- Doesn't discard the trailing newline. -takeNonEmptyLine :: Parser String -takeNonEmptyLine = do - (++ "\n") . decodeUtf8 <$> (takeWhile1 (/= '\n') >>= nonSpace) <* "\n" - -birdtracks :: Parser (DocH mod a) -birdtracks = DocCodeBlock . DocString . intercalate "\n" . stripSpace <$> many1 line - where - line = skipHorizontalSpace *> ">" *> takeLine - -stripSpace :: [String] -> [String] -stripSpace = fromMaybe <*> mapM strip' - where - strip' (' ':xs') = Just xs' - strip' "" = Just "" - strip' _ = Nothing - --- | Parses examples. Examples are a paragraph level entitity (separated by an empty line). --- Consecutive examples are accepted. -examples :: Parser (DocH mod a) -examples = DocExamples <$> (many (skipHorizontalSpace *> "\n") *> go) - where - go :: Parser [Example] - go = do - prefix <- decodeUtf8 <$> takeHorizontalSpace <* ">>>" - expr <- takeLine - (rs, es) <- resultAndMoreExamples - return (makeExample prefix expr rs : es) - where - resultAndMoreExamples :: Parser ([String], [Example]) - resultAndMoreExamples = moreExamples <|> result <|> pure ([], []) - where - moreExamples :: Parser ([String], [Example]) - moreExamples = (,) [] <$> go - - result :: Parser ([String], [Example]) - result = first . (:) <$> nonEmptyLine <*> resultAndMoreExamples - - makeExample :: String -> String -> [String] -> Example - makeExample prefix expression res = - Example (strip expression) result - where - result = map (substituteBlankLine . tryStripPrefix) res - - tryStripPrefix xs = fromMaybe xs (stripPrefix prefix xs) - - substituteBlankLine "<BLANKLINE>" = "" - substituteBlankLine xs = xs - -nonEmptyLine :: Parser String -nonEmptyLine = mfilter (any (not . isSpace)) takeLine - -takeLine :: Parser String -takeLine = decodeUtf8 <$> takeWhile (/= '\n') <* endOfLine - -endOfLine :: Parser () -endOfLine = void "\n" <|> endOfInput - --- | Property parser. --- --- >>> parseOnly property "prop> hello world" --- Right (DocProperty "hello world") -property :: Parser (DocH mod a) -property = DocProperty . strip . decodeUtf8 <$> ("prop>" *> takeWhile1 (/= '\n')) - --- | --- Paragraph level codeblock. Anything between the two delimiting @ is parsed --- for markup. -codeblock :: Parser (DocH mod IString) -codeblock = - DocCodeBlock . parseStringBS . dropSpaces - <$> ("@" *> skipHorizontalSpace *> "\n" *> block' <* "@") - where - dropSpaces xs = - let rs = decodeUtf8 xs - in case splitByNl rs of - [] -> xs - ys -> case last ys of - ' ':_ -> case mapM dropSpace ys of - Nothing -> xs - Just zs -> encodeUtf8 $ intercalate "\n" zs - _ -> xs - - -- This is necessary because ‘lines’ swallows up a trailing newline - -- and we lose information about whether the last line belongs to @ or to - -- text which we need to decide whether we actually want to be dropping - -- anything at all. - splitByNl = unfoldr (\case '\n':s -> Just (span (/= '\n') s) - _ -> Nothing) - . ('\n' :) - - dropSpace "" = Just "" - dropSpace (' ':xs) = Just xs - dropSpace _ = Nothing - - block' = scan False p - where - p isNewline c - | isNewline && c == '@' = Nothing - | isNewline && isSpace c = Just isNewline - | otherwise = Just $ c == '\n' - -hyperlink :: Parser (DocH mod a) -hyperlink = DocHyperlink . makeLabeled Hyperlink . decodeUtf8 - <$> disallowNewline ("<" *> takeUntil ">") - <|> autoUrl - -autoUrl :: Parser (DocH mod a) -autoUrl = mkLink <$> url - where - url = mappend <$> ("http://" <|> "https://" <|> "ftp://") <*> takeWhile1 (not . isSpace) - mkLink :: BS.ByteString -> DocH mod a - mkLink s = case BS.unsnoc s of - Just (xs, x) | x `elem` ",.!?" -> DocHyperlink (Hyperlink (decodeUtf8 xs) Nothing) <> DocString [x] - _ -> DocHyperlink (Hyperlink (decodeUtf8 s) Nothing) - --- | Parses strings between identifier delimiters. Consumes all input that it --- deems to be valid in an identifier. Note that it simply blindly consumes --- characters and does no actual validation itself. -parseValid :: Parser String -parseValid = do - vs' <- many' $ utf8String "⋆" <|> return <$> idChar - let vs = concat vs' - c <- peekChar - case c of - Just '`' -> return vs - Just '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> parseValid) - <|> return vs - _ -> fail "outofvalid" - where - idChar = satisfy (`elem` "_.!#$%&*+/<=>?@\\|-~:^") - <|> digit <|> letter_ascii - --- | Parses UTF8 strings from ByteString streams. -utf8String :: String -> Parser String -utf8String x = decodeUtf8 <$> string (encodeUtf8 x) - --- | Parses identifiers with help of 'parseValid'. Asks GHC for 'String' from the --- string it deems valid. -identifier :: Parser (DocH mod IString) -identifier = do - o <- idDelim - vid <- parseValid - e <- idDelim - return $ DocIdentifier (o, vid, e) - where - idDelim = char '\'' <|> char '`' - --- | Remove all leading and trailing whitespace -strip :: String -> String -strip = (\f -> f . f) $ dropWhile isSpace . reverse - -skipHorizontalSpace :: Parser () -skipHorizontalSpace = skipWhile (`elem` " \t\f\v\r") - -takeHorizontalSpace :: Parser BS.ByteString -takeHorizontalSpace = takeWhile (`elem` " \t\f\v\r") - -makeLabeled :: (String -> Maybe String -> a) -> String -> a -makeLabeled f input = case break isSpace $ removeEscapes $ strip input of - (uri, "") -> f uri Nothing - (uri, label) -> f uri (Just $ dropWhile isSpace label) - where - -- As we don't parse these any further, we don't do any processing to the - -- string so we at least remove escape character here. Perhaps we should - -- actually be parsing the label at the very least? - removeEscapes "" = "" - removeEscapes ('\\':'\\':xs) = '\\' : removeEscapes xs - removeEscapes ('\\':xs) = removeEscapes xs - removeEscapes (x:xs) = x : removeEscapes xs - -takeUntil :: ByteString -> Parser ByteString -takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome - where - end = BS.unpack end_ - - p :: (Bool, String) -> Char -> Maybe (Bool, String) - p acc c = case acc of - (True, _) -> Just (False, end) - (_, []) -> Nothing - (_, x:xs) | x == c -> Just (False, xs) - _ -> Just (c == '\\', end) - - dropEnd = BS.reverse . BS.drop (length end) . BS.reverse - requireEnd = mfilter (BS.isSuffixOf end_) - - gotSome xs - | BS.null xs = fail "didn't get any content" - | otherwise = return xs +import Documentation.Haddock.Parser +import DynFlags (DynFlags) +import FastString (mkFastString) +import Documentation.Haddock.Types +import Lexer (mkPState, unP, ParseResult(POk)) +import Parser (parseIdentifier) +import RdrName (RdrName) +import SrcLoc (mkRealSrcLoc, unLoc) +import StringBuffer (stringToStringBuffer) + +{-# DEPRECATED parseParasMaybe "use `parseParas` instead" #-} +parseParasMaybe :: DynFlags -> String -> Maybe (DocH mod RdrName) +parseParasMaybe d = Just . overIdentifier (parseIdent d) . parseParas + +{-# DEPRECATED parseStringMaybe "use `parseString` instead" #-} +parseStringMaybe :: DynFlags -> String -> Maybe (DocH mod RdrName) +parseStringMaybe d = Just . overIdentifier (parseIdent d) . parseString + +parseIdent :: DynFlags -> String -> Maybe RdrName +parseIdent dflags str0 = + let buffer = stringToStringBuffer str0 + realSrcLc = mkRealSrcLoc (mkFastString "<unknown file>") 0 0 + pstate = mkPState dflags buffer realSrcLc + in case unP parseIdentifier pstate of + POk _ name -> Just (unLoc name) + _ -> Nothing |