diff options
author | Alec Theriault <alec.theriault@gmail.com> | 2018-04-25 11:24:07 -0700 |
---|---|---|
committer | Alexander Biehl <alexbiehl@gmail.com> | 2018-04-25 11:24:07 -0700 |
commit | 79c7159101c03bbbc7350e07963896ca2bb97c02 (patch) | |
tree | c754c425ed9d4ff8755dbe67589fa3c9dbbce10c /haddock-library/src | |
parent | 979c7338cfcdc59f0b0dda562a53558c416cc362 (diff) |
Replace 'attoparsec' with 'parsec' (#799)
* Remove attoparsec with parsec and start fixing failed parses
* Make tests pass
* Fix encoding issues
The Haddock parser no longer needs to worry about bytestrings. All
the internal parsing work in haddock-library happens over 'Text'.
* Remove attoparsec vendor
* Fix stuff broken in 'attoparsec' -> 'parsec'
* hyperlinks
* codeblocks
* examples
Pretty much all issues are due to attoparsec's backtracking failure
behaviour vs. parsec's non-backtracking failure behaviour.
* Fix small TODOs
* Missing quote + Haddocks
* Better handle spaces before/after paragraphs
* Address review comments
Diffstat (limited to 'haddock-library/src')
3 files changed, 348 insertions, 387 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs index 4921b3a7..d79da40b 100644 --- a/haddock-library/src/Documentation/Haddock/Parser.hs +++ b/haddock-library/src/Documentation/Haddock/Parser.hs @@ -16,27 +16,35 @@ -- to be -- -- @'toRegular' . '_doc' . 'parseParas'@ -module Documentation.Haddock.Parser ( parseString, parseParas - , overIdentifier, toRegular, Identifier - ) where +module Documentation.Haddock.Parser ( + parseString, + parseParas, + overIdentifier, + toRegular, + Identifier +) where import Control.Applicative import Control.Arrow (first) import Control.Monad -import qualified Data.ByteString.Char8 as BS -import Data.Char (chr, isUpper, isAlpha, isAlphaNum) -import Data.List (stripPrefix, intercalate, unfoldr, elemIndex) +import Data.Char (chr, isUpper, isAlpha, isAlphaNum, isSpace) +import Data.List (intercalate, unfoldr, elemIndex, notElem) import Data.Maybe (fromMaybe, mapMaybe) import Data.Monoid import qualified Data.Set as Set import Documentation.Haddock.Doc -import Documentation.Haddock.Parser.Monad hiding (take, endOfLine) +import Documentation.Haddock.Parser.Monad import Documentation.Haddock.Parser.Util import Documentation.Haddock.Types -import Documentation.Haddock.Utf8 import Prelude hiding (takeWhile) import qualified Prelude as P +import qualified Text.Parsec as Parsec +import Text.Parsec (try) + +import qualified Data.Text as T +import Data.Text (Text) + #if MIN_VERSION_base(4,9,0) import Text.Read.Lex (isSymbolChar) #else @@ -56,7 +64,7 @@ isSymbolChar c = not (isPuncChar c) && case generalCategory c of ModifierSymbol -> True OtherSymbol -> True DashPunctuation -> True - OtherPunctuation -> not (c `elem` ("'\"" :: String)) + OtherPunctuation -> c `notElem` ("'\"" :: String) ConnectorPunctuation -> c /= '_' _ -> False where @@ -109,8 +117,14 @@ overIdentifier f d = g d g (DocHeader (Header l x)) = DocHeader . Header l $ g x g (DocTable (Table h b)) = DocTable (Table (map (fmap g) h) (map (fmap g) b)) -parse :: Parser a -> BS.ByteString -> (ParserState, a) -parse p = either err id . parseOnly (p <* endOfInput) + +choice' :: [Parser a] -> Parser a +choice' [] = empty +choice' [p] = p +choice' (p : ps) = try p <|> choice' ps + +parse :: Parser a -> Text -> (ParserState, a) +parse p = either err id . parseOnly (p <* Parsec.eof) where err = error . ("Haddock.Parser.parse: " ++) @@ -127,31 +141,46 @@ parseParas pkg input = case parseParasState input of } parseParasState :: String -> (ParserState, DocH mod Identifier) -parseParasState = - parse (p <* skipSpace) . encodeUtf8 . (++ "\n") . filter (/= '\r') +parseParasState = parse (emptyLines *> p) . T.pack . (++ "\n") . filter (/= '\r') where p :: Parser (DocH mod Identifier) - p = docConcat <$> paragraph `sepBy` many (skipHorizontalSpace *> "\n") + p = docConcat <$> many (paragraph <* emptyLines) + + emptyLines :: Parser () + emptyLines = void $ many (try (skipHorizontalSpace *> "\n")) parseParagraphs :: String -> Parser (DocH mod Identifier) parseParagraphs input = case parseParasState input of - (state, a) -> setParserState state >> return a + (state, a) -> Parsec.putState state *> pure a --- | Parse a text paragraph. Actually just a wrapper over 'parseStringBS' which --- drops leading whitespace and encodes the string to UTF8 first. +-- | Variant of 'parseText' for 'String' instead of 'Text' parseString :: String -> DocH mod Identifier -parseString = parseStringBS . encodeUtf8 . dropWhile isSpace . filter (/= '\r') +parseString = parseText . T.pack -parseStringBS :: BS.ByteString -> DocH mod Identifier -parseStringBS = snd . parse p +-- | Parse a text paragraph. Actually just a wrapper over 'parseParagraph' which +-- drops leading whitespace. +parseText :: Text -> DocH mod Identifier +parseText = parseParagraph . T.dropWhile isSpace . T.filter (/= '\r') + +parseParagraph :: Text -> DocH mod Identifier +parseParagraph = snd . parse p where p :: Parser (DocH mod Identifier) - p = docConcat <$> many (monospace <|> anchor <|> identifier <|> moduleName - <|> picture <|> mathDisplay <|> mathInline - <|> markdownImage - <|> hyperlink <|> bold - <|> emphasis <|> encodedChar <|> string' - <|> skipSpecialChar) + p = docConcat <$> many (choice' [ monospace + , anchor + , identifier + , moduleName + , picture + , mathDisplay + , mathInline + , markdownImage + , hyperlink + , bold + , emphasis + , encodedChar + , string' + , skipSpecialChar + ]) -- | Parses and processes -- <https://en.wikipedia.org/wiki/Numeric_character_reference Numeric character references> @@ -175,7 +204,7 @@ specialChar = "_/<@\"&'`# " -- to ensure that we have already given a chance to more meaningful parsers -- before capturing their characers. string' :: Parser (DocH mod a) -string' = DocString . unescape . decodeUtf8 <$> takeWhile1_ (notInClass specialChar) +string' = DocString . unescape . T.unpack <$> takeWhile1_ (`notElem` specialChar) where unescape "" = "" unescape ('\\':x:xs) = x : unescape xs @@ -185,45 +214,45 @@ string' = DocString . unescape . decodeUtf8 <$> takeWhile1_ (notInClass specialC -- This is done to skip over any special characters belonging to other -- elements but which were not deemed meaningful at their positions. skipSpecialChar :: Parser (DocH mod a) -skipSpecialChar = DocString . return <$> satisfy (inClass specialChar) +skipSpecialChar = DocString . return <$> Parsec.oneOf specialChar -- | Emphasis parser. -- -- >>> parseString "/Hello world/" -- DocEmphasis (DocString "Hello world") emphasis :: Parser (DocH mod Identifier) -emphasis = DocEmphasis . parseStringBS <$> - mfilter ('\n' `BS.notElem`) ("/" *> takeWhile1_ (/= '/') <* "/") +emphasis = DocEmphasis . parseParagraph <$> + disallowNewline ("/" *> takeWhile1_ (/= '/') <* "/") -- | Bold parser. -- -- >>> parseString "__Hello world__" -- DocBold (DocString "Hello world") bold :: Parser (DocH mod Identifier) -bold = DocBold . parseStringBS <$> disallowNewline ("__" *> takeUntil "__") +bold = DocBold . parseParagraph <$> disallowNewline ("__" *> takeUntil "__") -disallowNewline :: Parser BS.ByteString -> Parser BS.ByteString -disallowNewline = mfilter ('\n' `BS.notElem`) +disallowNewline :: Parser Text -> Parser Text +disallowNewline = mfilter (T.all (/= '\n')) -- | Like `takeWhile`, but unconditionally take escaped characters. -takeWhile_ :: (Char -> Bool) -> Parser BS.ByteString -takeWhile_ p = scan False p_ +takeWhile_ :: (Char -> Bool) -> Parser Text +takeWhile_ p = scan p_ False where p_ escaped c | escaped = Just False | not $ p c = Nothing | otherwise = Just (c == '\\') --- | Like `takeWhile1`, but unconditionally take escaped characters. -takeWhile1_ :: (Char -> Bool) -> Parser BS.ByteString -takeWhile1_ = mfilter (not . BS.null) . takeWhile_ +-- | Like 'takeWhile1', but unconditionally take escaped characters. +takeWhile1_ :: (Char -> Bool) -> Parser Text +takeWhile1_ = mfilter (not . T.null) . takeWhile_ -- | Text anchors to allow for jumping around the generated documentation. -- -- >>> parseString "#Hello world#" -- DocAName "Hello world" anchor :: Parser (DocH mod a) -anchor = DocAName . decodeUtf8 <$> +anchor = DocAName . T.unpack <$> disallowNewline ("#" *> takeWhile1_ (/= '#') <* "#") -- | Monospaced strings. @@ -231,7 +260,7 @@ anchor = DocAName . decodeUtf8 <$> -- >>> parseString "@cruel@" -- DocMonospaced (DocString "cruel") monospace :: Parser (DocH mod Identifier) -monospace = DocMonospaced . parseStringBS +monospace = DocMonospaced . parseParagraph <$> ("@" *> takeWhile1_ (/= '@') <* "@") -- | Module names. @@ -239,14 +268,14 @@ monospace = DocMonospaced . parseStringBS -- Note that we allow '#' and '\' to support anchors (old style anchors are of -- the form "SomeModule\#anchor"). moduleName :: Parser (DocH mod a) -moduleName = DocModule <$> (char '"' *> modid <* char '"') +moduleName = DocModule <$> ("\"" *> modid <* "\"") where - modid = intercalate "." <$> conid `sepBy1` "." + modid = intercalate "." <$> conid `Parsec.sepBy1` "." conid = (:) - <$> satisfyUnicode (\c -> isAlpha c && isUpper c) - <*> many (satisfyUnicode conChar <|> char '\\' <|> char '#') + <$> Parsec.satisfy (\c -> isAlpha c && isUpper c) + <*> many (conChar <|> Parsec.oneOf "\\#") - conChar c = isAlphaNum c || c == '_' + conChar = Parsec.alphaNum <|> Parsec.char '_' -- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify -- a title for the picture. @@ -256,7 +285,7 @@ moduleName = DocModule <$> (char '"' *> modid <* char '"') -- >>> parseString "<<hello.png world>>" -- DocPic (Picture {pictureUri = "hello.png", pictureTitle = Just "world"}) picture :: Parser (DocH mod a) -picture = DocPic . makeLabeled Picture . decodeUtf8 +picture = DocPic . makeLabeled Picture <$> disallowNewline ("<<" *> takeUntil ">>") -- | Inline math parser, surrounded by \\( and \\). @@ -264,7 +293,7 @@ picture = DocPic . makeLabeled Picture . decodeUtf8 -- >>> parseString "\\(\\int_{-\\infty}^{\\infty} e^{-x^2/2} = \\sqrt{2\\pi}\\)" -- DocMathInline "\\int_{-\\infty}^{\\infty} e^{-x^2/2} = \\sqrt{2\\pi}" mathInline :: Parser (DocH mod a) -mathInline = DocMathInline . decodeUtf8 +mathInline = DocMathInline . T.unpack <$> disallowNewline ("\\(" *> takeUntil "\\)") -- | Display math parser, surrounded by \\[ and \\]. @@ -272,7 +301,7 @@ mathInline = DocMathInline . decodeUtf8 -- >>> parseString "\\[\\int_{-\\infty}^{\\infty} e^{-x^2/2} = \\sqrt{2\\pi}\\]" -- DocMathDisplay "\\int_{-\\infty}^{\\infty} e^{-x^2/2} = \\sqrt{2\\pi}" mathDisplay :: Parser (DocH mod a) -mathDisplay = DocMathDisplay . decodeUtf8 +mathDisplay = DocMathDisplay . T.unpack <$> ("\\[" *> takeUntil "\\]") markdownImage :: Parser (DocH mod a) @@ -282,20 +311,21 @@ markdownImage = fromHyperlink <$> ("!" *> linkParser) -- | Paragraph parser, called by 'parseParas'. paragraph :: Parser (DocH mod Identifier) -paragraph = examples <|> table <|> do - indent <- takeIndent - choice - [ since - , unorderedList indent - , orderedList indent - , birdtracks - , codeblock - , property - , header - , textParagraphThatStartsWithMarkdownLink - , definitionList indent - , docParagraph <$> textParagraph - ] +paragraph = choice' [ examples + , table + , do indent <- takeIndent + choice' [ since + , unorderedList indent + , orderedList indent + , birdtracks + , codeblock + , property + , header + , textParagraphThatStartsWithMarkdownLink + , definitionList indent + , docParagraph <$> textParagraph + ] + ] -- | Provides support for grid tables. -- @@ -317,45 +347,45 @@ table :: Parser (DocH mod Identifier) table = do -- first we parse the first row, which determines the width of the table firstRow <- parseFirstRow - let len = BS.length firstRow + let len = T.length firstRow -- then we parse all consequtive rows starting and ending with + or |, -- of the width `len`. - restRows <- many (parseRestRows len) + restRows <- many (try (parseRestRows len)) -- Now we gathered the table block, the next step is to split the block -- into cells. DocTable <$> tableStepTwo len (firstRow : restRows) where - parseFirstRow :: Parser BS.ByteString + parseFirstRow :: Parser Text parseFirstRow = do skipHorizontalSpace -- upper-left corner is + - c <- char '+' - cs <- many1 (char '-' <|> char '+') + c <- Parsec.char '+' + cs <- some (Parsec.char '-' <|> Parsec.char '+') -- upper right corner is + too guard (last cs == '+') -- trailing space skipHorizontalSpace - _ <- char '\n' + _ <- Parsec.newline - return (BS.cons c $ BS.pack cs) + return (T.cons c $ T.pack cs) - parseRestRows :: Int -> Parser BS.ByteString + parseRestRows :: Int -> Parser Text parseRestRows l = do skipHorizontalSpace - c <- char '|' <|> char '+' - bs <- scan (l - 2) predicate - c2 <- char '|' <|> char '+' + c <- Parsec.char '|' <|> Parsec.char '+' + bs <- scan predicate (l - 2) + c2 <- Parsec.char '|' <|> Parsec.char '+' -- trailing space skipHorizontalSpace - _ <- char '\n' + _ <- Parsec.newline - return (BS.cons c (BS.snoc bs c2)) + return (T.cons c (T.snoc bs c2)) where predicate n c | n <= 0 = Nothing @@ -366,22 +396,22 @@ table = do -- and changes to '=' to '-'. tableStepTwo :: Int -- ^ width - -> [BS.ByteString] -- ^ rows + -> [Text] -- ^ rows -> Parser (Table (DocH mod Identifier)) tableStepTwo width = go 0 [] where go _ left [] = tableStepThree width (reverse left) Nothing go n left (r : rs) - | BS.all (`elem` ['+', '=']) r = + | T.all (`elem` ['+', '=']) r = tableStepThree width (reverse left ++ r' : rs) (Just n) | otherwise = go (n + 1) (r : left) rs where - r' = BS.map (\c -> if c == '=' then '-' else c) r + r' = T.map (\c -> if c == '=' then '-' else c) r -- Third step recognises cells in the table area, returning a list of TC, cells. tableStepThree :: Int -- ^ width - -> [BS.ByteString] -- ^ rows + -> [Text] -- ^ rows -> Maybe Int -- ^ index of header separator -> Parser (Table (DocH mod Identifier)) tableStepThree width rs hdrIndex = do @@ -408,32 +438,32 @@ tableStepThree width rs hdrIndex = do scanRight :: Int -> Int -> Maybe (Int, Int) scanRight x y = go (x + 1) where bs = rs !! y - go x' | x' >= width = fail "overflow right " - | BS.index bs x' == '+' = scanDown x y x' <|> go (x' + 1) - | BS.index bs x' == '-' = go (x' + 1) - | otherwise = fail $ "not a border (right) " ++ show (x,y,x') + go x' | x' >= width = fail "overflow right " + | T.index bs x' == '+' = scanDown x y x' <|> go (x' + 1) + | T.index bs x' == '-' = go (x' + 1) + | otherwise = fail $ "not a border (right) " ++ show (x,y,x') -- scan down looking for + scanDown :: Int -> Int -> Int -> Maybe (Int, Int) scanDown x y x2 = go (y + 1) where - go y' | y' >= height = fail "overflow down" - | BS.index (rs !! y') x2 == '+' = scanLeft x y x2 y' <|> go (y' + 1) - | BS.index (rs !! y') x2 == '|' = go (y' + 1) - | otherwise = fail $ "not a border (down) " ++ show (x,y,x2,y') + go y' | y' >= height = fail "overflow down" + | T.index (rs !! y') x2 == '+' = scanLeft x y x2 y' <|> go (y' + 1) + | T.index (rs !! y') x2 == '|' = go (y' + 1) + | otherwise = fail $ "not a border (down) " ++ show (x,y,x2,y') -- check that at y2 x..x2 characters are '+' or '-' scanLeft :: Int -> Int -> Int -> Int -> Maybe (Int, Int) scanLeft x y x2 y2 - | all (\x' -> BS.index bs x' `elem` ['+', '-']) [x..x2] = scanUp x y x2 y2 - | otherwise = fail $ "not a border (left) " ++ show (x,y,x2,y2) + | all (\x' -> T.index bs x' `elem` ['+', '-']) [x..x2] = scanUp x y x2 y2 + | otherwise = fail $ "not a border (left) " ++ show (x,y,x2,y2) where bs = rs !! y2 -- check that at y2 x..x2 characters are '+' or '-' scanUp :: Int -> Int -> Int -> Int -> Maybe (Int, Int) scanUp x y x2 y2 - | all (\y' -> BS.index (rs !! y') x `elem` ['+', '|']) [y..y2] = return (x2, y2) - | otherwise = fail $ "not a border (up) " ++ show (x,y,x2,y2) + | all (\y' -> T.index (rs !! y') x `elem` ['+', '|']) [y..y2] = return (x2, y2) + | otherwise = fail $ "not a border (up) " ++ show (x,y,x2,y2) -- | table cell: top left bottom right data TC = TC !Int !Int !Int !Int @@ -446,7 +476,7 @@ tcYS :: TC -> [Int] tcYS (TC y _ y2 _) = [y, y2] -- | Fourth step. Given the locations of cells, forms 'Table' structure. -tableStepFour :: [BS.ByteString] -> Maybe Int -> [TC] -> Parser (Table (DocH mod Identifier)) +tableStepFour :: [Text] -> Maybe Int -> [TC] -> Parser (Table (DocH mod Identifier)) tableStepFour rs hdrIndex cells = case hdrIndex of Nothing -> return $ Table [] rowsDoc Just i -> case elemIndex i yTabStops of @@ -464,7 +494,7 @@ tableStepFour rs hdrIndex cells = case hdrIndex of init' [_] = [] init' (x : xs) = x : init' xs - rowsDoc = (fmap . fmap) parseStringBS rows + rowsDoc = (fmap . fmap) parseParagraph rows rows = map makeRow (init' yTabStops) where @@ -477,9 +507,9 @@ tableStepFour rs hdrIndex cells = case hdrIndex of yts = length $ P.takeWhile (< y2) $ dropWhile (< y) yTabStops -- extract cell contents given boundaries - extract :: Int -> Int -> Int -> Int -> BS.ByteString - extract x y x2 y2 = BS.intercalate "\n" - [ BS.take (x2 - x + 1) $ BS.drop x $ rs !! y' + extract :: Int -> Int -> Int -> Int -> Text + extract x y x2 y2 = T.intercalate "\n" + [ T.take (x2 - x + 1) $ T.drop x $ rs !! y' | y' <- [y .. y2] ] @@ -487,7 +517,7 @@ tableStepFour rs hdrIndex cells = case hdrIndex of since :: Parser (DocH mod a) since = ("@since " *> version <* skipHorizontalSpace <* endOfLine) >>= setSince >> return DocEmpty where - version = decimal `sepBy1'` "." + version = decimal `Parsec.sepBy1` "." -- | Headers inside the comment denoted with @=@ signs, up to 6 levels -- deep. @@ -498,38 +528,39 @@ since = ("@since " *> version <* skipHorizontalSpace <* endOfLine) >>= setSince -- Right (DocHeader (Header {headerLevel = 2, headerTitle = DocString "World"})) header :: Parser (DocH mod Identifier) header = do - let psers = map (string . encodeUtf8 . concat . flip replicate "=") [6, 5 .. 1] - pser = foldl1 (<|>) psers - delim <- decodeUtf8 <$> pser - line <- skipHorizontalSpace *> nonEmptyLine >>= return . parseString - rest <- paragraph <|> return DocEmpty + let psers = map (string . flip T.replicate "=") [6, 5 .. 1] + pser = choice' psers + delim <- T.unpack <$> pser + line <- skipHorizontalSpace *> nonEmptyLine >>= return . parseText + rest <- try paragraph <|> return DocEmpty return $ DocHeader (Header (length delim) line) `docAppend` rest textParagraph :: Parser (DocH mod Identifier) -textParagraph = parseString . intercalate "\n" <$> many1 nonEmptyLine +textParagraph = parseText . T.intercalate "\n" <$> some nonEmptyLine textParagraphThatStartsWithMarkdownLink :: Parser (DocH mod Identifier) textParagraphThatStartsWithMarkdownLink = docParagraph <$> (docAppend <$> markdownLink <*> optionalTextParagraph) where optionalTextParagraph :: Parser (DocH mod Identifier) - optionalTextParagraph = (docAppend <$> whitespace <*> textParagraph) <|> pure DocEmpty + optionalTextParagraph = choice' [ docAppend <$> whitespace <*> textParagraph + , pure DocEmpty ] whitespace :: Parser (DocH mod a) whitespace = DocString <$> (f <$> takeHorizontalSpace <*> optional "\n") where - f :: BS.ByteString -> Maybe BS.ByteString -> String + f :: Text -> Maybe Text -> String f xs (fromMaybe "" -> x) - | BS.null (xs <> x) = "" + | T.null (xs <> x) = "" | otherwise = " " -- | Parses unordered (bullet) lists. -unorderedList :: BS.ByteString -> Parser (DocH mod Identifier) +unorderedList :: Text -> Parser (DocH mod Identifier) unorderedList indent = DocUnorderedList <$> p where p = ("*" <|> "-") *> innerList indent p -- | Parses ordered lists (numbered or dashed). -orderedList :: BS.ByteString -> Parser (DocH mod Identifier) +orderedList :: Text -> Parser (DocH mod Identifier) orderedList indent = DocOrderedList <$> p where p = (paren <|> dot) *> innerList indent p @@ -541,104 +572,110 @@ orderedList indent = DocOrderedList <$> p -- same paragraph. Usually used as -- -- > someListFunction = listBeginning *> innerList someListFunction -innerList :: BS.ByteString -> Parser [DocH mod Identifier] +innerList :: Text -> Parser [DocH mod Identifier] -> Parser [DocH mod Identifier] innerList indent item = do c <- takeLine (cs, items) <- more indent item - let contents = docParagraph . parseString . dropNLs . unlines $ c : cs + let contents = docParagraph . parseText . dropNLs . T.unlines $ c : cs return $ case items of Left p -> [contents `docAppend` p] Right i -> contents : i -- | Parses definition lists. -definitionList :: BS.ByteString -> Parser (DocH mod Identifier) +definitionList :: Text -> Parser (DocH mod Identifier) definitionList indent = DocDefList <$> p where p = do - label <- "[" *> (parseStringBS <$> takeWhile1_ (notInClass "]\n")) <* ("]" <* optional ":") + label <- "[" *> (parseParagraph <$> takeWhile1_ (`notElem` ("]\n" :: String))) <* ("]" <* optional ":") c <- takeLine (cs, items) <- more indent p - let contents = parseString . dropNLs . unlines $ c : cs + let contents = parseText . dropNLs . T.unlines $ c : cs return $ case items of Left x -> [(label, contents `docAppend` x)] Right i -> (label, contents) : i -- | Drops all trailing newlines. -dropNLs :: String -> String -dropNLs = reverse . dropWhile (== '\n') . reverse +dropNLs :: Text -> Text +dropNLs = T.dropWhileEnd (== '\n') -- | Main worker for 'innerList' and 'definitionList'. -- We need the 'Either' here to be able to tell in the respective functions -- whether we're dealing with the next list or a nested paragraph. -more :: Monoid a => BS.ByteString -> Parser a - -> Parser ([String], Either (DocH mod Identifier) a) -more indent item = innerParagraphs indent - <|> moreListItems indent item - <|> moreContent indent item - <|> pure ([], Right mempty) +more :: Monoid a => Text -> Parser a + -> Parser ([Text], Either (DocH mod Identifier) a) +more indent item = choice' [ innerParagraphs indent + , moreListItems indent item + , moreContent indent item + , pure ([], Right mempty) + ] -- | Used by 'innerList' and 'definitionList' to parse any nested paragraphs. -innerParagraphs :: BS.ByteString - -> Parser ([String], Either (DocH mod Identifier) a) +innerParagraphs :: Text + -> Parser ([Text], Either (DocH mod Identifier) a) innerParagraphs indent = (,) [] . Left <$> ("\n" *> indentedParagraphs indent) -- | Attempts to fetch the next list if possibly. Used by 'innerList' and -- 'definitionList' to recursively grab lists that aren't separated by a whole -- paragraph. -moreListItems :: BS.ByteString -> Parser a - -> Parser ([String], Either (DocH mod Identifier) a) +moreListItems :: Text -> Parser a + -> Parser ([Text], Either (DocH mod Identifier) a) moreListItems indent item = (,) [] . Right <$> indentedItem where - indentedItem = string indent *> skipSpace *> item + indentedItem = string indent *> Parsec.spaces *> item -- | Helper for 'innerList' and 'definitionList' which simply takes -- a line of text and attempts to parse more list content with 'more'. -moreContent :: Monoid a => BS.ByteString -> Parser a - -> Parser ([String], Either (DocH mod Identifier) a) +moreContent :: Monoid a => Text -> Parser a + -> Parser ([Text], Either (DocH mod Identifier) a) moreContent indent item = first . (:) <$> nonEmptyLine <*> more indent item -- | Parses an indented paragraph. -- The indentation is 4 spaces. -indentedParagraphs :: BS.ByteString -> Parser (DocH mod Identifier) +indentedParagraphs :: Text -> Parser (DocH mod Identifier) indentedParagraphs indent = - (concat <$> dropFrontOfPara indent') >>= parseParagraphs + (T.unpack . T.concat <$> dropFrontOfPara indent') >>= parseParagraphs where - indent' = string $ BS.append indent " " + indent' = string $ indent <> " " -- | Grab as many fully indented paragraphs as we can. -dropFrontOfPara :: Parser BS.ByteString -> Parser [String] +dropFrontOfPara :: Parser Text -> Parser [Text] dropFrontOfPara sp = do - currentParagraph <- some (sp *> takeNonEmptyLine) + currentParagraph <- some (try (sp *> takeNonEmptyLine)) followingParagraphs <- - skipHorizontalSpace *> nextPar -- we have more paragraphs to take - <|> skipHorizontalSpace *> nlList -- end of the ride, remember the newline - <|> endOfInput *> return [] -- nothing more to take at all + choice' [ skipHorizontalSpace *> nextPar -- we have more paragraphs to take + , skipHorizontalSpace *> nlList -- end of the ride, remember the newline + , Parsec.eof *> return [] -- nothing more to take at all + ] return (currentParagraph ++ followingParagraphs) where nextPar = (++) <$> nlList <*> dropFrontOfPara sp nlList = "\n" *> return ["\n"] -nonSpace :: BS.ByteString -> Parser BS.ByteString +nonSpace :: Text -> Parser Text nonSpace xs - | not $ any (not . isSpace) $ decodeUtf8 xs = fail "empty line" + | T.all isSpace xs = fail "empty line" | otherwise = return xs -- | Takes a non-empty, not fully whitespace line. -- -- Doesn't discard the trailing newline. -takeNonEmptyLine :: Parser String +takeNonEmptyLine :: Parser Text takeNonEmptyLine = do - (++ "\n") . decodeUtf8 <$> (takeWhile1 (/= '\n') >>= nonSpace) <* "\n" + l <- takeWhile1 (Parsec.noneOf "\n") >>= nonSpace + _ <- "\n" + pure (l <> "\n") -- | Takes indentation of first non-empty line. -- -- More precisely: skips all whitespace-only lines and returns indentation -- (horizontal space, might be empty) of that non-empty line. -takeIndent :: Parser BS.ByteString +takeIndent :: Parser Text takeIndent = do indent <- takeHorizontalSpace - "\n" *> takeIndent <|> return indent + choice' [ "\n" *> takeIndent + , return indent + ] -- | Blocks of text of the form: -- @@ -647,97 +684,98 @@ takeIndent = do -- >> baz -- birdtracks :: Parser (DocH mod a) -birdtracks = DocCodeBlock . DocString . intercalate "\n" . stripSpace <$> many1 line +birdtracks = DocCodeBlock . DocString . T.unpack . T.intercalate "\n" . stripSpace <$> some line where - line = skipHorizontalSpace *> ">" *> takeLine + line = try (skipHorizontalSpace *> ">" *> takeLine) -stripSpace :: [String] -> [String] +stripSpace :: [Text] -> [Text] stripSpace = fromMaybe <*> mapM strip' where - strip' (' ':xs') = Just xs' - strip' "" = Just "" - strip' _ = Nothing + strip' t = case T.uncons t of + Nothing -> Just "" + Just (' ',t') -> Just t' + _ -> Nothing -- | Parses examples. Examples are a paragraph level entitity (separated by an empty line). -- Consecutive examples are accepted. examples :: Parser (DocH mod a) -examples = DocExamples <$> (many (skipHorizontalSpace *> "\n") *> go) +examples = DocExamples <$> (many (try (skipHorizontalSpace *> "\n")) *> go) where go :: Parser [Example] go = do - prefix <- decodeUtf8 <$> takeHorizontalSpace <* ">>>" + prefix <- takeHorizontalSpace <* ">>>" expr <- takeLine (rs, es) <- resultAndMoreExamples return (makeExample prefix expr rs : es) where - resultAndMoreExamples :: Parser ([String], [Example]) - resultAndMoreExamples = moreExamples <|> result <|> pure ([], []) + resultAndMoreExamples :: Parser ([Text], [Example]) + resultAndMoreExamples = choice' [ moreExamples, result, pure ([], []) ] where - moreExamples :: Parser ([String], [Example]) + moreExamples :: Parser ([Text], [Example]) moreExamples = (,) [] <$> go - result :: Parser ([String], [Example]) + result :: Parser ([Text], [Example]) result = first . (:) <$> nonEmptyLine <*> resultAndMoreExamples - makeExample :: String -> String -> [String] -> Example + makeExample :: Text -> Text -> [Text] -> Example makeExample prefix expression res = - Example (strip expression) result + Example (T.unpack (T.strip expression)) result where - result = map (substituteBlankLine . tryStripPrefix) res + result = map (T.unpack . substituteBlankLine . tryStripPrefix) res - tryStripPrefix xs = fromMaybe xs (stripPrefix prefix xs) + tryStripPrefix xs = fromMaybe xs (T.stripPrefix prefix xs) substituteBlankLine "<BLANKLINE>" = "" substituteBlankLine xs = xs -nonEmptyLine :: Parser String -nonEmptyLine = mfilter (any (not . isSpace)) takeLine +nonEmptyLine :: Parser Text +nonEmptyLine = try (mfilter (T.any (not . isSpace)) takeLine) -takeLine :: Parser String -takeLine = decodeUtf8 <$> takeWhile (/= '\n') <* endOfLine +takeLine :: Parser Text +takeLine = try (takeWhile (Parsec.noneOf "\n") <* endOfLine) endOfLine :: Parser () -endOfLine = void "\n" <|> endOfInput +endOfLine = void "\n" <|> Parsec.eof -- | Property parser. -- -- >>> snd <$> parseOnly property "prop> hello world" -- Right (DocProperty "hello world") property :: Parser (DocH mod a) -property = DocProperty . strip . decodeUtf8 <$> ("prop>" *> takeWhile1 (/= '\n')) +property = DocProperty . T.unpack . T.strip <$> ("prop>" *> takeWhile1 (Parsec.noneOf "\n")) -- | -- Paragraph level codeblock. Anything between the two delimiting \@ is parsed -- for markup. codeblock :: Parser (DocH mod Identifier) codeblock = - DocCodeBlock . parseStringBS . dropSpaces + DocCodeBlock . parseParagraph . dropSpaces <$> ("@" *> skipHorizontalSpace *> "\n" *> block' <* "@") where dropSpaces xs = - let rs = decodeUtf8 xs - in case splitByNl rs of + case splitByNl xs of [] -> xs - ys -> case last ys of - ' ':_ -> case mapM dropSpace ys of - Nothing -> xs - Just zs -> encodeUtf8 $ intercalate "\n" zs + ys -> case T.uncons (last ys) of + Just (' ',_) -> case mapM dropSpace ys of + Nothing -> xs + Just zs -> T.intercalate "\n" zs _ -> xs -- This is necessary because ‘lines’ swallows up a trailing newline -- and we lose information about whether the last line belongs to @ or to -- text which we need to decide whether we actually want to be dropping -- anything at all. - splitByNl = unfoldr (\x -> case x of - '\n':s -> Just (span (/= '\n') s) - _ -> Nothing) - . ('\n' :) + splitByNl = unfoldr (\x -> case T.uncons x of + Just ('\n',x') -> Just (T.span (/= '\n') x') + _ -> Nothing) + . ("\n" <>) - dropSpace "" = Just "" - dropSpace (' ':xs) = Just xs - dropSpace _ = Nothing + dropSpace t = case T.uncons t of + Nothing -> Just "" + Just (' ',t') -> Just t' + _ -> Nothing - block' = scan False p + block' = scan p False where p isNewline c | isNewline && c == '@' = Nothing @@ -745,11 +783,11 @@ codeblock = | otherwise = Just $ c == '\n' hyperlink :: Parser (DocH mod a) -hyperlink = angleBracketLink <|> markdownLink <|> autoUrl +hyperlink = choice' [ angleBracketLink, markdownLink, autoUrl ] angleBracketLink :: Parser (DocH mod a) angleBracketLink = - DocHyperlink . makeLabeled Hyperlink . decodeUtf8 + DocHyperlink . makeLabeled Hyperlink <$> disallowNewline ("<" *> takeUntil ">") markdownLink :: Parser (DocH mod a) @@ -759,7 +797,7 @@ linkParser :: Parser Hyperlink linkParser = flip Hyperlink <$> label <*> (whitespace *> url) where label :: Parser (Maybe String) - label = Just . strip . decode <$> ("[" *> takeUntil "]") + label = Just . decode . T.strip <$> ("[" *> takeUntil "]") whitespace :: Parser () whitespace = skipHorizontalSpace <* optional ("\n" *> skipHorizontalSpace) @@ -770,19 +808,25 @@ linkParser = flip Hyperlink <$> label <*> (whitespace *> url) rejectWhitespace :: MonadPlus m => m String -> m String rejectWhitespace = mfilter (all (not . isSpace)) - decode :: BS.ByteString -> String - decode = removeEscapes . decodeUtf8 + decode :: Text -> String + decode = T.unpack . removeEscapes -- | Looks for URL-like things to automatically hyperlink even if they -- weren't marked as links. autoUrl :: Parser (DocH mod a) autoUrl = mkLink <$> url where - url = mappend <$> ("http://" <|> "https://" <|> "ftp://") <*> takeWhile1 (not . isSpace) - mkLink :: BS.ByteString -> DocH mod a - mkLink s = case unsnoc s of - Just (xs, x) | inClass ",.!?" x -> DocHyperlink (Hyperlink (decodeUtf8 xs) Nothing) `docAppend` DocString [x] - _ -> DocHyperlink (Hyperlink (decodeUtf8 s) Nothing) + url = mappend <$> choice' [ "http://", "https://", "ftp://"] <*> takeWhile1 (Parsec.satisfy (not . isSpace)) + + mkLink :: Text -> DocH mod a + mkLink s = case T.unsnoc s of + Just (xs,x) | x `elem` (",.!?" :: String) -> DocHyperlink (mkHyperlink xs) `docAppend` DocString [x] + _ -> DocHyperlink (mkHyperlink s) + + mkHyperlink :: Text -> Hyperlink + mkHyperlink lnk = Hyperlink (T.unpack lnk) Nothing + + -- | Parses strings between identifier delimiters. Consumes all input that it -- deems to be valid in an identifier. Note that it simply blindly consumes @@ -790,14 +834,14 @@ autoUrl = mkLink <$> url parseValid :: Parser String parseValid = p some where - idChar = satisfyUnicode (\c -> isAlphaNum c || isSymbolChar c || c == '_') + idChar = Parsec.satisfy (\c -> isAlphaNum c || isSymbolChar c || c == '_') p p' = do vs <- p' idChar c <- peekChar' case c of '`' -> return vs - '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> p many') <|> return vs + '\'' -> choice' [ (\x -> vs ++ "'" ++ x) <$> ("'" *> p many), return vs ] _ -> fail "outofvalid" -- | Parses identifiers with help of 'parseValid'. Asks GHC for @@ -809,4 +853,4 @@ identifier = do e <- idDelim return $ DocIdentifier (o, vid, e) where - idDelim = satisfy (\c -> c == '\'' || c == '`') + idDelim = Parsec.satisfy (\c -> c == '\'' || c == '`') diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index ff6101da..585c76bb 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -1,48 +1,24 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-} -module Documentation.Haddock.Parser.Monad ( - module Documentation.Haddock.Parser.Monad -, Attoparsec.isDigit -, Attoparsec.isDigit_w8 -, Attoparsec.isAlpha_iso8859_15 -, Attoparsec.isAlpha_ascii -, Attoparsec.isSpace -, Attoparsec.isSpace_w8 -, Attoparsec.inClass -, Attoparsec.notInClass -, Attoparsec.isEndOfLine -, Attoparsec.isHorizontalSpace -, Attoparsec.choice -, Attoparsec.count -, Attoparsec.option -, Attoparsec.many' -, Attoparsec.many1 -, Attoparsec.many1' -, Attoparsec.manyTill -, Attoparsec.manyTill' -, Attoparsec.sepBy -, Attoparsec.sepBy' -, Attoparsec.sepBy1 -, Attoparsec.sepBy1' -, Attoparsec.skipMany -, Attoparsec.skipMany1 -, Attoparsec.eitherP -) where - -import Control.Applicative -import Control.Monad -import Data.String -import Data.ByteString (ByteString, length) -import qualified Data.ByteString.Lazy as LB -import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec -import qualified Data.Attoparsec.Combinator as Attoparsec -import Control.Monad.Trans.State -import qualified Control.Monad.Trans.Class as Trans -import Data.Word -import Data.Bits -import Data.Tuple - -import Documentation.Haddock.Types (Version) -import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE TypeSynonymInstances #-} + +module Documentation.Haddock.Parser.Monad where + +import qualified Text.Parsec.Char as Parsec +import qualified Text.Parsec as Parsec + +import qualified Data.Text as T +import Data.Text ( Text ) + +import Data.String ( IsString(..) ) +import Data.Bits ( Bits(..) ) +import Data.Char ( ord ) +import Data.List ( foldl' ) + +import Documentation.Haddock.Types ( Version ) newtype ParserState = ParserState { parserStateSince :: Maybe Version @@ -51,120 +27,65 @@ newtype ParserState = ParserState { initialParserState :: ParserState initialParserState = ParserState Nothing -newtype Parser a = Parser (StateT ParserState Attoparsec.Parser a) - deriving (Functor, Applicative, Alternative, Monad, MonadPlus) - -instance (a ~ ByteString) => IsString (Parser a) where - fromString = lift . fromString - -parseOnly :: Parser a -> ByteString -> Either String (ParserState, a) -parseOnly (Parser p) = fmap swap . Attoparsec.parseOnly (runStateT p initialParserState) +setSince :: Version -> Parser () +setSince since = Parsec.modifyState (\st -> st {parserStateSince = Just since}) -lift :: Attoparsec.Parser a -> Parser a -lift = Parser . Trans.lift +type Parser = Parsec.Parsec Text ParserState -setParserState :: ParserState -> Parser () -setParserState = Parser . put +instance (a ~ Text) => IsString (Parser a) where + fromString = fmap T.pack . Parsec.string -setSince :: Version -> Parser () -setSince since = Parser $ modify (\st -> st {parserStateSince = Just since}) - -char :: Char -> Parser Char -char = lift . Attoparsec.char - -char8 :: Char -> Parser Word8 -char8 = lift . Attoparsec.char8 - --- | Peek a unicode character and return the number of bytes that it took up -peekUnicode :: Parser (Char, Int) -peekUnicode = lift $ Attoparsec.lookAhead $ do - - -- attoparsec's take fails on shorter inputs rather than truncate - bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1]) - - let c = head . decodeUtf8 $ bs - n = Data.ByteString.length . encodeUtf8 $ [c] - pure (c, fromIntegral n) - --- | Like 'satisfy', but consuming a unicode character -satisfyUnicode :: (Char -> Bool) -> Parser Char -satisfyUnicode predicate = do - (c,n) <- peekUnicode - if predicate c - then Documentation.Haddock.Parser.Monad.take n *> pure c - else fail "satsifyUnicode" - -anyChar :: Parser Char -anyChar = lift Attoparsec.anyChar - -notChar :: Char -> Parser Char -notChar = lift . Attoparsec.notChar - -satisfy :: (Char -> Bool) -> Parser Char -satisfy = lift . Attoparsec.satisfy +parseOnly :: Parser a -> Text -> Either String (ParserState, a) +parseOnly p t = case Parsec.runParser p' initialParserState "<haddock>" t of + Left e -> Left (show e) + Right (x,s) -> Right (s,x) + where p' = (,) <$> p <*> Parsec.getState +-- | Always succeeds, but returns 'Nothing' if at the end of input. Does not +-- consume input. peekChar :: Parser (Maybe Char) -peekChar = lift Attoparsec.peekChar +peekChar = Parsec.optionMaybe . Parsec.try . Parsec.lookAhead $ Parsec.anyChar +-- | Fails if at the end of input. Does not consume input. peekChar' :: Parser Char -peekChar' = lift Attoparsec.peekChar' - -digit :: Parser Char -digit = lift Attoparsec.digit - -letter_iso8859_15 :: Parser Char -letter_iso8859_15 = lift Attoparsec.letter_iso8859_15 - -letter_ascii :: Parser Char -letter_ascii = lift Attoparsec.letter_ascii - -space :: Parser Char -space = lift Attoparsec.space - -string :: ByteString -> Parser ByteString -string = lift . Attoparsec.string - -stringCI :: ByteString -> Parser ByteString -stringCI = lift . Attoparsec.stringCI - -skipSpace :: Parser () -skipSpace = lift Attoparsec.skipSpace - -skipWhile :: (Char -> Bool) -> Parser () -skipWhile = lift . Attoparsec.skipWhile - -take :: Int -> Parser ByteString -take = lift . Attoparsec.take - -scan :: s -> (s -> Char -> Maybe s) -> Parser ByteString -scan s = lift . Attoparsec.scan s - -takeWhile :: (Char -> Bool) -> Parser ByteString -takeWhile = lift . Attoparsec.takeWhile - -takeWhile1 :: (Char -> Bool) -> Parser ByteString -takeWhile1 = lift . Attoparsec.takeWhile1 - -takeTill :: (Char -> Bool) -> Parser ByteString -takeTill = lift . Attoparsec.takeTill - -takeByteString :: Parser ByteString -takeByteString = lift Attoparsec.takeByteString - -takeLazyByteString :: Parser LB.ByteString -takeLazyByteString = lift Attoparsec.takeLazyByteString - -endOfLine :: Parser () -endOfLine = lift Attoparsec.endOfLine - +peekChar' = Parsec.lookAhead Parsec.anyChar + +-- | Parses the given string. Returns the parsed string. +string :: Text -> Parser Text +string t = Parsec.string (T.unpack t) *> pure t + +-- | Scan the input text, accumulating characters as long as the scanning +-- function returns true. +scan :: (s -> Char -> Maybe s) -- ^ scan function + -> s -- ^ initial state + -> Parser Text +scan f = fmap T.pack . go + where go s1 = do { cOpt <- peekChar + ; case cOpt >>= f s1 of + Nothing -> pure "" + Just s2 -> (:) <$> Parsec.anyChar <*> go s2 + } + +-- | Apply a parser for a character zero or more times and collect the result in +-- a string. +takeWhile :: Parser Char -> Parser Text +takeWhile = fmap T.pack . Parsec.many + +-- | Apply a parser for a character one or more times and collect the result in +-- a string. +takeWhile1 :: Parser Char -> Parser Text +takeWhile1 = fmap T.pack . Parsec.many1 + +-- | Parse a decimal number. decimal :: Integral a => Parser a -decimal = lift Attoparsec.decimal +decimal = foldl' step 0 `fmap` Parsec.many1 Parsec.digit + where step a c = a * 10 + fromIntegral (ord c - 48) +-- | Parse a hexadecimal number. hexadecimal :: (Integral a, Bits a) => Parser a -hexadecimal = lift Attoparsec.hexadecimal - -endOfInput :: Parser () -endOfInput = lift Attoparsec.endOfInput - -atEnd :: Parser Bool -atEnd = lift Attoparsec.atEnd +hexadecimal = foldl' step 0 `fmap` Parsec.many1 Parsec.hexDigit + where + step a c | w >= 48 && w <= 57 = (a `shiftL` 4) .|. fromIntegral (w - 48) + | w >= 97 = (a `shiftL` 4) .|. fromIntegral (w - 87) + | otherwise = (a `shiftL` 4) .|. fromIntegral (w - 55) + where w = ord c diff --git a/haddock-library/src/Documentation/Haddock/Parser/Util.hs b/haddock-library/src/Documentation/Haddock/Parser/Util.hs index ab5e5e9e..ffa91b09 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Util.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Util.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE CPP #-} +{-# LANGUAGE OverloadedStrings #-} -- | -- Module : Documentation.Haddock.Parser.Util -- Copyright : (c) Mateusz Kowalczyk 2013-2014, @@ -11,62 +11,59 @@ -- -- Various utility functions used by the parser. module Documentation.Haddock.Parser.Util ( - unsnoc -, strip -, takeUntil -, removeEscapes -, makeLabeled -, takeHorizontalSpace -, skipHorizontalSpace + takeUntil, + removeEscapes, + makeLabeled, + takeHorizontalSpace, + skipHorizontalSpace, ) where +import qualified Text.Parsec as Parsec + +import qualified Data.Text as T +import Data.Text (Text) + import Control.Applicative import Control.Monad (mfilter) -import Documentation.Haddock.Parser.Monad hiding (isHorizontalSpace) -import Data.ByteString.Char8 (ByteString) -import qualified Data.ByteString.Char8 as BS +import Documentation.Haddock.Parser.Monad import Prelude hiding (takeWhile) -#if MIN_VERSION_bytestring(0,10,2) -import Data.ByteString.Char8 (unsnoc) -#else -unsnoc :: ByteString -> Maybe (ByteString, Char) -unsnoc bs - | BS.null bs = Nothing - | otherwise = Just (BS.init bs, BS.last bs) -#endif +import Data.Char (isSpace) --- | Remove all leading and trailing whitespace -strip :: String -> String -strip = (\f -> f . f) $ dropWhile isSpace . reverse - -isHorizontalSpace :: Char -> Bool -isHorizontalSpace = inClass " \t\f\v\r" +-- | Characters that count as horizontal space +horizontalSpace :: [Char] +horizontalSpace = " \t\f\v\r" +-- | Skip and ignore leading horizontal space skipHorizontalSpace :: Parser () -skipHorizontalSpace = skipWhile isHorizontalSpace +skipHorizontalSpace = Parsec.skipMany (Parsec.oneOf horizontalSpace) -takeHorizontalSpace :: Parser BS.ByteString -takeHorizontalSpace = takeWhile isHorizontalSpace +-- | Take leading horizontal space +takeHorizontalSpace :: Parser Text +takeHorizontalSpace = takeWhile (Parsec.oneOf horizontalSpace) -makeLabeled :: (String -> Maybe String -> a) -> String -> a -makeLabeled f input = case break isSpace $ removeEscapes $ strip input of - (uri, "") -> f uri Nothing - (uri, label) -> f uri (Just $ dropWhile isSpace label) +makeLabeled :: (String -> Maybe String -> a) -> Text -> a +makeLabeled f input = case T.break isSpace $ removeEscapes $ T.strip input of + (uri, "") -> f (T.unpack uri) Nothing + (uri, label) -> f (T.unpack uri) (Just . T.unpack $ T.stripStart label) -- | Remove escapes from given string. -- -- Only do this if you do not process (read: parse) the input any further. -removeEscapes :: String -> String -removeEscapes "" = "" -removeEscapes ('\\':'\\':xs) = '\\' : removeEscapes xs -removeEscapes ('\\':xs) = removeEscapes xs -removeEscapes (x:xs) = x : removeEscapes xs +removeEscapes :: Text -> Text +removeEscapes = T.unfoldr go + where + go :: Text -> Maybe (Char, Text) + go xs = case T.uncons xs of + Just ('\\',ys) -> T.uncons ys + unconsed -> unconsed -takeUntil :: ByteString -> Parser ByteString -takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome +-- | Consume characters from the input up to and including the given pattern. +-- Return everything consumed except for the end pattern itself. +takeUntil :: Text -> Parser Text +takeUntil end_ = T.dropEnd (T.length end_) <$> requireEnd (scan p (False, end)) >>= gotSome where - end = BS.unpack end_ + end = T.unpack end_ p :: (Bool, String) -> Char -> Maybe (Bool, String) p acc c = case acc of @@ -75,9 +72,8 @@ takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome (_, x:xs) | x == c -> Just (False, xs) _ -> Just (c == '\\', end) - dropEnd = BS.reverse . BS.drop (length end) . BS.reverse - requireEnd = mfilter (BS.isSuffixOf end_) + requireEnd = mfilter (T.isSuffixOf end_) gotSome xs - | BS.null xs = fail "didn't get any content" + | T.null xs = fail "didn't get any content" | otherwise = return xs |