diff options
author | Alec Theriault <alec.theriault@gmail.com> | 2018-04-25 11:24:07 -0700 |
---|---|---|
committer | Alexander Biehl <alexbiehl@gmail.com> | 2018-04-25 11:24:07 -0700 |
commit | 79c7159101c03bbbc7350e07963896ca2bb97c02 (patch) | |
tree | c754c425ed9d4ff8755dbe67589fa3c9dbbce10c /haddock-library/src/Documentation/Haddock/Parser | |
parent | 979c7338cfcdc59f0b0dda562a53558c416cc362 (diff) |
Replace 'attoparsec' with 'parsec' (#799)
* Replace attoparsec with parsec and start fixing failed parses
* Make tests pass
* Fix encoding issues
The Haddock parser no longer needs to worry about bytestrings. All
the internal parsing work in haddock-library happens over 'Text'.
* Remove attoparsec vendor
* Fix stuff broken in 'attoparsec' -> 'parsec'
* hyperlinks
* codeblocks
* examples
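Pretty much all issues are due to attoparsec's backtracking failure
behaviour vs. parsec's non-backtracking failure behaviour (in parsec, an
alternative that fails after consuming input is not retried unless it is
wrapped in 'try').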
* Fix small TODOs
* Missing quote + Haddocks
* Better handle spaces before/after paragraphs
* Address review comments
Diffstat (limited to 'haddock-library/src/Documentation/Haddock/Parser')
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Parser/Monad.hs | 225 | ||||
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Parser/Util.hs | 82 |
2 files changed, 112 insertions, 195 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index ff6101da..585c76bb 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -1,48 +1,24 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-} -module Documentation.Haddock.Parser.Monad ( - module Documentation.Haddock.Parser.Monad -, Attoparsec.isDigit -, Attoparsec.isDigit_w8 -, Attoparsec.isAlpha_iso8859_15 -, Attoparsec.isAlpha_ascii -, Attoparsec.isSpace -, Attoparsec.isSpace_w8 -, Attoparsec.inClass -, Attoparsec.notInClass -, Attoparsec.isEndOfLine -, Attoparsec.isHorizontalSpace -, Attoparsec.choice -, Attoparsec.count -, Attoparsec.option -, Attoparsec.many' -, Attoparsec.many1 -, Attoparsec.many1' -, Attoparsec.manyTill -, Attoparsec.manyTill' -, Attoparsec.sepBy -, Attoparsec.sepBy' -, Attoparsec.sepBy1 -, Attoparsec.sepBy1' -, Attoparsec.skipMany -, Attoparsec.skipMany1 -, Attoparsec.eitherP -) where - -import Control.Applicative -import Control.Monad -import Data.String -import Data.ByteString (ByteString, length) -import qualified Data.ByteString.Lazy as LB -import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec -import qualified Data.Attoparsec.Combinator as Attoparsec -import Control.Monad.Trans.State -import qualified Control.Monad.Trans.Class as Trans -import Data.Word -import Data.Bits -import Data.Tuple - -import Documentation.Haddock.Types (Version) -import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE TypeSynonymInstances #-} + +module Documentation.Haddock.Parser.Monad where + +import qualified Text.Parsec.Char as Parsec +import qualified Text.Parsec as Parsec + +import qualified Data.Text as T +import Data.Text ( Text 
) + +import Data.String ( IsString(..) ) +import Data.Bits ( Bits(..) ) +import Data.Char ( ord ) +import Data.List ( foldl' ) + +import Documentation.Haddock.Types ( Version ) newtype ParserState = ParserState { parserStateSince :: Maybe Version @@ -51,120 +27,65 @@ newtype ParserState = ParserState { initialParserState :: ParserState initialParserState = ParserState Nothing -newtype Parser a = Parser (StateT ParserState Attoparsec.Parser a) - deriving (Functor, Applicative, Alternative, Monad, MonadPlus) - -instance (a ~ ByteString) => IsString (Parser a) where - fromString = lift . fromString - -parseOnly :: Parser a -> ByteString -> Either String (ParserState, a) -parseOnly (Parser p) = fmap swap . Attoparsec.parseOnly (runStateT p initialParserState) +setSince :: Version -> Parser () +setSince since = Parsec.modifyState (\st -> st {parserStateSince = Just since}) -lift :: Attoparsec.Parser a -> Parser a -lift = Parser . Trans.lift +type Parser = Parsec.Parsec Text ParserState -setParserState :: ParserState -> Parser () -setParserState = Parser . put +instance (a ~ Text) => IsString (Parser a) where + fromString = fmap T.pack . Parsec.string -setSince :: Version -> Parser () -setSince since = Parser $ modify (\st -> st {parserStateSince = Just since}) - -char :: Char -> Parser Char -char = lift . Attoparsec.char - -char8 :: Char -> Parser Word8 -char8 = lift . Attoparsec.char8 - --- | Peek a unicode character and return the number of bytes that it took up -peekUnicode :: Parser (Char, Int) -peekUnicode = lift $ Attoparsec.lookAhead $ do - - -- attoparsec's take fails on shorter inputs rather than truncate - bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1]) - - let c = head . decodeUtf8 $ bs - n = Data.ByteString.length . 
encodeUtf8 $ [c] - pure (c, fromIntegral n) - --- | Like 'satisfy', but consuming a unicode character -satisfyUnicode :: (Char -> Bool) -> Parser Char -satisfyUnicode predicate = do - (c,n) <- peekUnicode - if predicate c - then Documentation.Haddock.Parser.Monad.take n *> pure c - else fail "satsifyUnicode" - -anyChar :: Parser Char -anyChar = lift Attoparsec.anyChar - -notChar :: Char -> Parser Char -notChar = lift . Attoparsec.notChar - -satisfy :: (Char -> Bool) -> Parser Char -satisfy = lift . Attoparsec.satisfy +parseOnly :: Parser a -> Text -> Either String (ParserState, a) +parseOnly p t = case Parsec.runParser p' initialParserState "<haddock>" t of + Left e -> Left (show e) + Right (x,s) -> Right (s,x) + where p' = (,) <$> p <*> Parsec.getState +-- | Always succeeds, but returns 'Nothing' if at the end of input. Does not +-- consume input. peekChar :: Parser (Maybe Char) -peekChar = lift Attoparsec.peekChar +peekChar = Parsec.optionMaybe . Parsec.try . Parsec.lookAhead $ Parsec.anyChar +-- | Fails if at the end of input. Does not consume input. peekChar' :: Parser Char -peekChar' = lift Attoparsec.peekChar' - -digit :: Parser Char -digit = lift Attoparsec.digit - -letter_iso8859_15 :: Parser Char -letter_iso8859_15 = lift Attoparsec.letter_iso8859_15 - -letter_ascii :: Parser Char -letter_ascii = lift Attoparsec.letter_ascii - -space :: Parser Char -space = lift Attoparsec.space - -string :: ByteString -> Parser ByteString -string = lift . Attoparsec.string - -stringCI :: ByteString -> Parser ByteString -stringCI = lift . Attoparsec.stringCI - -skipSpace :: Parser () -skipSpace = lift Attoparsec.skipSpace - -skipWhile :: (Char -> Bool) -> Parser () -skipWhile = lift . Attoparsec.skipWhile - -take :: Int -> Parser ByteString -take = lift . Attoparsec.take - -scan :: s -> (s -> Char -> Maybe s) -> Parser ByteString -scan s = lift . Attoparsec.scan s - -takeWhile :: (Char -> Bool) -> Parser ByteString -takeWhile = lift . 
Attoparsec.takeWhile - -takeWhile1 :: (Char -> Bool) -> Parser ByteString -takeWhile1 = lift . Attoparsec.takeWhile1 - -takeTill :: (Char -> Bool) -> Parser ByteString -takeTill = lift . Attoparsec.takeTill - -takeByteString :: Parser ByteString -takeByteString = lift Attoparsec.takeByteString - -takeLazyByteString :: Parser LB.ByteString -takeLazyByteString = lift Attoparsec.takeLazyByteString - -endOfLine :: Parser () -endOfLine = lift Attoparsec.endOfLine - +peekChar' = Parsec.lookAhead Parsec.anyChar + +-- | Parses the given string. Returns the parsed string. +string :: Text -> Parser Text +string t = Parsec.string (T.unpack t) *> pure t + +-- | Scan the input text, accumulating characters as long as the scanning +-- function returns true. +scan :: (s -> Char -> Maybe s) -- ^ scan function + -> s -- ^ initial state + -> Parser Text +scan f = fmap T.pack . go + where go s1 = do { cOpt <- peekChar + ; case cOpt >>= f s1 of + Nothing -> pure "" + Just s2 -> (:) <$> Parsec.anyChar <*> go s2 + } + +-- | Apply a parser for a character zero or more times and collect the result in +-- a string. +takeWhile :: Parser Char -> Parser Text +takeWhile = fmap T.pack . Parsec.many + +-- | Apply a parser for a character one or more times and collect the result in +-- a string. +takeWhile1 :: Parser Char -> Parser Text +takeWhile1 = fmap T.pack . Parsec.many1 + +-- | Parse a decimal number. decimal :: Integral a => Parser a -decimal = lift Attoparsec.decimal +decimal = foldl' step 0 `fmap` Parsec.many1 Parsec.digit + where step a c = a * 10 + fromIntegral (ord c - 48) +-- | Parse a hexadecimal number. hexadecimal :: (Integral a, Bits a) => Parser a -hexadecimal = lift Attoparsec.hexadecimal - -endOfInput :: Parser () -endOfInput = lift Attoparsec.endOfInput - -atEnd :: Parser Bool -atEnd = lift Attoparsec.atEnd +hexadecimal = foldl' step 0 `fmap` Parsec.many1 Parsec.hexDigit + where + step a c | w >= 48 && w <= 57 = (a `shiftL` 4) .|. 
fromIntegral (w - 48) + | w >= 97 = (a `shiftL` 4) .|. fromIntegral (w - 87) + | otherwise = (a `shiftL` 4) .|. fromIntegral (w - 55) + where w = ord c diff --git a/haddock-library/src/Documentation/Haddock/Parser/Util.hs b/haddock-library/src/Documentation/Haddock/Parser/Util.hs index ab5e5e9e..ffa91b09 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Util.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Util.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE CPP #-} +{-# LANGUAGE OverloadedStrings #-} -- | -- Module : Documentation.Haddock.Parser.Util -- Copyright : (c) Mateusz Kowalczyk 2013-2014, @@ -11,62 +11,59 @@ -- -- Various utility functions used by the parser. module Documentation.Haddock.Parser.Util ( - unsnoc -, strip -, takeUntil -, removeEscapes -, makeLabeled -, takeHorizontalSpace -, skipHorizontalSpace + takeUntil, + removeEscapes, + makeLabeled, + takeHorizontalSpace, + skipHorizontalSpace, ) where +import qualified Text.Parsec as Parsec + +import qualified Data.Text as T +import Data.Text (Text) + import Control.Applicative import Control.Monad (mfilter) -import Documentation.Haddock.Parser.Monad hiding (isHorizontalSpace) -import Data.ByteString.Char8 (ByteString) -import qualified Data.ByteString.Char8 as BS +import Documentation.Haddock.Parser.Monad import Prelude hiding (takeWhile) -#if MIN_VERSION_bytestring(0,10,2) -import Data.ByteString.Char8 (unsnoc) -#else -unsnoc :: ByteString -> Maybe (ByteString, Char) -unsnoc bs - | BS.null bs = Nothing - | otherwise = Just (BS.init bs, BS.last bs) -#endif +import Data.Char (isSpace) --- | Remove all leading and trailing whitespace -strip :: String -> String -strip = (\f -> f . f) $ dropWhile isSpace . 
reverse - -isHorizontalSpace :: Char -> Bool -isHorizontalSpace = inClass " \t\f\v\r" +-- | Characters that count as horizontal space +horizontalSpace :: [Char] +horizontalSpace = " \t\f\v\r" +-- | Skip and ignore leading horizontal space skipHorizontalSpace :: Parser () -skipHorizontalSpace = skipWhile isHorizontalSpace +skipHorizontalSpace = Parsec.skipMany (Parsec.oneOf horizontalSpace) -takeHorizontalSpace :: Parser BS.ByteString -takeHorizontalSpace = takeWhile isHorizontalSpace +-- | Take leading horizontal space +takeHorizontalSpace :: Parser Text +takeHorizontalSpace = takeWhile (Parsec.oneOf horizontalSpace) -makeLabeled :: (String -> Maybe String -> a) -> String -> a -makeLabeled f input = case break isSpace $ removeEscapes $ strip input of - (uri, "") -> f uri Nothing - (uri, label) -> f uri (Just $ dropWhile isSpace label) +makeLabeled :: (String -> Maybe String -> a) -> Text -> a +makeLabeled f input = case T.break isSpace $ removeEscapes $ T.strip input of + (uri, "") -> f (T.unpack uri) Nothing + (uri, label) -> f (T.unpack uri) (Just . T.unpack $ T.stripStart label) -- | Remove escapes from given string. -- -- Only do this if you do not process (read: parse) the input any further. -removeEscapes :: String -> String -removeEscapes "" = "" -removeEscapes ('\\':'\\':xs) = '\\' : removeEscapes xs -removeEscapes ('\\':xs) = removeEscapes xs -removeEscapes (x:xs) = x : removeEscapes xs +removeEscapes :: Text -> Text +removeEscapes = T.unfoldr go + where + go :: Text -> Maybe (Char, Text) + go xs = case T.uncons xs of + Just ('\\',ys) -> T.uncons ys + unconsed -> unconsed -takeUntil :: ByteString -> Parser ByteString -takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome +-- | Consume characters from the input up to and including the given pattern. +-- Return everything consumed except for the end pattern itself. 
+takeUntil :: Text -> Parser Text +takeUntil end_ = T.dropEnd (T.length end_) <$> requireEnd (scan p (False, end)) >>= gotSome where - end = BS.unpack end_ + end = T.unpack end_ p :: (Bool, String) -> Char -> Maybe (Bool, String) p acc c = case acc of @@ -75,9 +72,8 @@ takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome (_, x:xs) | x == c -> Just (False, xs) _ -> Just (c == '\\', end) - dropEnd = BS.reverse . BS.drop (length end) . BS.reverse - requireEnd = mfilter (BS.isSuffixOf end_) + requireEnd = mfilter (T.isSuffixOf end_) gotSome xs - | BS.null xs = fail "didn't get any content" + | T.null xs = fail "didn't get any content" | otherwise = return xs |