From 82b8f491e18d707f67857bcb170b2147fa275ccc Mon Sep 17 00:00:00 2001 From: Alec Theriault Date: Wed, 29 Aug 2018 04:52:42 -0700 Subject: Faster 'Text' driven parser combinators Perf only change: * use 'getParserState'/'setParserState' to make 'Text'-optimized parser combinators * minimize uses of 'Data.Text.{pack,unpack,cons,snoc}' --- .../src/Documentation/Haddock/Parser.hs | 30 ++++---- .../src/Documentation/Haddock/Parser/Monad.hs | 86 +++++++++++++++++----- .../src/Documentation/Haddock/Parser/Util.hs | 2 +- 3 files changed, 83 insertions(+), 35 deletions(-) diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs index d79da40b..46b7ad3e 100644 --- a/haddock-library/src/Documentation/Haddock/Parser.hs +++ b/haddock-library/src/Documentation/Haddock/Parser.hs @@ -360,32 +360,34 @@ table = do parseFirstRow :: Parser Text parseFirstRow = do skipHorizontalSpace - -- upper-left corner is + - c <- Parsec.char '+' - cs <- some (Parsec.char '-' <|> Parsec.char '+') + cs <- takeWhile (\c -> c == '-' || c == '+') - -- upper right corner is + too - guard (last cs == '+') + -- upper-left and upper-right corners are `+` + guard (T.length cs >= 2 && + T.head cs == '+' && + T.last cs == '+') -- trailing space skipHorizontalSpace _ <- Parsec.newline - return (T.cons c $ T.pack cs) + return cs parseRestRows :: Int -> Parser Text parseRestRows l = do skipHorizontalSpace + bs <- scan predicate l - c <- Parsec.char '|' <|> Parsec.char '+' - bs <- scan predicate (l - 2) - c2 <- Parsec.char '|' <|> Parsec.char '+' + -- Left and right edges are `|` or `+` + guard (T.length bs >= 2 && + (T.head bs == '|' || T.head bs == '+') && + (T.last bs == '|' || T.last bs == '+')) -- trailing space skipHorizontalSpace _ <- Parsec.newline - return (T.cons c (T.snoc bs c2)) + return bs where predicate n c | n <= 0 = Nothing @@ -662,7 +664,7 @@ nonSpace xs -- Doesn't discard the trailing newline. takeNonEmptyLine :: Parser Text takeNonEmptyLine = do - l <- takeWhile1 (Parsec.noneOf "\n") >>= nonSpace + l <- takeWhile1 (/= '\n') >>= nonSpace _ <- "\n" pure (l <> "\n") @@ -732,7 +734,7 @@ nonEmptyLine :: Parser Text nonEmptyLine = try (mfilter (T.any (not . isSpace)) takeLine) takeLine :: Parser Text -takeLine = try (takeWhile (Parsec.noneOf "\n") <* endOfLine) +takeLine = try (takeWhile (/= '\n') <* endOfLine) endOfLine :: Parser () endOfLine = void "\n" <|> Parsec.eof @@ -742,7 +744,7 @@ endOfLine = void "\n" <|> Parsec.eof -- >>> snd <$> parseOnly property "prop> hello world" -- Right (DocProperty "hello world") property :: Parser (DocH mod a) -property = DocProperty . T.unpack . T.strip <$> ("prop>" *> takeWhile1 (Parsec.noneOf "\n")) +property = DocProperty . T.unpack . T.strip <$> ("prop>" *> takeWhile1 (/= '\n')) -- | -- Paragraph level codeblock. Anything between the two delimiting \@ is parsed @@ -816,7 +818,7 @@ linkParser = flip Hyperlink <$> label <*> (whitespace *> url) autoUrl :: Parser (DocH mod a) autoUrl = mkLink <$> url where - url = mappend <$> choice' [ "http://", "https://", "ftp://"] <*> takeWhile1 (Parsec.satisfy (not . isSpace)) + url = mappend <$> choice' [ "http://", "https://", "ftp://"] <*> takeWhile1 (not . isSpace) mkLink :: Text -> DocH mod a mkLink s = case T.unsnoc s of diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index a5664aa8..8f5bd217 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -9,10 +9,15 @@ module Documentation.Haddock.Parser.Monad where import qualified Text.Parsec.Char as Parsec import qualified Text.Parsec as Parsec +import Text.Parsec.Pos ( updatePosChar ) +import Text.Parsec ( State(..) + , getParserState, setParserState ) import qualified Data.Text as T import Data.Text ( Text ) +import Control.Monad ( mfilter ) +import Data.Functor ( ($>) ) import Data.String ( IsString(..) ) import Data.Bits ( Bits(..) ) import Data.Char ( ord ) @@ -20,7 +25,11 @@ import Data.List ( foldl' ) import Control.Applicative as App import Documentation.Haddock.Types ( Version ) +import Prelude hiding (takeWhile) +-- | The only bit of information we really care about truding along with us +-- through parsing is the version attached to a @\@since@ annotation - if +-- the doc even contained one. newtype ParserState = ParserState { parserStateSince :: Maybe Version } deriving (Eq, Show) @@ -29,7 +38,7 @@ initialParserState :: ParserState initialParserState = ParserState Nothing setSince :: Version -> Parser () -setSince since = Parsec.modifyState (\st -> st {parserStateSince = Just since}) +setSince since = Parsec.modifyState (\st -> st{ parserStateSince = Just since }) type Parser = Parsec.Parsec Text ParserState @@ -44,38 +53,75 @@ parseOnly p t = case Parsec.runParser p' initialParserState "" t of -- | Always succeeds, but returns 'Nothing' if at the end of input. Does not -- consume input. +-- +-- Equivalent to @Parsec.optionMaybe . Parsec.lookAhead $ Parsec.anyChar@, but +-- more efficient. peekChar :: Parser (Maybe Char) -peekChar = Parsec.optionMaybe . Parsec.try . Parsec.lookAhead $ Parsec.anyChar +peekChar = headOpt . stateInput <$> getParserState + where headOpt t | T.null t = Nothing + | otherwise = Just (T.head t) +{-# INLINE peekChar #-} -- | Fails if at the end of input. Does not consume input. +-- +-- Equivalent to @Parsec.lookAhead Parsec.anyChar@, but more efficient. peekChar' :: Parser Char -peekChar' = Parsec.lookAhead Parsec.anyChar +peekChar' = headFail . stateInput =<< getParserState + where headFail t | T.null t = Parsec.parserFail "peekChar': reached EOF" + | otherwise = App.pure (T.head t) +{-# INLINE peekChar' #-} -- | Parses the given string. Returns the parsed string. +-- +-- Equivalent to @Parsec.string (T.unpack t) $> t@, but more efficient. string :: Text -> Parser Text -string t = Parsec.string (T.unpack t) *> App.pure t +string t = do + s@State{ stateInput = inp, statePos = pos } <- getParserState + case T.stripPrefix t inp of + Nothing -> Parsec.parserFail "string: Failed to match the input string" + Just inp' -> + let pos' = T.foldl updatePosChar pos t + s' = s{ stateInput = inp', statePos = pos' } + in setParserState s' $> t + +-- | Keep matching characters as long as the predicate function holds (and +-- return them). +-- +-- Equivalent to @fmap T.pack . Parsec.many@, but more efficient. +takeWhile :: (Char -> Bool) -> Parser Text +takeWhile f = do + s@State{ stateInput = inp, statePos = pos } <- getParserState + let (t, inp') = T.span f inp + pos' = T.foldl updatePosChar pos t + s' = s{ stateInput = inp', statePos = pos' } + setParserState s' $> t + + +-- | Like 'takeWhile', but fails if no characters matched. +-- +-- Equivalent to @fmap T.pack . Parsec.many1@, but more efficient. +takeWhile1 :: (Char -> Bool) -> Parser Text +takeWhile1 = mfilter (not . T.null) . takeWhile -- | Scan the input text, accumulating characters as long as the scanning -- function returns true. scan :: (s -> Char -> Maybe s) -- ^ scan function -> s -- ^ initial state -> Parser Text -scan f = fmap T.pack . go - where go s1 = do { cOpt <- peekChar - ; case cOpt >>= f s1 of - Nothing -> pure "" - Just s2 -> (:) <$> Parsec.anyChar <*> go s2 - } - --- | Apply a parser for a character zero or more times and collect the result in --- a string. -takeWhile :: Parser Char -> Parser Text -takeWhile = fmap T.pack . Parsec.many - --- | Apply a parser for a character one or more times and collect the result in --- a string. -takeWhile1 :: Parser Char -> Parser Text -takeWhile1 = fmap T.pack . Parsec.many1 +scan f st = do + s@State{ stateInput = inp, statePos = pos } <- getParserState + go inp st pos 0 $ \inp' pos' n -> + let s' = s{ Parsec.stateInput = inp', Parsec.statePos = pos' } + in setParserState s' $> T.take n inp + where + go inp s !pos !n cont + = case T.uncons inp of + Nothing -> cont inp pos n -- ran out of input + Just (c, inp') -> + case f s c of + Nothing -> cont inp pos n -- scan function failed + Just s' -> go inp' s' (updatePosChar pos c) (n+1) cont + -- | Parse a decimal number. decimal :: Integral a => Parser a diff --git a/haddock-library/src/Documentation/Haddock/Parser/Util.hs b/haddock-library/src/Documentation/Haddock/Parser/Util.hs index ffa91b09..98570c22 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Util.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Util.hs @@ -40,7 +40,7 @@ skipHorizontalSpace = Parsec.skipMany (Parsec.oneOf horizontalSpace) -- | Take leading horizontal space takeHorizontalSpace :: Parser Text -takeHorizontalSpace = takeWhile (Parsec.oneOf horizontalSpace) +takeHorizontalSpace = takeWhile (`elem` horizontalSpace) makeLabeled :: (String -> Maybe String -> a) -> Text -> a makeLabeled f input = case T.break isSpace $ removeEscapes $ T.strip input of -- cgit v1.2.3