Faster 'Text' driven parser combinators

Perf only change: * use 'getParserState'/'setParserState' to make 'Text'-optimized parser combinators * minimize uses of 'Data.Text.{pack,unpack,cons,snoc}'
author: Alec Theriault <alec.theriault@gmail.com> 2018-08-29 04:52:42 -0700
committer: Alec Theriault <alec.theriault@gmail.com> 2018-11-05 15:54:22 -0800
commit: 82b8f491e18d707f67857bcb170b2147fa275ccc (patch)
tree: bea7fedeab9a8e6bfd5e811369f1b76781621309 /haddock-library
parent: d9c87e47afda04fd2bda380b81afca8e337a517c (diff)
3 files changed, 83 insertions, 35 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs
index d79da40b..46b7ad3e 100644
--- a/haddock-library/src/Documentation/Haddock/Parser.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser.hs
@@ -360,32 +360,34 @@ table = do
     parseFirstRow :: Parser Text
     parseFirstRow = do
         skipHorizontalSpace
-        -- upper-left corner is +
-        c <- Parsec.char '+'
-        cs <- some (Parsec.char '-' <|> Parsec.char '+')
+        cs <- takeWhile (\c -> c == '-' || c == '+')
 
-        -- upper right corner is + too
-        guard (last cs == '+')
+        -- upper-left and upper-right corners are `+`
+        guard (T.length cs >= 2 &&
+               T.head cs == '+' &&
+               T.last cs == '+')
 
         -- trailing space
         skipHorizontalSpace
         _ <- Parsec.newline
 
-        return (T.cons c $ T.pack cs)
+        return cs
 
     parseRestRows :: Int -> Parser Text
     parseRestRows l = do
         skipHorizontalSpace
+        bs <- scan predicate l
 
-        c <- Parsec.char '|' <|> Parsec.char '+'
-        bs <- scan predicate (l - 2)
-        c2 <- Parsec.char '|' <|> Parsec.char '+'
+        -- Left and right edges are `|` or `+`
+        guard (T.length bs >= 2 &&
+               (T.head bs == '|' || T.head bs == '+') &&
+               (T.last bs == '|' || T.last bs == '+'))
 
         -- trailing space
         skipHorizontalSpace
         _ <- Parsec.newline
 
-        return (T.cons c (T.snoc bs c2))
+        return bs
       where
         predicate n c
             | n <= 0    = Nothing
@@ -662,7 +664,7 @@ nonSpace xs
 --  Doesn't discard the trailing newline.
 takeNonEmptyLine :: Parser Text
 takeNonEmptyLine = do
-    l <- takeWhile1 (Parsec.noneOf "\n") >>= nonSpace
+    l <- takeWhile1 (/= '\n') >>= nonSpace
     _ <- "\n"
     pure (l <> "\n")
 
@@ -732,7 +734,7 @@ nonEmptyLine :: Parser Text
 nonEmptyLine = try (mfilter (T.any (not . isSpace)) takeLine)
 
 takeLine :: Parser Text
-takeLine = try (takeWhile (Parsec.noneOf "\n") <* endOfLine)
+takeLine = try (takeWhile (/= '\n') <* endOfLine)
 
 endOfLine :: Parser ()
 endOfLine = void "\n" <|> Parsec.eof 
@@ -742,7 +744,7 @@ endOfLine = void "\n" <|> Parsec.eof
 -- >>> snd <$> parseOnly property "prop> hello world"
 -- Right (DocProperty "hello world")
 property :: Parser (DocH mod a)
-property = DocProperty . T.unpack . T.strip <$> ("prop>" *> takeWhile1 (Parsec.noneOf "\n"))
+property = DocProperty . T.unpack . T.strip <$> ("prop>" *> takeWhile1 (/= '\n'))
 
 -- |
 -- Paragraph level codeblock. Anything between the two delimiting \@ is parsed
@@ -816,7 +818,7 @@ linkParser = flip Hyperlink <$> label <*> (whitespace *> url)
 autoUrl :: Parser (DocH mod a)
 autoUrl = mkLink <$> url
   where
-    url = mappend <$> choice' [ "http://", "https://", "ftp://"] <*> takeWhile1 (Parsec.satisfy (not . isSpace))
+    url = mappend <$> choice' [ "http://", "https://", "ftp://"] <*> takeWhile1 (not . isSpace)
     
     mkLink :: Text -> DocH mod a
     mkLink s = case T.unsnoc s of
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
index a5664aa8..8f5bd217 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
@@ -9,10 +9,15 @@ module Documentation.Haddock.Parser.Monad where
 
 import qualified Text.Parsec.Char as Parsec
 import qualified Text.Parsec as Parsec
+import           Text.Parsec.Pos             ( updatePosChar )
+import           Text.Parsec                 ( State(..)
+                                             , getParserState, setParserState )
 
 import qualified Data.Text as T
 import           Data.Text                   ( Text )
 
+import           Control.Monad               ( mfilter )
+import           Data.Functor                ( ($>) )
 import           Data.String                 ( IsString(..) )
 import           Data.Bits                   ( Bits(..) )
 import           Data.Char                   ( ord )
@@ -20,7 +25,11 @@ import           Data.List                   ( foldl' )
 import           Control.Applicative as App
 
 import           Documentation.Haddock.Types ( Version )
+import           Prelude hiding (takeWhile)
 
+-- | The only bit of information we really care about truding along with us
+-- through parsing is the version attached to a @\@since@ annotation - if
+-- the doc even contained one.
 newtype ParserState = ParserState {
   parserStateSince :: Maybe Version
 } deriving (Eq, Show)
@@ -29,7 +38,7 @@ initialParserState :: ParserState
 initialParserState = ParserState Nothing
 
 setSince :: Version -> Parser ()
-setSince since = Parsec.modifyState (\st -> st {parserStateSince = Just since})
+setSince since = Parsec.modifyState (\st -> st{ parserStateSince = Just since })
 
 type Parser = Parsec.Parsec Text ParserState
 
@@ -44,38 +53,75 @@ parseOnly p t = case Parsec.runParser p' initialParserState "<haddock>" t of
 
 -- | Always succeeds, but returns 'Nothing' if at the end of input. Does not
 -- consume input.
+--
+-- Equivalent to @Parsec.optionMaybe . Parsec.lookAhead $ Parsec.anyChar@, but
+-- more efficient.
 peekChar :: Parser (Maybe Char)
-peekChar = Parsec.optionMaybe . Parsec.try . Parsec.lookAhead $ Parsec.anyChar
+peekChar = headOpt . stateInput <$> getParserState
+  where headOpt t | T.null t = Nothing
+                  | otherwise = Just (T.head t)
+{-# INLINE peekChar #-}
 
 -- | Fails if at the end of input. Does not consume input.
+--
+-- Equivalent to @Parsec.lookAhead Parsec.anyChar@, but more efficient.
 peekChar' :: Parser Char
-peekChar' = Parsec.lookAhead Parsec.anyChar 
+peekChar' = headFail . stateInput =<< getParserState
+  where headFail t | T.null t = Parsec.parserFail "peekChar': reached EOF"
+                   | otherwise = App.pure (T.head t)
+{-# INLINE peekChar' #-}
 
 -- | Parses the given string. Returns the parsed string.
+--
+-- Equivalent to @Parsec.string (T.unpack t) $> t@, but more efficient.
 string :: Text -> Parser Text
-string t = Parsec.string (T.unpack t) *> App.pure t
+string t = do
+  s@State{ stateInput = inp, statePos = pos } <- getParserState
+  case T.stripPrefix t inp of
+    Nothing -> Parsec.parserFail "string: Failed to match the input string"
+    Just inp' ->
+      let pos' = T.foldl updatePosChar pos t
+          s' = s{ stateInput = inp', statePos = pos' }
+      in setParserState s' $> t
+
+-- | Keep matching characters as long as the predicate function holds (and
+-- return them).
+--
+-- Equivalent to @fmap T.pack . Parsec.many@, but more efficient.
+takeWhile :: (Char -> Bool) -> Parser Text
+takeWhile f = do
+  s@State{ stateInput = inp, statePos = pos } <- getParserState
+  let (t, inp') = T.span f inp
+      pos' = T.foldl updatePosChar pos t
+      s' = s{ stateInput = inp', statePos = pos' }
+  setParserState s' $> t
+
+
+-- | Like 'takeWhile', but fails if no characters matched.
+--
+-- Equivalent to @fmap T.pack . Parsec.many1@, but more efficient.
+takeWhile1 :: (Char -> Bool) -> Parser Text
+takeWhile1 = mfilter (not . T.null) . takeWhile
 
 -- | Scan the input text, accumulating characters as long as the scanning
 -- function returns true.
 scan :: (s -> Char -> Maybe s) -- ^ scan function
      -> s                      -- ^ initial state
      -> Parser Text 
-scan f = fmap T.pack . go
-  where go s1 = do { cOpt <- peekChar
-                   ; case cOpt >>= f s1 of
-                       Nothing -> pure ""
-                       Just s2 -> (:) <$> Parsec.anyChar <*> go s2
-                   }
-
--- | Apply a parser for a character zero or more times and collect the result in
--- a string.
-takeWhile :: Parser Char -> Parser Text
-takeWhile = fmap T.pack . Parsec.many
-
--- | Apply a parser for a character one or more times and collect the result in
--- a string.
-takeWhile1 :: Parser Char -> Parser Text
-takeWhile1 =  fmap T.pack . Parsec.many1
+scan f st = do
+  s@State{ stateInput = inp, statePos = pos } <- getParserState
+  go inp st pos 0 $ \inp' pos' n ->
+    let s' = s{ Parsec.stateInput = inp', Parsec.statePos = pos' }
+    in setParserState s' $> T.take n inp
+  where
+    go inp s !pos !n cont
+      = case T.uncons inp of
+          Nothing -> cont inp pos n        -- ran out of input
+          Just (c, inp') ->
+             case f s c of
+               Nothing -> cont inp pos n   -- scan function failed
+               Just s' -> go inp' s' (updatePosChar pos c) (n+1) cont
+
 
 -- | Parse a decimal number.
 decimal :: Integral a => Parser a
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Util.hs b/haddock-library/src/Documentation/Haddock/Parser/Util.hs
index ffa91b09..98570c22 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Util.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Util.hs
@@ -40,7 +40,7 @@ skipHorizontalSpace = Parsec.skipMany (Parsec.oneOf horizontalSpace)
 
 -- | Take leading horizontal space
 takeHorizontalSpace :: Parser Text 
-takeHorizontalSpace = takeWhile (Parsec.oneOf horizontalSpace)
+takeHorizontalSpace = takeWhile (`elem` horizontalSpace)
 
 makeLabeled :: (String -> Maybe String -> a) -> Text -> a
 makeLabeled f input = case T.break isSpace $ removeEscapes $ T.strip input of
author	Alec Theriault <alec.theriault@gmail.com>	2018-08-29 04:52:42 -0700
committer	Alec Theriault <alec.theriault@gmail.com>	2018-11-05 15:54:22 -0800
commit	82b8f491e18d707f67857bcb170b2147fa275ccc (patch)
tree	bea7fedeab9a8e6bfd5e811369f1b76781621309 /haddock-library
parent	d9c87e47afda04fd2bda380b81afca8e337a517c (diff)