aboutsummaryrefslogtreecommitdiff
path: root/haddock-library/src/Documentation/Haddock/Parser
diff options
context:
space:
mode:
Diffstat (limited to 'haddock-library/src/Documentation/Haddock/Parser')
-rw-r--r--haddock-library/src/Documentation/Haddock/Parser/Identifier.hs186
-rw-r--r--haddock-library/src/Documentation/Haddock/Parser/Monad.hs97
-rw-r--r--haddock-library/src/Documentation/Haddock/Parser/Util.hs2
3 files changed, 264 insertions, 21 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs b/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs
new file mode 100644
index 00000000..7bc98b62
--- /dev/null
+++ b/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs
@@ -0,0 +1,186 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE ViewPatterns #-}
+-- |
+-- Module : Documentation.Haddock.Parser.Identifier
+-- Copyright : (c) Alec Theriault 2019,
+-- License : BSD-like
+--
+-- Maintainer : haddock@projects.haskell.org
+-- Stability : experimental
+-- Portability : portable
+--
+-- Functionality for parsing identifiers and operators
+
+module Documentation.Haddock.Parser.Identifier (
+ Identifier(..),
+ parseValid,
+) where
+
+import Documentation.Haddock.Types ( Namespace(..) )
+import Documentation.Haddock.Parser.Monad
+import qualified Text.Parsec as Parsec
+import Text.Parsec.Pos ( updatePosChar )
+import Text.Parsec ( State(..)
+ , getParserState, setParserState )
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+import Data.Char (isAlpha, isAlphaNum)
+import Control.Monad (guard)
+import Data.Functor (($>))
+#if MIN_VERSION_base(4,9,0)
+import Text.Read.Lex (isSymbolChar)
+#else
+import Data.Char (GeneralCategory (..),
+ generalCategory)
+#endif
+
+import Data.Maybe
+
+-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks.
+data Identifier = Identifier !Namespace !Char String !Char
+ deriving (Show, Eq)
+
+parseValid :: Parser Identifier
+parseValid = do
+ s@State{ stateInput = inp, statePos = pos } <- getParserState
+
+ case takeIdentifier inp of
+ Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier"
+ Just (ns, op, ident, cl, inp') ->
+ let posOp = updatePosChar pos op
+ posIdent = T.foldl updatePosChar posOp ident
+ posCl = updatePosChar posIdent cl
+ s' = s{ stateInput = inp', statePos = posCl }
+ in setParserState s' $> Identifier ns op (T.unpack ident) cl
+
+
+#if !MIN_VERSION_base(4,9,0)
+-- inlined from base-4.10.0.0
+isSymbolChar :: Char -> Bool
+isSymbolChar c = not (isPuncChar c) && case generalCategory c of
+ MathSymbol -> True
+ CurrencySymbol -> True
+ ModifierSymbol -> True
+ OtherSymbol -> True
+ DashPunctuation -> True
+ OtherPunctuation -> c `notElem` "'\""
+ ConnectorPunctuation -> c /= '_'
+ _ -> False
+ where
+ -- | The @special@ character class as defined in the Haskell Report.
+ isPuncChar :: Char -> Bool
+ isPuncChar = (`elem` (",;()[]{}`" :: String))
+#endif
+
+-- | Try to parse a delimited identifier off the front of the given input.
+--
+-- This tries to match as many valid Haskell identifiers/operators as possible,
+-- to the point of sometimes accepting invalid things (ex: keywords). Some
+-- considerations:
+--
+-- - operators and identifiers can have module qualifications
+-- - operators can be wrapped in parens (for prefix)
+-- - identifiers can be wrapped in backticks (for infix)
+-- - delimiters are backticks or regular ticks
+-- - since regular ticks are also valid in identifiers, we opt for the
+-- longest successful parse
+--
+-- This function should make /O(1)/ allocations
+takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text)
+takeIdentifier input = listToMaybe $ do
+
+ -- Optional namespace
+ let (ns, input') = case T.uncons input of
+ Just ('v', i) -> (Value, i)
+ Just ('t', i) -> (Type, i)
+ _ -> (None, input)
+
+ -- Opening tick
+ (op, input'') <- maybeToList (T.uncons input')
+ guard (op == '\'' || op == '`')
+
+ -- Identifier/operator
+ (ident, input''') <- wrapped input''
+
+ -- Closing tick
+ (cl, input'''') <- maybeToList (T.uncons input''')
+ guard (cl == '\'' || cl == '`')
+
+ pure (ns, op, ident, cl, input'''')
+
+ where
+
+ -- | Parse out a wrapped, possibly qualified, operator or identifier
+ wrapped t = do
+ (c, t' ) <- maybeToList (T.uncons t)
+ -- Tuples
+ case c of
+ '(' | Just (c', _) <- T.uncons t'
+ , c' == ',' || c' == ')'
+ -> do let (commas, t'') = T.span (== ',') t'
+ (')', t''') <- maybeToList (T.uncons t'')
+ pure (T.take (T.length commas + 2) t, t''')
+
+ -- Parenthesized
+ '(' -> do (n, t'' ) <- general False 0 [] t'
+ (')', t''') <- maybeToList (T.uncons t'')
+ pure (T.take (n + 2) t, t''')
+
+ -- Backticked
+ '`' -> do (n, t'' ) <- general False 0 [] t'
+ ('`', t''') <- maybeToList (T.uncons t'')
+ pure (T.take (n + 2) t, t''')
+
+ -- Unadorned
+ _ -> do (n, t'' ) <- general False 0 [] t
+ pure (T.take n t, t'')
+
+ -- | Parse out a possibly qualified operator or identifier
+ general :: Bool -- ^ refuse inputs starting with operators
+ -> Int -- ^ total characters \"consumed\" so far
+ -> [(Int, Text)] -- ^ accumulated results
+ -> Text -- ^ current input
+ -> [(Int, Text)] -- ^ total characters parsed & what remains
+ general !identOnly !i acc t
+ -- Starts with an identifier (either just an identifier, or a module qual)
+ | Just (n, rest) <- identLike t
+ = if T.null rest
+ then acc
+ else case T.head rest of
+ '`' -> (n + i, rest) : acc
+ ')' -> (n + i, rest) : acc
+ '.' -> general False (n + i + 1) acc (T.tail rest)
+ '\'' -> let (m, rest') = quotes rest
+ in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (T.tail rest')
+ _ -> acc
+
+ -- An operator
+ | Just (n, rest) <- optr t
+ , not identOnly
+ = (n + i, rest) : acc
+
+ -- Anything else
+ | otherwise
+ = acc
+
+ -- | Parse an identifier off the front of the input
+ identLike t
+ | T.null t = Nothing
+ | isAlpha (T.head t) || '_' == T.head t
+ = let !(idt, rest) = T.span (\c -> isAlphaNum c || c == '_') t
+ !(octos, rest') = T.span (== '#') rest
+ in Just (T.length idt + T.length octos, rest')
+ | otherwise = Nothing
+
+ -- | Parse all but the last quote off the front of the input
+ -- PRECONDITION: T.head t == '\''
+ quotes :: Text -> (Int, Text)
+ quotes t = let !n = T.length (T.takeWhile (== '\'') t) - 1
+ in (n, T.drop n t)
+
+ -- | Parse an operator off the front of the input
+ optr t = let !(op, rest) = T.span isSymbolChar t
+ in if T.null op then Nothing else Just (T.length op, rest)
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
index a5664aa8..fa46f536 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
@@ -4,15 +4,32 @@
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TypeSynonymInstances #-}
+-- |
+-- Module : Documentation.Haddock.Parser.Monad
+-- Copyright : (c) Alec Theriault 2018-2019,
+-- License : BSD-like
+--
+-- Maintainer : haddock@projects.haskell.org
+-- Stability : experimental
+-- Portability : portable
+--
+-- Defines the Parsec monad over which all parsing is done and also provides
+-- more efficient versions of the usual parsec combinator functions (but
+-- specialized to 'Text').
module Documentation.Haddock.Parser.Monad where
import qualified Text.Parsec.Char as Parsec
import qualified Text.Parsec as Parsec
+import Text.Parsec.Pos ( updatePosChar )
+import Text.Parsec ( State(..)
+ , getParserState, setParserState )
import qualified Data.Text as T
import Data.Text ( Text )
+import Control.Monad ( mfilter )
+import Data.Functor ( ($>) )
import Data.String ( IsString(..) )
import Data.Bits ( Bits(..) )
import Data.Char ( ord )
@@ -20,7 +37,11 @@ import Data.List ( foldl' )
import Control.Applicative as App
import Documentation.Haddock.Types ( Version )
+import Prelude hiding (takeWhile)
+-- | The only bit of information we really care about truding along with us
+-- through parsing is the version attached to a @\@since@ annotation - if
+-- the doc even contained one.
newtype ParserState = ParserState {
parserStateSince :: Maybe Version
} deriving (Eq, Show)
@@ -29,7 +50,7 @@ initialParserState :: ParserState
initialParserState = ParserState Nothing
setSince :: Version -> Parser ()
-setSince since = Parsec.modifyState (\st -> st {parserStateSince = Just since})
+setSince since = Parsec.modifyState (\st -> st{ parserStateSince = Just since })
type Parser = Parsec.Parsec Text ParserState
@@ -44,38 +65,74 @@ parseOnly p t = case Parsec.runParser p' initialParserState "<haddock>" t of
-- | Always succeeds, but returns 'Nothing' if at the end of input. Does not
-- consume input.
+--
+-- Equivalent to @Parsec.optionMaybe . Parsec.lookAhead $ Parsec.anyChar@, but
+-- more efficient.
peekChar :: Parser (Maybe Char)
-peekChar = Parsec.optionMaybe . Parsec.try . Parsec.lookAhead $ Parsec.anyChar
+peekChar = headOpt . stateInput <$> getParserState
+ where headOpt t | T.null t = Nothing
+ | otherwise = Just (T.head t)
+{-# INLINE peekChar #-}
-- | Fails if at the end of input. Does not consume input.
+--
+-- Equivalent to @Parsec.lookAhead Parsec.anyChar@, but more efficient.
peekChar' :: Parser Char
-peekChar' = Parsec.lookAhead Parsec.anyChar
+peekChar' = headFail . stateInput =<< getParserState
+ where headFail t | T.null t = Parsec.parserFail "peekChar': reached EOF"
+ | otherwise = App.pure (T.head t)
+{-# INLINE peekChar' #-}
-- | Parses the given string. Returns the parsed string.
+--
+-- Equivalent to @Parsec.string (T.unpack t) $> t@, but more efficient.
string :: Text -> Parser Text
-string t = Parsec.string (T.unpack t) *> App.pure t
+string t = do
+ s@State{ stateInput = inp, statePos = pos } <- getParserState
+ case T.stripPrefix t inp of
+ Nothing -> Parsec.parserFail "string: Failed to match the input string"
+ Just inp' ->
+ let pos' = T.foldl updatePosChar pos t
+ s' = s{ stateInput = inp', statePos = pos' }
+ in setParserState s' $> t
+
+-- | Keep matching characters as long as the predicate function holds (and
+-- return them).
+--
+-- Equivalent to @fmap T.pack . Parsec.many@, but more efficient.
+takeWhile :: (Char -> Bool) -> Parser Text
+takeWhile f = do
+ s@State{ stateInput = inp, statePos = pos } <- getParserState
+ let (t, inp') = T.span f inp
+ pos' = T.foldl updatePosChar pos t
+ s' = s{ stateInput = inp', statePos = pos' }
+ setParserState s' $> t
+
+-- | Like 'takeWhile', but fails if no characters matched.
+--
+-- Equivalent to @fmap T.pack . Parsec.many1@, but more efficient.
+takeWhile1 :: (Char -> Bool) -> Parser Text
+takeWhile1 = mfilter (not . T.null) . takeWhile
-- | Scan the input text, accumulating characters as long as the scanning
-- function returns true.
scan :: (s -> Char -> Maybe s) -- ^ scan function
-> s -- ^ initial state
-> Parser Text
-scan f = fmap T.pack . go
- where go s1 = do { cOpt <- peekChar
- ; case cOpt >>= f s1 of
- Nothing -> pure ""
- Just s2 -> (:) <$> Parsec.anyChar <*> go s2
- }
-
--- | Apply a parser for a character zero or more times and collect the result in
--- a string.
-takeWhile :: Parser Char -> Parser Text
-takeWhile = fmap T.pack . Parsec.many
-
--- | Apply a parser for a character one or more times and collect the result in
--- a string.
-takeWhile1 :: Parser Char -> Parser Text
-takeWhile1 = fmap T.pack . Parsec.many1
+scan f st = do
+ s@State{ stateInput = inp, statePos = pos } <- getParserState
+ go inp st pos 0 $ \inp' pos' n ->
+ let s' = s{ Parsec.stateInput = inp', Parsec.statePos = pos' }
+ in setParserState s' $> T.take n inp
+ where
+ go inp s !pos !n cont
+ = case T.uncons inp of
+ Nothing -> cont inp pos n -- ran out of input
+ Just (c, inp') ->
+ case f s c of
+ Nothing -> cont inp pos n -- scan function failed
+ Just s' -> go inp' s' (updatePosChar pos c) (n+1) cont
+
-- | Parse a decimal number.
decimal :: Integral a => Parser a
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Util.hs b/haddock-library/src/Documentation/Haddock/Parser/Util.hs
index ffa91b09..98570c22 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Util.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Util.hs
@@ -40,7 +40,7 @@ skipHorizontalSpace = Parsec.skipMany (Parsec.oneOf horizontalSpace)
-- | Take leading horizontal space
takeHorizontalSpace :: Parser Text
-takeHorizontalSpace = takeWhile (Parsec.oneOf horizontalSpace)
+takeHorizontalSpace = takeWhile (`elem` horizontalSpace)
makeLabeled :: (String -> Maybe String -> a) -> Text -> a
makeLabeled f input = case T.break isSpace $ removeEscapes $ T.strip input of