diff options
Diffstat (limited to 'haddock-library/src/Documentation/Haddock/Parser')
3 files changed, 264 insertions, 21 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs b/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs new file mode 100644 index 00000000..7bc98b62 --- /dev/null +++ b/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs @@ -0,0 +1,186 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ViewPatterns #-} +-- | +-- Module : Documentation.Haddock.Parser.Identifier +-- Copyright : (c) Alec Theriault 2019, +-- License : BSD-like +-- +-- Maintainer : haddock@projects.haskell.org +-- Stability : experimental +-- Portability : portable +-- +-- Functionality for parsing identifiers and operators + +module Documentation.Haddock.Parser.Identifier ( + Identifier(..), + parseValid, +) where + +import Documentation.Haddock.Types ( Namespace(..) ) +import Documentation.Haddock.Parser.Monad +import qualified Text.Parsec as Parsec +import Text.Parsec.Pos ( updatePosChar ) +import Text.Parsec ( State(..) + , getParserState, setParserState ) + +import Data.Text (Text) +import qualified Data.Text as T + +import Data.Char (isAlpha, isAlphaNum) +import Control.Monad (guard) +import Data.Functor (($>)) +#if MIN_VERSION_base(4,9,0) +import Text.Read.Lex (isSymbolChar) +#else +import Data.Char (GeneralCategory (..), + generalCategory) +#endif + +import Data.Maybe + +-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks. +data Identifier = Identifier !Namespace !Char String !Char + deriving (Show, Eq) + +parseValid :: Parser Identifier +parseValid = do + s@State{ stateInput = inp, statePos = pos } <- getParserState + + case takeIdentifier inp of + Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier" + Just (ns, op, ident, cl, inp') -> + let posOp = updatePosChar pos op + posIdent = T.foldl updatePosChar posOp ident + posCl = updatePosChar posIdent cl + s' = s{ stateInput = inp', statePos = posCl } + in setParserState s' $> Identifier ns op (T.unpack ident) cl + + +#if !MIN_VERSION_base(4,9,0) +-- inlined from base-4.10.0.0 +isSymbolChar :: Char -> Bool +isSymbolChar c = not (isPuncChar c) && case generalCategory c of + MathSymbol -> True + CurrencySymbol -> True + ModifierSymbol -> True + OtherSymbol -> True + DashPunctuation -> True + OtherPunctuation -> c `notElem` "'\"" + ConnectorPunctuation -> c /= '_' + _ -> False + where + -- | The @special@ character class as defined in the Haskell Report. + isPuncChar :: Char -> Bool + isPuncChar = (`elem` (",;()[]{}`" :: String)) +#endif + +-- | Try to parse a delimited identifier off the front of the given input. +-- +-- This tries to match as many valid Haskell identifiers/operators as possible, +-- to the point of sometimes accepting invalid things (ex: keywords). Some +-- considerations: +-- +-- - operators and identifiers can have module qualifications +-- - operators can be wrapped in parens (for prefix) +-- - identifiers can be wrapped in backticks (for infix) +-- - delimiters are backticks or regular ticks +-- - since regular ticks are also valid in identifiers, we opt for the +-- longest successful parse +-- +-- This function should make /O(1)/ allocations +takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text) +takeIdentifier input = listToMaybe $ do + + -- Optional namespace + let (ns, input') = case T.uncons input of + Just ('v', i) -> (Value, i) + Just ('t', i) -> (Type, i) + _ -> (None, input) + + -- Opening tick + (op, input'') <- maybeToList (T.uncons input') + guard (op == '\'' || op == '`') + + -- Identifier/operator + (ident, input''') <- wrapped input'' + + -- Closing tick + (cl, input'''') <- maybeToList (T.uncons input''') + guard (cl == '\'' || cl == '`') + + pure (ns, op, ident, cl, input'''') + + where + + -- | Parse out a wrapped, possibly qualified, operator or identifier + wrapped t = do + (c, t' ) <- maybeToList (T.uncons t) + -- Tuples + case c of + '(' | Just (c', _) <- T.uncons t' + , c' == ',' || c' == ')' + -> do let (commas, t'') = T.span (== ',') t' + (')', t''') <- maybeToList (T.uncons t'') + pure (T.take (T.length commas + 2) t, t''') + + -- Parenthesized + '(' -> do (n, t'' ) <- general False 0 [] t' + (')', t''') <- maybeToList (T.uncons t'') + pure (T.take (n + 2) t, t''') + + -- Backticked + '`' -> do (n, t'' ) <- general False 0 [] t' + ('`', t''') <- maybeToList (T.uncons t'') + pure (T.take (n + 2) t, t''') + + -- Unadorned + _ -> do (n, t'' ) <- general False 0 [] t + pure (T.take n t, t'') + + -- | Parse out a possibly qualified operator or identifier + general :: Bool -- ^ refuse inputs starting with operators + -> Int -- ^ total characters \"consumed\" so far + -> [(Int, Text)] -- ^ accumulated results + -> Text -- ^ current input + -> [(Int, Text)] -- ^ total characters parsed & what remains + general !identOnly !i acc t + -- Starts with an identifier (either just an identifier, or a module qual) + | Just (n, rest) <- identLike t + = if T.null rest + then acc + else case T.head rest of + '`' -> (n + i, rest) : acc + ')' -> (n + i, rest) : acc + '.' -> general False (n + i + 1) acc (T.tail rest) + '\'' -> let (m, rest') = quotes rest + in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (T.tail rest') + _ -> acc + + -- An operator + | Just (n, rest) <- optr t + , not identOnly + = (n + i, rest) : acc + + -- Anything else + | otherwise + = acc + + -- | Parse an identifier off the front of the input + identLike t + | T.null t = Nothing + | isAlpha (T.head t) || '_' == T.head t + = let !(idt, rest) = T.span (\c -> isAlphaNum c || c == '_') t + !(octos, rest') = T.span (== '#') rest + in Just (T.length idt + T.length octos, rest') + | otherwise = Nothing + + -- | Parse all but the last quote off the front of the input + -- PRECONDITION: T.head t == '\'' + quotes :: Text -> (Int, Text) + quotes t = let !n = T.length (T.takeWhile (== '\'') t) - 1 + in (n, T.drop n t) + + -- | Parse an operator off the front of the input + optr t = let !(op, rest) = T.span isSymbolChar t + in if T.null op then Nothing else Just (T.length op, rest) diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index a5664aa8..fa46f536 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -4,15 +4,32 @@ {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE TypeSynonymInstances #-} +-- | +-- Module : Documentation.Haddock.Parser.Monad +-- Copyright : (c) Alec Theriault 2018-2019, +-- License : BSD-like +-- +-- Maintainer : haddock@projects.haskell.org +-- Stability : experimental +-- Portability : portable +-- +-- Defines the Parsec monad over which all parsing is done and also provides +-- more efficient versions of the usual parsec combinator functions (but +-- specialized to 'Text'). module Documentation.Haddock.Parser.Monad where import qualified Text.Parsec.Char as Parsec import qualified Text.Parsec as Parsec +import Text.Parsec.Pos ( updatePosChar ) +import Text.Parsec ( State(..) + , getParserState, setParserState ) import qualified Data.Text as T import Data.Text ( Text ) +import Control.Monad ( mfilter ) +import Data.Functor ( ($>) ) import Data.String ( IsString(..) ) import Data.Bits ( Bits(..) ) import Data.Char ( ord ) @@ -20,7 +37,11 @@ import Data.List ( foldl' ) import Control.Applicative as App import Documentation.Haddock.Types ( Version ) +import Prelude hiding (takeWhile) +-- | The only bit of information we really care about truding along with us +-- through parsing is the version attached to a @\@since@ annotation - if +-- the doc even contained one. newtype ParserState = ParserState { parserStateSince :: Maybe Version } deriving (Eq, Show) @@ -29,7 +50,7 @@ initialParserState :: ParserState initialParserState = ParserState Nothing setSince :: Version -> Parser () -setSince since = Parsec.modifyState (\st -> st {parserStateSince = Just since}) +setSince since = Parsec.modifyState (\st -> st{ parserStateSince = Just since }) type Parser = Parsec.Parsec Text ParserState @@ -44,38 +65,74 @@ parseOnly p t = case Parsec.runParser p' initialParserState "<haddock>" t of -- | Always succeeds, but returns 'Nothing' if at the end of input. Does not -- consume input. +-- +-- Equivalent to @Parsec.optionMaybe . Parsec.lookAhead $ Parsec.anyChar@, but +-- more efficient. peekChar :: Parser (Maybe Char) -peekChar = Parsec.optionMaybe . Parsec.try . Parsec.lookAhead $ Parsec.anyChar +peekChar = headOpt . stateInput <$> getParserState + where headOpt t | T.null t = Nothing + | otherwise = Just (T.head t) +{-# INLINE peekChar #-} -- | Fails if at the end of input. Does not consume input. +-- +-- Equivalent to @Parsec.lookAhead Parsec.anyChar@, but more efficient. peekChar' :: Parser Char -peekChar' = Parsec.lookAhead Parsec.anyChar +peekChar' = headFail . stateInput =<< getParserState + where headFail t | T.null t = Parsec.parserFail "peekChar': reached EOF" + | otherwise = App.pure (T.head t) +{-# INLINE peekChar' #-} -- | Parses the given string. Returns the parsed string. +-- +-- Equivalent to @Parsec.string (T.unpack t) $> t@, but more efficient. string :: Text -> Parser Text -string t = Parsec.string (T.unpack t) *> App.pure t +string t = do + s@State{ stateInput = inp, statePos = pos } <- getParserState + case T.stripPrefix t inp of + Nothing -> Parsec.parserFail "string: Failed to match the input string" + Just inp' -> + let pos' = T.foldl updatePosChar pos t + s' = s{ stateInput = inp', statePos = pos' } + in setParserState s' $> t + +-- | Keep matching characters as long as the predicate function holds (and +-- return them). +-- +-- Equivalent to @fmap T.pack . Parsec.many@, but more efficient. +takeWhile :: (Char -> Bool) -> Parser Text +takeWhile f = do + s@State{ stateInput = inp, statePos = pos } <- getParserState + let (t, inp') = T.span f inp + pos' = T.foldl updatePosChar pos t + s' = s{ stateInput = inp', statePos = pos' } + setParserState s' $> t + +-- | Like 'takeWhile', but fails if no characters matched. +-- +-- Equivalent to @fmap T.pack . Parsec.many1@, but more efficient. +takeWhile1 :: (Char -> Bool) -> Parser Text +takeWhile1 = mfilter (not . T.null) . takeWhile -- | Scan the input text, accumulating characters as long as the scanning -- function returns true. scan :: (s -> Char -> Maybe s) -- ^ scan function -> s -- ^ initial state -> Parser Text -scan f = fmap T.pack . go - where go s1 = do { cOpt <- peekChar - ; case cOpt >>= f s1 of - Nothing -> pure "" - Just s2 -> (:) <$> Parsec.anyChar <*> go s2 - } - --- | Apply a parser for a character zero or more times and collect the result in --- a string. -takeWhile :: Parser Char -> Parser Text -takeWhile = fmap T.pack . Parsec.many - --- | Apply a parser for a character one or more times and collect the result in --- a string. -takeWhile1 :: Parser Char -> Parser Text -takeWhile1 = fmap T.pack . Parsec.many1 +scan f st = do + s@State{ stateInput = inp, statePos = pos } <- getParserState + go inp st pos 0 $ \inp' pos' n -> + let s' = s{ Parsec.stateInput = inp', Parsec.statePos = pos' } + in setParserState s' $> T.take n inp + where + go inp s !pos !n cont + = case T.uncons inp of + Nothing -> cont inp pos n -- ran out of input + Just (c, inp') -> + case f s c of + Nothing -> cont inp pos n -- scan function failed + Just s' -> go inp' s' (updatePosChar pos c) (n+1) cont + -- | Parse a decimal number. decimal :: Integral a => Parser a diff --git a/haddock-library/src/Documentation/Haddock/Parser/Util.hs b/haddock-library/src/Documentation/Haddock/Parser/Util.hs index ffa91b09..98570c22 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Util.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Util.hs @@ -40,7 +40,7 @@ skipHorizontalSpace = Parsec.skipMany (Parsec.oneOf horizontalSpace) -- | Take leading horizontal space takeHorizontalSpace :: Parser Text -takeHorizontalSpace = takeWhile (Parsec.oneOf horizontalSpace) +takeHorizontalSpace = takeWhile (`elem` horizontalSpace) makeLabeled :: (String -> Maybe String -> a) -> Text -> a makeLabeled f input = case T.break isSpace $ removeEscapes $ T.strip input of |