{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE ViewPatterns #-} -- | -- Module : Documentation.Haddock.Parser.Identifier -- Copyright : (c) Alec Theriault 2019, -- License : BSD-like -- -- Maintainer : haddock@projects.haskell.org -- Stability : experimental -- Portability : portable -- -- Functionality for parsing identifiers and operators module Documentation.Haddock.Parser.Identifier ( Identifier(..), parseValid, ) where import Documentation.Haddock.Types ( Namespace(..) ) import Documentation.Haddock.Parser.Monad import qualified Text.Parsec as Parsec import Text.Parsec.Pos ( updatePosChar ) import Text.Parsec ( State(..) , getParserState, setParserState ) import Data.Text (Text) import qualified Data.Text as T import Data.Char (isAlpha, isAlphaNum) import Control.Monad (guard) import Data.Functor (($>)) #if MIN_VERSION_base(4,9,0) import Text.Read.Lex (isSymbolChar) #else import Data.Char (GeneralCategory (..), generalCategory) #endif import Data.Maybe -- | Identifier string surrounded with namespace, opening, and closing quotes/backticks. data Identifier = Identifier !Namespace !Char String !Char deriving (Show, Eq) parseValid :: Parser Identifier parseValid = do s@State{ stateInput = inp, statePos = pos } <- getParserState case takeIdentifier inp of Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier" Just (ns, op, ident, cl, inp') -> let posOp = updatePosChar pos op posIdent = T.foldl updatePosChar posOp ident posCl = updatePosChar posIdent cl s' = s{ stateInput = inp', statePos = posCl } in setParserState s' $> Identifier ns op (T.unpack ident) cl #if !MIN_VERSION_base(4,9,0) -- inlined from base-4.10.0.0 isSymbolChar :: Char -> Bool isSymbolChar c = not (isPuncChar c) && case generalCategory c of MathSymbol -> True CurrencySymbol -> True ModifierSymbol -> True OtherSymbol -> True DashPunctuation -> True OtherPunctuation -> c `notElem` "'\"" ConnectorPunctuation -> c /= '_' _ -> False where -- | The @special@ character class as defined in the Haskell Report. isPuncChar :: Char -> Bool isPuncChar = (`elem` (",;()[]{}`" :: String)) #endif -- | Try to parse a delimited identifier off the front of the given input. -- -- This tries to match as many valid Haskell identifiers/operators as possible, -- to the point of sometimes accepting invalid things (ex: keywords). Some -- considerations: -- -- - operators and identifiers can have module qualifications -- - operators can be wrapped in parens (for prefix) -- - identifiers can be wrapped in backticks (for infix) -- - delimiters are backticks or regular ticks -- - since regular ticks are also valid in identifiers, we opt for the -- longest successful parse -- -- This function should make /O(1)/ allocations takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text) takeIdentifier input = listToMaybe $ do -- Optional namespace let (ns, input') = case T.uncons input of Just ('v', i) -> (Value, i) Just ('t', i) -> (Type, i) _ -> (None, input) -- Opening tick (op, input'') <- maybeToList (T.uncons input') guard (op == '\'' || op == '`') -- Identifier/operator (ident, input''') <- wrapped input'' -- Closing tick (cl, input'''') <- maybeToList (T.uncons input''') guard (cl == '\'' || cl == '`') return (ns, op, ident, cl, input'''') where -- | Parse out a wrapped, possibly qualified, operator or identifier wrapped t = do (c, t' ) <- maybeToList (T.uncons t) -- Tuples case c of '(' | Just (c', _) <- T.uncons t' , c' == ',' || c' == ')' -> do let (commas, t'') = T.span (== ',') t' (')', t''') <- maybeToList (T.uncons t'') return (T.take (T.length commas + 2) t, t''') -- Parenthesized '(' -> do (n, t'' ) <- general False 0 [] t' (')', t''') <- maybeToList (T.uncons t'') return (T.take (n + 2) t, t''') -- Backticked '`' -> do (n, t'' ) <- general False 0 [] t' ('`', t''') <- maybeToList (T.uncons t'') return (T.take (n + 2) t, t''') -- Unadorned _ -> do (n, t'' ) <- general False 0 [] t return (T.take n t, t'') -- | Parse out a possibly qualified operator or identifier general :: Bool -- ^ refuse inputs starting with operators -> Int -- ^ total characters \"consumed\" so far -> [(Int, Text)] -- ^ accumulated results -> Text -- ^ current input -> [(Int, Text)] -- ^ total characters parsed & what remains general !identOnly !i acc t -- Starts with an identifier (either just an identifier, or a module qual) | Just (n, rest) <- identLike t = if T.null rest then acc else case T.head rest of '`' -> (n + i, rest) : acc ')' -> (n + i, rest) : acc '.' -> general False (n + i + 1) acc (T.tail rest) '\'' -> let (m, rest') = quotes rest in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (T.tail rest') _ -> acc -- An operator | Just (n, rest) <- optr t , not identOnly = (n + i, rest) : acc -- Anything else | otherwise = acc -- | Parse an identifier off the front of the input identLike t | T.null t = Nothing | isAlpha (T.head t) || '_' == T.head t = let !(idt, rest) = T.span (\c -> isAlphaNum c || c == '_') t !(octos, rest') = T.span (== '#') rest in Just (T.length idt + T.length octos, rest') | otherwise = Nothing -- | Parse all but the last quote off the front of the input -- PRECONDITION: T.head t == '\'' quotes :: Text -> (Int, Text) quotes t = let !n = T.length (T.takeWhile (== '\'') t) - 1 in (n, T.drop n t) -- | Parse an operator off the front of the input optr t = let !(op, rest) = T.span isSymbolChar t in if T.null op then Nothing else Just (T.length op, rest)