aboutsummaryrefslogtreecommitdiff
path: root/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs
blob: a83e5abf9e9c0808f42639b1066b660d5c5b762e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
{-# LANGUAGE CPP          #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ViewPatterns #-}
-- |
-- Module      :  Documentation.Haddock.Parser.Identifier
-- Copyright   :  (c) Alec Theriault 2019,
-- License     :  BSD-like
--
-- Maintainer  :  haddock@projects.haskell.org
-- Stability   :  experimental
-- Portability :  portable
--
-- Functionality for parsing identifiers and operators

module Documentation.Haddock.Parser.Identifier (
  Identifier(..),
  parseValid,
) where

import Documentation.Haddock.Types           ( Namespace(..) )
import Documentation.Haddock.Parser.Monad
import qualified Text.Parsec as Parsec
import           Text.Parsec.Pos             ( updatePosChar )
import           Text.Parsec                 ( State(..)
                                             , getParserState, setParserState )

import Data.Text (Text)
import qualified Data.Text as T

import           Data.Char (isAlpha, isAlphaNum)
import Control.Monad (guard)
import Data.Functor (($>))
#if MIN_VERSION_base(4,9,0)
import           Text.Read.Lex                      (isSymbolChar)
#else
import           Data.Char                          (GeneralCategory (..),
                                                     generalCategory)
#endif

import Data.Maybe

-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks.
data Identifier = Identifier !Namespace !Char String !Char
  deriving (Show, Eq)

parseValid :: Parser Identifier
parseValid = do
  s@State{ stateInput = inp, statePos = pos } <- getParserState

  case takeIdentifier inp of
    Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier"
    Just (ns, op, ident, cl, inp') ->
      let posOp = updatePosChar pos op
          posIdent = T.foldl updatePosChar posOp ident
          posCl = updatePosChar posIdent cl
          s' = s{ stateInput = inp', statePos = posCl }
      in setParserState s' $> Identifier ns op (T.unpack ident) cl


#if !MIN_VERSION_base(4,9,0)
-- inlined from base-4.10.0.0
isSymbolChar :: Char -> Bool
isSymbolChar c = not (isPuncChar c) && case generalCategory c of
    MathSymbol           -> True
    CurrencySymbol       -> True
    ModifierSymbol       -> True
    OtherSymbol          -> True
    DashPunctuation      -> True
    OtherPunctuation     -> c `notElem` "'\""
    ConnectorPunctuation -> c /= '_'
    _                    -> False
  where
    -- | The @special@ character class as defined in the Haskell Report.
    isPuncChar :: Char -> Bool
    isPuncChar = (`elem` (",;()[]{}`" :: String))
#endif

-- | Try to parse a delimited identifier off the front of the given input.
--
-- This tries to match as many valid Haskell identifiers/operators as possible,
-- to the point of sometimes accepting invalid things (ex: keywords). Some
-- considerations:
--
--   - operators and identifiers can have module qualifications
--   - operators can be wrapped in parens (for prefix)
--   - identifiers can be wrapped in backticks (for infix)
--   - delimiters are backticks or regular ticks
--   - since regular ticks are also valid in identifiers, we opt for the
--     longest successful parse
--
-- This function should make /O(1)/ allocations
takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text)
takeIdentifier input = listToMaybe $ do

    -- Optional namespace
    let (ns, input') = case T.uncons input of
                         Just ('v', i) -> (Value, i)
                         Just ('t', i) -> (Type, i)
                         _             -> (None, input)

    -- Opening tick
    (op, input'') <- maybeToList (T.uncons input')
    guard (op == '\'' || op == '`')

    -- Identifier/operator
    (ident, input''') <- wrapped input''

    -- Closing tick
    (cl, input'''') <- maybeToList (T.uncons input''')
    guard (cl == '\'' || cl == '`')

    return (ns, op, ident, cl, input'''')

  where

    -- | Parse out a wrapped, possibly qualified, operator or identifier
    wrapped t = do
      (c, t'  ) <- maybeToList (T.uncons t)
      -- Tuples
      case c of
        '(' | Just (c', _) <- T.uncons t'
            , c' == ',' || c' == ')'
            -> do let (commas, t'') = T.span (== ',') t'
                  (')', t''') <- maybeToList (T.uncons t'')
                  return (T.take (T.length commas + 2) t, t''')

        -- Parenthesized
        '(' -> do (n,   t'' ) <- general False 0 [] t'
                  (')', t''') <- maybeToList (T.uncons t'')
                  return (T.take (n + 2) t, t''')

        -- Backticked
        '`' -> do (n,   t'' ) <- general False 0 [] t'
                  ('`', t''') <- maybeToList (T.uncons t'')
                  return (T.take (n + 2) t, t''')

        -- Unadorned
        _   -> do (n,   t'' ) <- general False 0 [] t
                  return (T.take n t, t'')

    -- | Parse out a possibly qualified operator or identifier
    general :: Bool           -- ^ refuse inputs starting with operators
            -> Int            -- ^ total characters \"consumed\" so far
            -> [(Int, Text)]  -- ^ accumulated results
            -> Text           -- ^ current input
            -> [(Int, Text)]  -- ^ total characters parsed & what remains
    general !identOnly !i acc t
      -- Starts with an identifier (either just an identifier, or a module qual)
      | Just (n, rest) <- identLike t
      = if T.null rest
          then acc
          else case T.head rest of
                 '`' -> (n + i, rest) : acc
                 ')' -> (n + i, rest) : acc
                 '.' -> general False (n + i + 1) acc (T.tail rest)
                 '\'' -> let (m, rest') = quotes rest
                         in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (T.tail rest')
                 _ -> acc

      -- An operator
      | Just (n, rest) <- optr t
      , not identOnly
      = (n + i, rest) : acc

      -- Anything else
      | otherwise
      = acc

    -- | Parse an identifier off the front of the input
    identLike t
      | T.null t = Nothing
      | isAlpha (T.head t) || '_' == T.head t
      = let !(idt, rest) = T.span (\c -> isAlphaNum c || c == '_') t
            !(octos, rest') = T.span (== '#') rest
      in Just (T.length idt + T.length octos, rest')
      | otherwise = Nothing

    -- | Parse all but the last quote off the front of the input
    -- PRECONDITION: T.head t == '\''
    quotes :: Text -> (Int, Text)
    quotes t = let !n = T.length (T.takeWhile (== '\'') t) - 1
               in (n, T.drop n t)

    -- | Parse an operator off the front of the input
    optr t = let !(op, rest) = T.span isSymbolChar t
             in if T.null op then Nothing else Just (T.length op, rest)