1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
{-# LANGUAGE BangPatterns #-}
-- |
-- Module : Documentation.Haddock.Parser.Identifier
-- Copyright : (c) Alec Theriault 2019,
-- License : BSD-like
--
-- Maintainer : haddock@projects.haskell.org
-- Stability : experimental
-- Portability : portable
--
-- Functionality for parsing identifiers and operators
module Documentation.Haddock.Parser.Identifier (
Identifier(..),
parseValid,
) where
import Documentation.Haddock.Types ( Namespace(..) )
import Documentation.Haddock.Parser.Monad
import qualified Text.Parsec as Parsec
import Text.Parsec.Pos ( updatePosChar )
import Text.Parsec ( State(..)
, getParserState, setParserState )
import Data.Text (Text)
import qualified Data.Text as T
import Data.Char (isAlpha, isAlphaNum)
import Control.Monad (guard)
import Data.Maybe
import CompatPrelude
-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks.
data Identifier = Identifier !Namespace !Char String !Char
deriving (Show, Eq)
parseValid :: Parser Identifier
parseValid = do
s@State{ stateInput = inp, statePos = pos } <- getParserState
case takeIdentifier inp of
Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier"
Just (ns, op, ident, cl, inp') ->
let posOp = updatePosChar pos op
posIdent = T.foldl updatePosChar posOp ident
posCl = updatePosChar posIdent cl
s' = s{ stateInput = inp', statePos = posCl }
in setParserState s' $> Identifier ns op (T.unpack ident) cl
-- | Try to parse a delimited identifier off the front of the given input.
--
-- This tries to match as many valid Haskell identifiers/operators as possible,
-- to the point of sometimes accepting invalid things (ex: keywords). Some
-- considerations:
--
-- - operators and identifiers can have module qualifications
-- - operators can be wrapped in parens (for prefix)
-- - identifiers can be wrapped in backticks (for infix)
-- - delimiters are backticks or regular ticks
-- - since regular ticks are also valid in identifiers, we opt for the
-- longest successful parse
--
-- This function should make /O(1)/ allocations
takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text)
takeIdentifier input = listToMaybe $ do
-- Optional namespace
let (ns, input') = case T.uncons input of
Just ('v', i) -> (Value, i)
Just ('t', i) -> (Type, i)
_ -> (None, input)
-- Opening tick
(op, input'') <- maybeToList (T.uncons input')
guard (op == '\'' || op == '`')
-- Identifier/operator
(ident, input''') <- wrapped input''
-- Closing tick
(cl, input'''') <- maybeToList (T.uncons input''')
guard (cl == '\'' || cl == '`')
return (ns, op, ident, cl, input'''')
where
-- | Parse out a wrapped, possibly qualified, operator or identifier
wrapped t = do
(c, t' ) <- maybeToList (T.uncons t)
-- Tuples
case c of
'(' | Just (c', _) <- T.uncons t'
, c' == ',' || c' == ')'
-> do let (commas, t'') = T.span (== ',') t'
(')', t''') <- maybeToList (T.uncons t'')
return (T.take (T.length commas + 2) t, t''')
-- Parenthesized
'(' -> do (n, t'' ) <- general False 0 [] t'
(')', t''') <- maybeToList (T.uncons t'')
return (T.take (n + 2) t, t''')
-- Backticked
'`' -> do (n, t'' ) <- general False 0 [] t'
('`', t''') <- maybeToList (T.uncons t'')
return (T.take (n + 2) t, t''')
-- Unadorned
_ -> do (n, t'' ) <- general False 0 [] t
return (T.take n t, t'')
-- | Parse out a possibly qualified operator or identifier
general :: Bool -- ^ refuse inputs starting with operators
-> Int -- ^ total characters \"consumed\" so far
-> [(Int, Text)] -- ^ accumulated results
-> Text -- ^ current input
-> [(Int, Text)] -- ^ total characters parsed & what remains
general !identOnly !i acc t
-- Starts with an identifier (either just an identifier, or a module qual)
| Just (n, rest) <- identLike t
= if T.null rest
then acc
else case T.head rest of
'`' -> (n + i, rest) : acc
')' -> (n + i, rest) : acc
'.' -> general False (n + i + 1) acc (T.tail rest)
'\'' -> let (m, rest') = quotes rest
in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (T.tail rest')
_ -> acc
-- An operator
| Just (n, rest) <- optr t
, not identOnly
= (n + i, rest) : acc
-- Anything else
| otherwise
= acc
-- | Parse an identifier off the front of the input
identLike t
| T.null t = Nothing
| isAlpha (T.head t) || '_' == T.head t
= let !(idt, rest) = T.span (\c -> isAlphaNum c || c == '_') t
!(octos, rest') = T.span (== '#') rest
in Just (T.length idt + T.length octos, rest')
| otherwise = Nothing
-- | Parse all but the last quote off the front of the input
-- PRECONDITION: T.head t `elem` ['\'', '`']
quotes :: Text -> (Int, Text)
quotes t = let !n = T.length (T.takeWhile (`elem` ['\'', '`']) t) - 1
in (n, T.drop n t)
-- | Parse an operator off the front of the input
optr t = let !(op, rest) = T.span isSymbolChar t
in if T.null op then Nothing else Just (T.length op, rest)
|