1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
|
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
module Haddock.Backends.Hyperlinker.Parser (parse) where
import Control.Monad.Trans.Maybe
import Control.Monad.Trans.Class
import Control.Applicative ( Alternative(..) )
import Data.List ( isPrefixOf, isSuffixOf )
import qualified Data.ByteString as BS
import GHC.Platform
import GHC.Types.SourceText
import GHC.Driver.Session
import GHC.Driver.Config.Diagnostic
import GHC.Utils.Error ( pprLocMsgEnvelope )
import GHC.Data.FastString ( mkFastString )
import GHC.Parser.Errors.Ppr ()
import qualified GHC.Types.Error as E
import GHC.Parser.Lexer as Lexer
( P(..), ParseResult(..), PState(..), Token(..)
, initParserState, lexer, mkParserOpts, getPsErrorMessages)
import GHC.Data.Bag ( bagToList )
import GHC.Utils.Outputable ( text, ($$) )
import GHC.Utils.Panic ( panic )
import GHC.Driver.Ppr ( showSDoc )
import GHC.Types.SrcLoc
import GHC.Data.StringBuffer ( StringBuffer, atEnd )
import Haddock.Backends.Hyperlinker.Types as T
import Haddock.GhcUtils
-- | Turn source code string into a stream of more descriptive tokens.
--
-- Result should retain original file layout (including comments,
-- whitespace, and CPP).
parse
:: DynFlags -- ^ Flags for this module
-> FilePath -- ^ Path to the source of this module
-> BS.ByteString -- ^ Raw UTF-8 encoded source of this module
-> [T.Token]
parse dflags fpath bs = case unP (go False []) initState of
POk _ toks -> reverse toks
PFailed pst ->
let err:_ = bagToList (E.getMessages $ getPsErrorMessages pst) in
panic $ showSDoc dflags $
text "Hyperlinker parse error:" $$ pprLocMsgEnvelope err
where
initState = initParserState pflags buf start
buf = stringBufferFromByteString bs
start = mkRealSrcLoc (mkFastString fpath) 1 1
arch_os = platformArchOS (targetPlatform dflags)
pflags = mkParserOpts (extensionFlags dflags)
(initDiagOpts dflags)
(supportedLanguagesAndExtensions arch_os)
(safeImportsOn dflags)
False -- lex Haddocks as comment tokens
True -- produce comment tokens
False -- produce position pragmas tokens
go :: Bool -- ^ are we currently in a pragma?
-> [T.Token] -- ^ tokens accumulated so far (in reverse)
-> P [T.Token]
go inPrag toks = do
(b, _) <- getInput
if not (atEnd b)
then do
mtok <- runMaybeT (parseCppLine <|> parsePlainTok inPrag)
(newToks, inPrag') <- case mtok of
Nothing -> unknownLine
Just a -> pure a
go inPrag' (newToks ++ toks)
else
pure toks
-- | Like 'Lexer.lexer', but slower, with a better API, and filtering out empty tokens
wrappedLexer :: P (RealLocated Lexer.Token)
wrappedLexer = Lexer.lexer False andThen
where andThen (L (RealSrcSpan s _) t)
| srcSpanStartLine s /= srcSpanEndLine s ||
srcSpanStartCol s /= srcSpanEndCol s
= pure (L s t)
andThen (L (RealSrcSpan s _) ITeof) = pure (L s ITeof)
andThen _ = wrappedLexer
-- | Try to parse a CPP line (can fail)
parseCppLine :: MaybeT P ([T.Token], Bool)
parseCppLine = MaybeT $ do
(b, l) <- getInput
case tryCppLine l b of
Just (cppBStr, l', b')
-> let cppTok = T.Token { tkType = TkCpp
, tkValue = cppBStr
, tkSpan = mkRealSrcSpan l l' }
in setInput (b', l') *> pure (Just ([cppTok], False))
_ -> return Nothing
-- | Try to parse a regular old token (can fail)
parsePlainTok :: Bool -> MaybeT P ([T.Token], Bool) -- return list is only ever 0-2 elements
parsePlainTok inPrag = do
(bInit, lInit) <- lift getInput
L sp tok <- tryP (Lexer.lexer False return)
(bEnd, _) <- lift getInput
case sp of
UnhelpfulSpan _ -> pure ([], False) -- pretend the token never existed
RealSrcSpan rsp _ -> do
let typ = if inPrag then TkPragma else classify tok
RealSrcLoc lStart _ = srcSpanStart sp -- safe since @sp@ is real
(spaceBStr, bStart) = spanPosition lInit lStart bInit
inPragDef = inPragma inPrag tok
(bEnd', inPrag') <- case tok of
-- Update internal line + file position if this is a LINE pragma
ITline_prag _ -> tryOrElse (bEnd, inPragDef) $ do
L _ (ITinteger (IL { il_value = line })) <- tryP wrappedLexer
L _ (ITstring _ file) <- tryP wrappedLexer
L spF ITclose_prag <- tryP wrappedLexer
let newLoc = mkRealSrcLoc file (fromIntegral line - 1) (srcSpanEndCol spF)
(bEnd'', _) <- lift getInput
lift $ setInput (bEnd'', newLoc)
pure (bEnd'', False)
-- Update internal column position if this is a COLUMN pragma
ITcolumn_prag _ -> tryOrElse (bEnd, inPragDef) $ do
L _ (ITinteger (IL { il_value = col })) <- tryP wrappedLexer
L spF ITclose_prag <- tryP wrappedLexer
let newLoc = mkRealSrcLoc (srcSpanFile spF) (srcSpanEndLine spF) (fromIntegral col)
(bEnd'', _) <- lift getInput
lift $ setInput (bEnd'', newLoc)
pure (bEnd'', False)
_ -> pure (bEnd, inPragDef)
let tokBStr = splitStringBuffer bStart bEnd'
plainTok = T.Token { tkType = typ
, tkValue = tokBStr
, tkSpan = rsp }
spaceTok = T.Token { tkType = TkSpace
, tkValue = spaceBStr
, tkSpan = mkRealSrcSpan lInit lStart }
pure (plainTok : [ spaceTok | not (BS.null spaceBStr) ], inPrag')
-- | Parse whatever remains of the line as an unknown token (can't fail)
unknownLine :: P ([T.Token], Bool)
unknownLine = do
(b, l) <- getInput
let (unkBStr, l', b') = spanLine l b
unkTok = T.Token { tkType = TkUnknown
, tkValue = unkBStr
, tkSpan = mkRealSrcSpan l l' }
setInput (b', l')
pure ([unkTok], False)
-- | Get the input
getInput :: P (StringBuffer, RealSrcLoc)
getInput = P $ \p@PState { buffer = buf, loc = srcLoc } -> POk p (buf, psRealLoc srcLoc)
-- | Set the input
setInput :: (StringBuffer, RealSrcLoc) -> P ()
setInput (buf, srcLoc) =
P $ \p@PState{ loc = PsLoc _ buf_loc } ->
POk (p { buffer = buf, loc = PsLoc srcLoc buf_loc }) ()
tryP :: P a -> MaybeT P a
tryP (P f) = MaybeT $ P $ \s -> case f s of
POk s' a -> POk s' (Just a)
PFailed _ -> POk s Nothing
tryOrElse :: Alternative f => a -> f a -> f a
tryOrElse x p = p <|> pure x
-- | Classify given tokens as appropriate Haskell token type.
classify :: Lexer.Token -> TokenType
classify tok =
case tok of
ITas -> TkKeyword
ITcase -> TkKeyword
ITclass -> TkKeyword
ITdata -> TkKeyword
ITdefault -> TkKeyword
ITderiving -> TkKeyword
ITdo {} -> TkKeyword
ITelse -> TkKeyword
IThiding -> TkKeyword
ITforeign -> TkKeyword
ITif -> TkKeyword
ITimport -> TkKeyword
ITin -> TkKeyword
ITinfix -> TkKeyword
ITinfixl -> TkKeyword
ITinfixr -> TkKeyword
ITinstance -> TkKeyword
ITlet -> TkKeyword
ITmodule -> TkKeyword
ITnewtype -> TkKeyword
ITof -> TkKeyword
ITqualified -> TkKeyword
ITthen -> TkKeyword
ITtype -> TkKeyword
ITvia -> TkKeyword
ITwhere -> TkKeyword
ITforall {} -> TkKeyword
ITexport -> TkKeyword
ITlabel -> TkKeyword
ITdynamic -> TkKeyword
ITsafe -> TkKeyword
ITinterruptible -> TkKeyword
ITunsafe -> TkKeyword
ITstdcallconv -> TkKeyword
ITccallconv -> TkKeyword
ITcapiconv -> TkKeyword
ITprimcallconv -> TkKeyword
ITjavascriptcallconv -> TkKeyword
ITmdo {} -> TkKeyword
ITfamily -> TkKeyword
ITrole -> TkKeyword
ITgroup -> TkKeyword
ITby -> TkKeyword
ITusing -> TkKeyword
ITpattern -> TkKeyword
ITstatic -> TkKeyword
ITstock -> TkKeyword
ITanyclass -> TkKeyword
ITunit -> TkKeyword
ITsignature -> TkKeyword
ITdependency -> TkKeyword
ITrequires -> TkKeyword
ITinline_prag {} -> TkPragma
ITspec_prag {} -> TkPragma
ITspec_inline_prag {} -> TkPragma
ITsource_prag {} -> TkPragma
ITrules_prag {} -> TkPragma
ITwarning_prag {} -> TkPragma
ITdeprecated_prag {} -> TkPragma
ITline_prag {} -> TkPragma
ITcolumn_prag {} -> TkPragma
ITscc_prag {} -> TkPragma
ITunpack_prag {} -> TkPragma
ITnounpack_prag {} -> TkPragma
ITann_prag {} -> TkPragma
ITcomplete_prag {} -> TkPragma
ITclose_prag -> TkPragma
IToptions_prag {} -> TkPragma
ITinclude_prag {} -> TkPragma
ITlanguage_prag -> TkPragma
ITminimal_prag {} -> TkPragma
IToverlappable_prag {} -> TkPragma
IToverlapping_prag {} -> TkPragma
IToverlaps_prag {} -> TkPragma
ITincoherent_prag {} -> TkPragma
ITctype {} -> TkPragma
ITdotdot -> TkGlyph
ITcolon -> TkGlyph
ITdcolon {} -> TkGlyph
ITequal -> TkGlyph
ITlam -> TkGlyph
ITlcase -> TkGlyph
ITvbar -> TkGlyph
ITlarrow {} -> TkGlyph
ITrarrow {} -> TkGlyph
ITlolly {} -> TkGlyph
ITat -> TkGlyph
ITtilde -> TkGlyph
ITdarrow {} -> TkGlyph
ITminus -> TkGlyph
ITprefixminus -> TkGlyph
ITbang -> TkGlyph
ITdot -> TkOperator
ITproj {} -> TkOperator
ITstar {} -> TkOperator
ITtypeApp -> TkGlyph
ITpercent -> TkGlyph
ITbiglam -> TkGlyph
ITocurly -> TkSpecial
ITccurly -> TkSpecial
ITvocurly -> TkSpecial
ITvccurly -> TkSpecial
ITobrack -> TkSpecial
ITopabrack -> TkSpecial
ITcpabrack -> TkSpecial
ITcbrack -> TkSpecial
IToparen -> TkSpecial
ITcparen -> TkSpecial
IToubxparen -> TkSpecial
ITcubxparen -> TkSpecial
ITsemi -> TkSpecial
ITcomma -> TkSpecial
ITunderscore -> TkIdentifier
ITbackquote -> TkSpecial
ITsimpleQuote -> TkSpecial
ITvarid {} -> TkIdentifier
ITconid {} -> TkIdentifier
ITvarsym {} -> TkOperator
ITconsym {} -> TkOperator
ITqvarid {} -> TkIdentifier
ITqconid {} -> TkIdentifier
ITqvarsym {} -> TkOperator
ITqconsym {} -> TkOperator
ITdupipvarid {} -> TkUnknown
ITlabelvarid {} -> TkUnknown
ITchar {} -> TkChar
ITstring {} -> TkString
ITinteger {} -> TkNumber
ITrational {} -> TkNumber
ITprimchar {} -> TkChar
ITprimstring {} -> TkString
ITprimint {} -> TkNumber
ITprimword {} -> TkNumber
ITprimfloat {} -> TkNumber
ITprimdouble {} -> TkNumber
ITopenExpQuote {} -> TkSpecial
ITopenPatQuote -> TkSpecial
ITopenDecQuote -> TkSpecial
ITopenTypQuote -> TkSpecial
ITcloseQuote {} -> TkSpecial
ITopenTExpQuote {} -> TkSpecial
ITcloseTExpQuote -> TkSpecial
ITdollar -> TkSpecial
ITdollardollar -> TkSpecial
ITtyQuote -> TkSpecial
ITquasiQuote {} -> TkUnknown
ITqQuasiQuote {} -> TkUnknown
ITproc -> TkKeyword
ITrec -> TkKeyword
IToparenbar {} -> TkGlyph
ITcparenbar {} -> TkGlyph
ITlarrowtail {} -> TkGlyph
ITrarrowtail {} -> TkGlyph
ITLarrowtail {} -> TkGlyph
ITRarrowtail {} -> TkGlyph
ITcomment_line_prag -> TkUnknown
ITunknown {} -> TkUnknown
ITeof -> TkUnknown
ITlineComment {} -> TkComment
ITdocComment {} -> TkComment
ITdocOptions {} -> TkComment
-- The lexer considers top-level pragmas as comments (see `pragState` in
-- the GHC lexer for more), so we have to manually reverse this. The
-- following is a hammer: it smashes _all_ pragma-like block comments into
-- pragmas.
ITblockComment c _
| isPrefixOf "{-#" c
, isSuffixOf "#-}" c -> TkPragma
| otherwise -> TkComment
-- | Classify given tokens as beginning pragmas (or not).
inPragma :: Bool -- ^ currently in pragma
-> Lexer.Token -- ^ current token
-> Bool -- ^ new information about whether we are in a pragma
inPragma _ ITclose_prag = False
inPragma True _ = True
inPragma False tok =
case tok of
ITinline_prag {} -> True
ITspec_prag {} -> True
ITspec_inline_prag {} -> True
ITsource_prag {} -> True
ITrules_prag {} -> True
ITwarning_prag {} -> True
ITdeprecated_prag {} -> True
ITline_prag {} -> True
ITcolumn_prag {} -> True
ITscc_prag {} -> True
ITunpack_prag {} -> True
ITnounpack_prag {} -> True
ITann_prag {} -> True
ITcomplete_prag {} -> True
IToptions_prag {} -> True
ITinclude_prag {} -> True
ITlanguage_prag -> True
ITminimal_prag {} -> True
IToverlappable_prag {} -> True
IToverlapping_prag {} -> True
IToverlaps_prag {} -> True
ITincoherent_prag {} -> True
ITctype {} -> True
_ -> False
|