haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395

{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Haddock.Backends.Hyperlinker.Parser (parse) where

import Control.Applicative ( Alternative(..) )
import Data.List           ( isPrefixOf, isSuffixOf )

import qualified Data.ByteString as BS

import BasicTypes          ( IntegralLit(..) )
import DynFlags
import ErrUtils            ( emptyMessages, pprLocErrMsg )
import FastString          ( mkFastString )
import Lexer               ( P(..), ParseResult(..), PState(..), Token(..)
                           , mkPStatePure, lexer, mkParserFlags', getErrorMessages, addFatalError )
import Bag                 ( bagToList )
import Outputable          ( showSDoc, panic, text, ($$) )
import SrcLoc
import StringBuffer        ( StringBuffer, atEnd )

import Haddock.Backends.Hyperlinker.Types as T
import Haddock.GhcUtils

-- | Turn source code string into a stream of more descriptive tokens.
--
-- Result should retain original file layout (including comments,
-- whitespace, and CPP).
parse
  :: DynFlags      -- ^ Flags for this module
  -> FilePath      -- ^ Path to the source of this module
  -> BS.ByteString -- ^ Raw UTF-8 encoded source of this module
  -> [T.Token]
parse dflags fpath bs = case unP (go False []) initState of
    POk _ toks -> reverse toks
    PFailed pst ->
      let err:_ = bagToList (getErrorMessages pst dflags) in
      panic $ showSDoc dflags $
        text "Hyperlinker parse error:" $$ pprLocErrMsg err
  where

    initState = mkPStatePure pflags buf start
    buf = stringBufferFromByteString bs
    start = mkRealSrcLoc (mkFastString fpath) 1 1
    pflags = mkParserFlags' (warningFlags dflags)
                            (extensionFlags dflags)
                            (thisPackage dflags)
                            (safeImportsOn dflags)
                            False -- lex Haddocks as comment tokens
                            True  -- produce comment tokens
                            False -- produce position pragmas tokens

    go :: Bool        -- ^ are we currently in a pragma?
       -> [T.Token]   -- ^ tokens accumulated so far (in reverse)
       -> P [T.Token]
    go inPrag toks = do
      (b, _) <- getInput
      if not (atEnd b)
        then do
          (newToks, inPrag') <- parseCppLine <|> parsePlainTok inPrag <|> unknownLine
          go inPrag' (newToks ++ toks)
        else
          pure toks

    -- | Like 'Lexer.lexer', but slower, with a better API, and filtering out empty tokens
    wrappedLexer :: P (RealLocated Lexer.Token)
    wrappedLexer = Lexer.lexer False andThen
      where andThen (L (RealSrcSpan s) t)
              | srcSpanStartLine s /= srcSpanEndLine s ||
                srcSpanStartCol s /= srcSpanEndCol s
              = pure (L s t)
            andThen (L (RealSrcSpan s) ITeof) = pure (L s ITeof)
            andThen _ = wrappedLexer

    -- | Try to parse a CPP line (can fail)
    parseCppLine :: P ([T.Token], Bool)
    parseCppLine = do
      (b, l) <- getInput
      case tryCppLine l b of
        Just (cppBStr, l', b')
             -> let cppTok = T.Token { tkType = TkCpp
                                     , tkValue = cppBStr
                                     , tkSpan = mkRealSrcSpan l l' }
                in setInput (b', l') *> pure ([cppTok], False)
        _    -> empty

    -- | Try to parse a regular old token (can fail)
    parsePlainTok :: Bool -> P ([T.Token], Bool)  -- return list is only ever 0-2 elements
    parsePlainTok inPrag = do
      (bInit, lInit) <- getInput
      L sp tok <- Lexer.lexer False return
      (bEnd, _) <- getInput
      case sp of
        UnhelpfulSpan _ -> pure ([], False) -- pretend the token never existed
        RealSrcSpan rsp -> do
          let typ = if inPrag then TkPragma else classify tok
              RealSrcLoc lStart = srcSpanStart sp -- safe since @sp@ is real
              (spaceBStr, bStart) = spanPosition lInit lStart bInit
              inPragDef = inPragma inPrag tok

          (bEnd', inPrag') <- case tok of

            -- Update internal line + file position if this is a LINE pragma
            ITline_prag _ -> tryOrElse (bEnd, inPragDef) $ do
              L _ (ITinteger (IL { il_value = line })) <- wrappedLexer
              L _ (ITstring _ file)                    <- wrappedLexer
              L spF ITclose_prag                       <- wrappedLexer

              let newLoc = mkRealSrcLoc file (fromIntegral line - 1) (srcSpanEndCol spF)
              (bEnd'', _) <- getInput
              setInput (bEnd'', newLoc)

              pure (bEnd'', False)

            -- Update internal column position if this is a COLUMN pragma
            ITcolumn_prag _ -> tryOrElse (bEnd, inPragDef) $ do
              L _ (ITinteger (IL { il_value = col }))  <- wrappedLexer
              L spF ITclose_prag                       <- wrappedLexer

              let newLoc = mkRealSrcLoc (srcSpanFile spF) (srcSpanEndLine spF) (fromIntegral col)
              (bEnd'', _) <- getInput
              setInput (bEnd'', newLoc)

              pure (bEnd'', False)

            _ -> pure (bEnd, inPragDef)

          let tokBStr = splitStringBuffer bStart bEnd'
              plainTok = T.Token { tkType = typ
                                 , tkValue = tokBStr
                                 , tkSpan = rsp }
              spaceTok = T.Token { tkType = TkSpace
                                 , tkValue = spaceBStr
                                 , tkSpan = mkRealSrcSpan lInit lStart }

          pure (plainTok : [ spaceTok | not (BS.null spaceBStr) ], inPrag')

    -- | Parse whatever remains of the line as an unknown token (can't fail)
    unknownLine :: P ([T.Token], Bool)
    unknownLine = do
      (b, l) <- getInput
      let (unkBStr, l', b') = spanLine l b
          unkTok = T.Token { tkType = TkUnknown
                           , tkValue = unkBStr
                           , tkSpan = mkRealSrcSpan l l' }
      setInput (b', l')
      pure ([unkTok], False)


-- | Get the input
getInput :: P (StringBuffer, RealSrcLoc)
getInput = P $ \p@PState { buffer = buf, loc = srcLoc } -> POk p (buf, srcLoc)

-- | Set the input
setInput :: (StringBuffer, RealSrcLoc) -> P ()
setInput (buf, srcLoc) = P $ \p -> POk (p { buffer = buf, loc = srcLoc }) ()


-- | Orphan instance that adds backtracking to 'P'
instance Alternative P where
  empty = addFatalError noSrcSpan (text "Alterative.empty")
  P x <|> P y = P $ \s -> case x s of { p@POk{} -> p
                                      ; _ -> y s }

-- | Try a parser. If it fails, backtrack and return the pure value.
tryOrElse :: a -> P a -> P a
tryOrElse x p = p <|> pure x

-- | Classify given tokens as appropriate Haskell token type.
classify :: Lexer.Token -> TokenType
classify tok =
  case tok of
    ITas                   -> TkKeyword
    ITcase                 -> TkKeyword
    ITclass                -> TkKeyword
    ITdata                 -> TkKeyword
    ITdefault              -> TkKeyword
    ITderiving             -> TkKeyword
    ITdo                   -> TkKeyword
    ITelse                 -> TkKeyword
    IThiding               -> TkKeyword
    ITforeign              -> TkKeyword
    ITif                   -> TkKeyword
    ITimport               -> TkKeyword
    ITin                   -> TkKeyword
    ITinfix                -> TkKeyword
    ITinfixl               -> TkKeyword
    ITinfixr               -> TkKeyword
    ITinstance             -> TkKeyword
    ITlet                  -> TkKeyword
    ITmodule               -> TkKeyword
    ITnewtype              -> TkKeyword
    ITof                   -> TkKeyword
    ITqualified            -> TkKeyword
    ITthen                 -> TkKeyword
    ITtype                 -> TkKeyword
    ITvia                  -> TkKeyword
    ITwhere                -> TkKeyword

    ITforall            {} -> TkKeyword
    ITexport               -> TkKeyword
    ITlabel                -> TkKeyword
    ITdynamic              -> TkKeyword
    ITsafe                 -> TkKeyword
    ITinterruptible        -> TkKeyword
    ITunsafe               -> TkKeyword
    ITstdcallconv          -> TkKeyword
    ITccallconv            -> TkKeyword
    ITcapiconv             -> TkKeyword
    ITprimcallconv         -> TkKeyword
    ITjavascriptcallconv   -> TkKeyword
    ITmdo                  -> TkKeyword
    ITfamily               -> TkKeyword
    ITrole                 -> TkKeyword
    ITgroup                -> TkKeyword
    ITby                   -> TkKeyword
    ITusing                -> TkKeyword
    ITpattern              -> TkKeyword
    ITstatic               -> TkKeyword
    ITstock                -> TkKeyword
    ITanyclass             -> TkKeyword

    ITunit                 -> TkKeyword
    ITsignature            -> TkKeyword
    ITdependency           -> TkKeyword
    ITrequires             -> TkKeyword

    ITinline_prag       {} -> TkPragma
    ITspec_prag         {} -> TkPragma
    ITspec_inline_prag  {} -> TkPragma
    ITsource_prag       {} -> TkPragma
    ITrules_prag        {} -> TkPragma
    ITwarning_prag      {} -> TkPragma
    ITdeprecated_prag   {} -> TkPragma
    ITline_prag         {} -> TkPragma
    ITcolumn_prag       {} -> TkPragma
    ITscc_prag          {} -> TkPragma
    ITgenerated_prag    {} -> TkPragma
    ITcore_prag         {} -> TkPragma
    ITunpack_prag       {} -> TkPragma
    ITnounpack_prag     {} -> TkPragma
    ITann_prag          {} -> TkPragma
    ITcomplete_prag     {} -> TkPragma
    ITclose_prag           -> TkPragma
    IToptions_prag      {} -> TkPragma
    ITinclude_prag      {} -> TkPragma
    ITlanguage_prag        -> TkPragma
    ITminimal_prag      {} -> TkPragma
    IToverlappable_prag {} -> TkPragma
    IToverlapping_prag  {} -> TkPragma
    IToverlaps_prag     {} -> TkPragma
    ITincoherent_prag   {} -> TkPragma
    ITctype             {} -> TkPragma

    ITdotdot               -> TkGlyph
    ITcolon                -> TkGlyph
    ITdcolon            {} -> TkGlyph
    ITequal                -> TkGlyph
    ITlam                  -> TkGlyph
    ITlcase                -> TkGlyph
    ITvbar                 -> TkGlyph
    ITlarrow            {} -> TkGlyph
    ITrarrow            {} -> TkGlyph
    ITat                   -> TkGlyph
    ITtilde                -> TkGlyph
    ITdarrow            {} -> TkGlyph
    ITminus                -> TkGlyph
    ITbang                 -> TkGlyph
    ITdot                  -> TkOperator
    ITstar              {} -> TkOperator
    ITtypeApp              -> TkGlyph

    ITbiglam               -> TkGlyph

    ITocurly               -> TkSpecial
    ITccurly               -> TkSpecial
    ITvocurly              -> TkSpecial
    ITvccurly              -> TkSpecial
    ITobrack               -> TkSpecial
    ITopabrack             -> TkSpecial
    ITcpabrack             -> TkSpecial
    ITcbrack               -> TkSpecial
    IToparen               -> TkSpecial
    ITcparen               -> TkSpecial
    IToubxparen            -> TkSpecial
    ITcubxparen            -> TkSpecial
    ITsemi                 -> TkSpecial
    ITcomma                -> TkSpecial
    ITunderscore           -> TkIdentifier
    ITbackquote            -> TkSpecial
    ITsimpleQuote          -> TkSpecial

    ITvarid             {} -> TkIdentifier
    ITconid             {} -> TkIdentifier
    ITvarsym            {} -> TkOperator
    ITconsym            {} -> TkOperator
    ITqvarid            {} -> TkIdentifier
    ITqconid            {} -> TkIdentifier
    ITqvarsym           {} -> TkOperator
    ITqconsym           {} -> TkOperator

    ITdupipvarid        {} -> TkUnknown
    ITlabelvarid        {} -> TkUnknown

    ITchar              {} -> TkChar
    ITstring            {} -> TkString
    ITinteger           {} -> TkNumber
    ITrational          {} -> TkNumber

    ITprimchar          {} -> TkChar
    ITprimstring        {} -> TkString
    ITprimint           {} -> TkNumber
    ITprimword          {} -> TkNumber
    ITprimfloat         {} -> TkNumber
    ITprimdouble        {} -> TkNumber

    ITopenExpQuote      {} -> TkSpecial
    ITopenPatQuote         -> TkSpecial
    ITopenDecQuote         -> TkSpecial
    ITopenTypQuote         -> TkSpecial
    ITcloseQuote        {} -> TkSpecial
    ITopenTExpQuote     {} -> TkSpecial
    ITcloseTExpQuote       -> TkSpecial
    ITidEscape          {} -> TkUnknown
    ITparenEscape          -> TkSpecial
    ITidTyEscape        {} -> TkUnknown
    ITparenTyEscape        -> TkSpecial
    ITtyQuote              -> TkSpecial
    ITquasiQuote        {} -> TkUnknown
    ITqQuasiQuote       {} -> TkUnknown

    ITproc                 -> TkKeyword
    ITrec                  -> TkKeyword
    IToparenbar         {} -> TkGlyph
    ITcparenbar         {} -> TkGlyph
    ITlarrowtail        {} -> TkGlyph
    ITrarrowtail        {} -> TkGlyph
    ITLarrowtail        {} -> TkGlyph
    ITRarrowtail        {} -> TkGlyph

    ITcomment_line_prag    -> TkUnknown
    ITunknown           {} -> TkUnknown
    ITeof                  -> TkUnknown

    ITlineComment       {} -> TkComment
    ITdocCommentNext    {} -> TkComment
    ITdocCommentPrev    {} -> TkComment
    ITdocCommentNamed   {} -> TkComment
    ITdocSection        {} -> TkComment
    ITdocOptions        {} -> TkComment

    -- The lexer considers top-level pragmas as comments (see `pragState` in
    -- the GHC lexer for more), so we have to manually reverse this. The
    -- following is a hammer: it smashes _all_ pragma-like block comments into
    -- pragmas.
    ITblockComment c
      | isPrefixOf "{-#" c
      , isSuffixOf "#-}" c -> TkPragma
      | otherwise          -> TkComment

-- | Classify given tokens as beginning pragmas (or not).
inPragma :: Bool           -- ^ currently in pragma
         -> Lexer.Token    -- ^ current token
         -> Bool           -- ^ new information about whether we are in a pragma
inPragma _ ITclose_prag = False
inPragma True _ = True
inPragma False tok =
  case tok of
    ITinline_prag       {} -> True
    ITspec_prag         {} -> True
    ITspec_inline_prag  {} -> True
    ITsource_prag       {} -> True
    ITrules_prag        {} -> True
    ITwarning_prag      {} -> True
    ITdeprecated_prag   {} -> True
    ITline_prag         {} -> True
    ITcolumn_prag       {} -> True
    ITscc_prag          {} -> True
    ITgenerated_prag    {} -> True
    ITcore_prag         {} -> True
    ITunpack_prag       {} -> True
    ITnounpack_prag     {} -> True
    ITann_prag          {} -> True
    ITcomplete_prag     {} -> True
    IToptions_prag      {} -> True
    ITinclude_prag      {} -> True
    ITlanguage_prag        -> True
    ITminimal_prag      {} -> True
    IToverlappable_prag {} -> True
    IToverlapping_prag  {} -> True
    IToverlaps_prag     {} -> True
    ITincoherent_prag   {} -> True
    ITctype             {} -> True

    _                      -> False