haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440

module Haddock.Backends.Hyperlinker.Parser (parse) where

import Data.Either         ( isRight, isLeft )
import Data.List           ( foldl', isPrefixOf, isSuffixOf )
import Data.Maybe          ( maybeToList )
import Data.Char           ( isSpace )
import qualified Text.Read as R

import GHC                 ( DynFlags, addSourceToTokens )
import SrcLoc
import FastString          ( mkFastString )
import StringBuffer        ( stringToStringBuffer )
import Lexer               ( Token(..) )
import qualified Lexer as L

import Haddock.Backends.Hyperlinker.Types as T


-- | Turn source code string into a stream of more descriptive tokens.
--
-- Result should retain original file layout (including comments, whitespace,
-- etc.), i.e. the following "law" should hold:
--
-- prop> concat . map tkValue . parse = id
--
-- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v',
-- characters, since GHC transforms those into ' ' and '\n')
parse :: DynFlags -> FilePath -> String -> [T.Token]
parse dflags fp = ghcToks . processCPP dflags fp . filterCRLF
  where
    -- Remove CRLFs from source
    filterCRLF :: String -> String
    filterCRLF ('\r':'\n':cs) = '\n' : filterCRLF cs
    filterCRLF (c:cs) = c : filterCRLF cs
    filterCRLF [] = []

-- | Parse the source into tokens using the GHC lexer.
--
--   * CPP lines are removed and reinserted as line-comments
--   * top-level file pragmas are parsed as block comments (see the
--     'ITblockComment' case of 'classify' for more details)
--
processCPP :: DynFlags    -- ^ GHC's flags
           -> FilePath    -- ^ source file name (for position information)
           -> String      -- ^ source file contents
           -> [(Located L.Token, String)]
processCPP dflags fpath s = addSrc . go start . splitCPP $ s
  where
    start = mkRealSrcLoc (mkFastString fpath) 1 1
    addSrc = addSourceToTokens start (stringToStringBuffer s)

    -- Transform a list of Haskell/CPP lines into a list of tokens
    go :: RealSrcLoc -> [Either String String] -> [Located L.Token]
    go _   [] = []
    go pos ls =
      let (hLinesRight,  ls')  = span isRight ls
          (cppLinesLeft, rest) = span isLeft ls'

          hSrc   = concat [ hLine   | Right hLine  <- hLinesRight  ]
          cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ]

      in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of

           -- Stuff that fails to lex gets turned into comments
           L.PFailed _ _ss _msg ->
             let (src_pos, failed) = mkToken ITunknown pos hSrc
                 (new_pos, cpp)    = mkToken ITlineComment src_pos cppSrc
             in failed : cpp : go new_pos rest

           -- Successfully lexed
           L.POk ss toks ->
             let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc
             in toks ++ [cpp] ++ go new_pos rest

    -- Manually make a token from a 'String', advancing the cursor position
    mkToken tok start' str =
      let end = foldl' advanceSrcLoc start' str
      in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str))


-- | Split apart the initial file into Haskell source lines ('Left' entries) and
-- CPP lines ('Right' entries).
--
-- All characters in the input are present in the output:
--
-- prop> concat . map (either id id) . splitCPP = id
splitCPP :: String -> [Either String String]
splitCPP "" = []
splitCPP s | isCPPline s = Left l : splitCPP rest
           | otherwise =  Right l : splitCPP rest
  where
    ~(l, rest) = spanToNewline 0 s


-- | Heuristic to decide if a line is going to be a CPP line. This should be a
-- cheap operation since it is going to be run on every line being processed.
--
-- Right now it just checks if the first non-whitespace character in the first
-- five characters of the line is a '#':
--
-- >>> isCPPline "#define FOO 1"
-- True
--
-- >>> isCPPline "\t\t  #ifdef GHC"
-- True
--
-- >>> isCPPline "       #endif"
-- False
--
isCPPline :: String -> Bool
isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5


-- | Split a "line" off the front of a string, hopefully without cutting tokens
-- in half. I say "hopefully" because knowing what a token is requires lexing,
-- yet lexing depends on this function.
--
-- All characters in the input are present in the output:
--
-- prop> curry (++) . spanToNewLine 0 = id
spanToNewline :: Int                 -- ^ open '{-'
              -> String              -- ^ input
              -> (String, String)

-- Base case and space characters
spanToNewline _ "" = ("", "")
spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
spanToNewline n ('\\':'\n':str) =
    let (str', rest) = spanToNewline n str
    in ('\\':'\n':str', rest)

-- Block comments
spanToNewline n ('{':'-':str) =
    let (str', rest) = spanToNewline (n+1) str
    in ('{':'-':str', rest)
spanToNewline n ('-':'}':str) =
    let (str', rest) = spanToNewline (n-1) str
    in ('-':'}':str', rest)

-- When not in a block comment, try to lex a Haskell token
spanToNewline 0 str@(c:_) | ((lexed, str') : _) <- R.lex str, not (isSpace c) =
    if all (== '-') lexed && length lexed >= 2
      -- A Haskell line comment
      then case span (/= '\n') str' of
             (str'', '\n':rest) -> (lexed ++ str'' ++ "\n", rest)
             (_, _) -> (str, "") 

      -- An actual Haskell token
      else let (str'', rest) = spanToNewline 0 str'
           in (lexed ++ str'', rest)

-- In all other cases, advance one character at a time
spanToNewline n (c:str) =
    let (str', rest) = spanToNewline n str
    in (c:str', rest)


-- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of
-- Haddock's 'T.Token'.
ghcToks :: [(Located L.Token, String)] -> [T.Token]
ghcToks = reverse . (\(_,ts,_) -> ts) . foldl' go (start, [], False)
  where
    start = mkRealSrcLoc (mkFastString "lexing") 1 1

    go :: (RealSrcLoc, [T.Token], Bool)
       -- ^ current position, tokens accumulated, currently in pragma (or not)
       
       -> (Located L.Token, String)
       -- ^ next token, its content
       
       -> (RealSrcLoc, [T.Token], Bool)
       -- ^ new position, new tokens accumulated, currently in pragma (or not)

    go (pos, toks, in_prag) (L l tok, raw) =
        ( next_pos
        , classifiedTok ++ maybeToList white ++ toks
        , inPragma in_prag tok
        )
       where
         (next_pos, white) = mkWhitespace pos l
         
         classifiedTok = [ Token (classify' tok) raw rss
                         | RealSrcSpan rss <- [l]
                         , not (null raw)
                         ]
         
         classify' | in_prag = const TkPragma
                   | otherwise = classify


-- | Find the correct amount of whitespace between tokens.
mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token)
mkWhitespace prev spn =
  case spn of
    UnhelpfulSpan _ -> (prev,Nothing)
    RealSrcSpan s | null wsstring -> (end, Nothing)
                  | otherwise -> (end, Just (Token TkSpace wsstring wsspan))
      where
        start = realSrcSpanStart s
        end = realSrcSpanEnd s
        wsspan = mkRealSrcSpan prev start
        nls = srcLocLine start - srcLocLine prev
        spaces = if nls == 0 then srcLocCol start - srcLocCol prev
                             else srcLocCol start - 1
        wsstring = replicate nls '\n' ++ replicate spaces ' '


-- | Classify given tokens as appropriate Haskell token type.
classify :: L.Token -> TokenType
classify tok =
  case tok of
    ITas                   -> TkKeyword
    ITcase                 -> TkKeyword
    ITclass                -> TkKeyword
    ITdata                 -> TkKeyword
    ITdefault              -> TkKeyword
    ITderiving             -> TkKeyword
    ITdo                   -> TkKeyword
    ITelse                 -> TkKeyword
    IThiding               -> TkKeyword
    ITforeign              -> TkKeyword
    ITif                   -> TkKeyword
    ITimport               -> TkKeyword
    ITin                   -> TkKeyword
    ITinfix                -> TkKeyword
    ITinfixl               -> TkKeyword
    ITinfixr               -> TkKeyword
    ITinstance             -> TkKeyword
    ITlet                  -> TkKeyword
    ITmodule               -> TkKeyword
    ITnewtype              -> TkKeyword
    ITof                   -> TkKeyword
    ITqualified            -> TkKeyword
    ITthen                 -> TkKeyword
    ITtype                 -> TkKeyword
    ITvia                  -> TkKeyword
    ITwhere                -> TkKeyword

    ITforall            {} -> TkKeyword
    ITexport               -> TkKeyword
    ITlabel                -> TkKeyword
    ITdynamic              -> TkKeyword
    ITsafe                 -> TkKeyword
    ITinterruptible        -> TkKeyword
    ITunsafe               -> TkKeyword
    ITstdcallconv          -> TkKeyword
    ITccallconv            -> TkKeyword
    ITcapiconv             -> TkKeyword
    ITprimcallconv         -> TkKeyword
    ITjavascriptcallconv   -> TkKeyword
    ITmdo                  -> TkKeyword
    ITfamily               -> TkKeyword
    ITrole                 -> TkKeyword
    ITgroup                -> TkKeyword
    ITby                   -> TkKeyword
    ITusing                -> TkKeyword
    ITpattern              -> TkKeyword
    ITstatic               -> TkKeyword
    ITstock                -> TkKeyword
    ITanyclass             -> TkKeyword

    ITunit                 -> TkKeyword
    ITsignature            -> TkKeyword
    ITdependency           -> TkKeyword
    ITrequires             -> TkKeyword

    ITinline_prag       {} -> TkPragma
    ITspec_prag         {} -> TkPragma
    ITspec_inline_prag  {} -> TkPragma
    ITsource_prag       {} -> TkPragma
    ITrules_prag        {} -> TkPragma
    ITwarning_prag      {} -> TkPragma
    ITdeprecated_prag   {} -> TkPragma
    ITline_prag         {} -> TkPragma
    ITcolumn_prag       {} -> TkPragma
    ITscc_prag          {} -> TkPragma
    ITgenerated_prag    {} -> TkPragma
    ITcore_prag         {} -> TkPragma
    ITunpack_prag       {} -> TkPragma
    ITnounpack_prag     {} -> TkPragma
    ITann_prag          {} -> TkPragma
    ITcomplete_prag     {} -> TkPragma
    ITclose_prag           -> TkPragma
    IToptions_prag      {} -> TkPragma
    ITinclude_prag      {} -> TkPragma
    ITlanguage_prag        -> TkPragma
    ITminimal_prag      {} -> TkPragma
    IToverlappable_prag {} -> TkPragma
    IToverlapping_prag  {} -> TkPragma
    IToverlaps_prag     {} -> TkPragma
    ITincoherent_prag   {} -> TkPragma
    ITctype             {} -> TkPragma

    ITdotdot               -> TkGlyph
    ITcolon                -> TkGlyph
    ITdcolon            {} -> TkGlyph
    ITequal                -> TkGlyph
    ITlam                  -> TkGlyph
    ITlcase                -> TkGlyph
    ITvbar                 -> TkGlyph
    ITlarrow            {} -> TkGlyph
    ITrarrow            {} -> TkGlyph
    ITat                   -> TkGlyph
    ITtilde                -> TkGlyph
    ITdarrow            {} -> TkGlyph
    ITminus                -> TkGlyph
    ITbang                 -> TkGlyph
    ITdot                  -> TkOperator
    ITstar              {} -> TkOperator
    ITtypeApp              -> TkGlyph

    ITbiglam               -> TkGlyph

    ITocurly               -> TkSpecial
    ITccurly               -> TkSpecial
    ITvocurly              -> TkSpecial
    ITvccurly              -> TkSpecial
    ITobrack               -> TkSpecial
    ITopabrack             -> TkSpecial
    ITcpabrack             -> TkSpecial
    ITcbrack               -> TkSpecial
    IToparen               -> TkSpecial
    ITcparen               -> TkSpecial
    IToubxparen            -> TkSpecial
    ITcubxparen            -> TkSpecial
    ITsemi                 -> TkSpecial
    ITcomma                -> TkSpecial
    ITunderscore           -> TkIdentifier
    ITbackquote            -> TkSpecial
    ITsimpleQuote          -> TkSpecial

    ITvarid             {} -> TkIdentifier
    ITconid             {} -> TkIdentifier
    ITvarsym            {} -> TkOperator
    ITconsym            {} -> TkOperator
    ITqvarid            {} -> TkIdentifier
    ITqconid            {} -> TkIdentifier
    ITqvarsym           {} -> TkOperator
    ITqconsym           {} -> TkOperator

    ITdupipvarid        {} -> TkUnknown
    ITlabelvarid        {} -> TkUnknown

    ITchar              {} -> TkChar
    ITstring            {} -> TkString
    ITinteger           {} -> TkNumber
    ITrational          {} -> TkNumber

    ITprimchar          {} -> TkChar
    ITprimstring        {} -> TkString
    ITprimint           {} -> TkNumber
    ITprimword          {} -> TkNumber
    ITprimfloat         {} -> TkNumber
    ITprimdouble        {} -> TkNumber

    ITopenExpQuote      {} -> TkSpecial
    ITopenPatQuote         -> TkSpecial
    ITopenDecQuote         -> TkSpecial
    ITopenTypQuote         -> TkSpecial
    ITcloseQuote        {} -> TkSpecial
    ITopenTExpQuote     {} -> TkSpecial
    ITcloseTExpQuote       -> TkSpecial
    ITidEscape          {} -> TkUnknown
    ITparenEscape          -> TkSpecial
    ITidTyEscape        {} -> TkUnknown
    ITparenTyEscape        -> TkSpecial
    ITtyQuote              -> TkSpecial
    ITquasiQuote        {} -> TkUnknown
    ITqQuasiQuote       {} -> TkUnknown

    ITproc                 -> TkKeyword
    ITrec                  -> TkKeyword
    IToparenbar         {} -> TkGlyph
    ITcparenbar         {} -> TkGlyph
    ITlarrowtail        {} -> TkGlyph
    ITrarrowtail        {} -> TkGlyph
    ITLarrowtail        {} -> TkGlyph
    ITRarrowtail        {} -> TkGlyph

    ITunknown           {} -> TkUnknown
    ITeof                  -> TkUnknown

    -- Line comments are only supposed to start with '--'. Starting with '#'
    -- means that this was probably a CPP.
    ITlineComment s
      | isCPPline s        -> TkCpp
      | otherwise          -> TkComment

    ITdocCommentNext    {} -> TkComment
    ITdocCommentPrev    {} -> TkComment
    ITdocCommentNamed   {} -> TkComment
    ITdocSection        {} -> TkComment
    ITdocOptions        {} -> TkComment

    -- The lexer considers top-level pragmas as comments (see `pragState` in
    -- the GHC lexer for more), so we have to manually reverse this. The
    -- following is a hammer: it smashes _all_ pragma-like block comments into
    -- pragmas.
    ITblockComment c
      | isPrefixOf "{-#" c
      , isSuffixOf "#-}" c -> TkPragma
      | otherwise          -> TkComment

-- | Classify given tokens as beginning pragmas (or not).
inPragma :: Bool     -- ^ currently in pragma
         -> L.Token  -- ^ current token
         -> Bool     -- ^ new information about whether we are in a pragma
inPragma _ ITclose_prag = False
inPragma True _ = True
inPragma False tok =
  case tok of
    ITinline_prag       {} -> True
    ITspec_prag         {} -> True
    ITspec_inline_prag  {} -> True
    ITsource_prag       {} -> True
    ITrules_prag        {} -> True
    ITwarning_prag      {} -> True
    ITdeprecated_prag   {} -> True
    ITline_prag         {} -> True
    ITcolumn_prag       {} -> True
    ITscc_prag          {} -> True
    ITgenerated_prag    {} -> True
    ITcore_prag         {} -> True
    ITunpack_prag       {} -> True
    ITnounpack_prag     {} -> True
    ITann_prag          {} -> True
    ITcomplete_prag     {} -> True
    IToptions_prag      {} -> True
    ITinclude_prag      {} -> True
    ITlanguage_prag        -> True
    ITminimal_prag      {} -> True
    IToverlappable_prag {} -> True
    IToverlapping_prag  {} -> True
    IToverlaps_prag     {} -> True
    ITincoherent_prag   {} -> True
    ITctype             {} -> True

    _                      -> False