1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
|
module Haddock.Backends.Hyperlinker.Parser (parse) where
import Data.Either ( isRight, isLeft )
import Data.List ( foldl', isPrefixOf, isSuffixOf )
import Data.Maybe ( maybeToList )
import Data.Char ( isSpace )
import qualified Text.Read as R
import GHC ( DynFlags, addSourceToTokens )
import SrcLoc
import FastString ( mkFastString )
import StringBuffer ( stringToStringBuffer )
import Lexer ( Token(..) )
import qualified Lexer as L
import Haddock.Backends.Hyperlinker.Types as T
-- | Turn source code string into a stream of more descriptive tokens.
--
-- Result should retain original file layout (including comments, whitespace,
-- etc.), i.e. the following "law" should hold:
--
-- prop> concat . map tkValue . parse = id
--
-- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v',
-- characters, since GHC transforms those into ' ' and '\n')
parse :: DynFlags -> FilePath -> String -> [T.Token]
parse dflags fp = ghcToks . processCPP dflags fp . filterCRLF
where
-- Remove CRLFs from source
filterCRLF :: String -> String
filterCRLF ('\r':'\n':cs) = '\n' : filterCRLF cs
filterCRLF (c:cs) = c : filterCRLF cs
filterCRLF [] = []
-- | Parse the source into tokens using the GHC lexer.
--
-- * CPP lines are removed and reinserted as line-comments
-- * top-level file pragmas are parsed as block comments (see the
-- 'ITblockComment' case of 'classify' for more details)
--
processCPP :: DynFlags -- ^ GHC's flags
-> FilePath -- ^ source file name (for position information)
-> String -- ^ source file contents
-> [(Located L.Token, String)]
processCPP dflags fpath s = addSrc . go start . splitCPP $ s
where
start = mkRealSrcLoc (mkFastString fpath) 1 1
addSrc = addSourceToTokens start (stringToStringBuffer s)
-- Transform a list of Haskell/CPP lines into a list of tokens
go :: RealSrcLoc -> [Either String String] -> [Located L.Token]
go _ [] = []
go pos ls =
let (hLinesRight, ls') = span isRight ls
(cppLinesLeft, rest) = span isLeft ls'
hSrc = concat [ hLine | Right hLine <- hLinesRight ]
cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ]
in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of
-- Stuff that fails to lex gets turned into comments
L.PFailed _ _ss _msg ->
let (src_pos, failed) = mkToken ITunknown pos hSrc
(new_pos, cpp) = mkToken ITlineComment src_pos cppSrc
in failed : cpp : go new_pos rest
-- Successfully lexed
L.POk ss toks ->
let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc
in toks ++ [cpp] ++ go new_pos rest
-- Manually make a token from a 'String', advancing the cursor position
mkToken tok start' str =
let end = foldl' advanceSrcLoc start' str
in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str))
-- | Split apart the initial file into Haskell source lines ('Left' entries) and
-- CPP lines ('Right' entries).
--
-- All characters in the input are present in the output:
--
-- prop> concat . map (either id id) . splitCPP = id
splitCPP :: String -> [Either String String]
splitCPP "" = []
splitCPP s | isCPPline s = Left l : splitCPP rest
| otherwise = Right l : splitCPP rest
where
~(l, rest) = spanToNewline 0 s
-- | Heuristic to decide if a line is going to be a CPP line. This should be a
-- cheap operation since it is going to be run on every line being processed.
--
-- Right now it just checks if the first non-whitespace character in the first
-- five characters of the line is a '#':
--
-- >>> isCPPline "#define FOO 1"
-- True
--
-- >>> isCPPline "\t\t #ifdef GHC"
-- True
--
-- >>> isCPPline " #endif"
-- False
--
isCPPline :: String -> Bool
isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5
-- | Split a "line" off the front of a string, hopefully without cutting tokens
-- in half. I say "hopefully" because knowing what a token is requires lexing,
-- yet lexing depends on this function.
--
-- All characters in the input are present in the output:
--
-- prop> curry (++) . spanToNewLine 0 = id
spanToNewline :: Int -- ^ open '{-'
-> String -- ^ input
-> (String, String)
-- Base case and space characters
spanToNewline _ "" = ("", "")
spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
spanToNewline n ('\\':'\n':str) =
let (str', rest) = spanToNewline n str
in ('\\':'\n':str', rest)
-- Block comments
spanToNewline n ('{':'-':str) =
let (str', rest) = spanToNewline (n+1) str
in ('{':'-':str', rest)
spanToNewline n ('-':'}':str) =
let (str', rest) = spanToNewline (n-1) str
in ('-':'}':str', rest)
-- When not in a block comment, try to lex a Haskell token
spanToNewline 0 str@(c:_) | ((lexed, str') : _) <- R.lex str, not (isSpace c) =
if all (== '-') lexed && length lexed >= 2
-- A Haskell line comment
then case span (/= '\n') str' of
(str'', '\n':rest) -> (lexed ++ str'' ++ "\n", rest)
(_, _) -> (str, "")
-- An actual Haskell token
else let (str'', rest) = spanToNewline 0 str'
in (lexed ++ str'', rest)
-- In all other cases, advance one character at a time
spanToNewline n (c:str) =
let (str', rest) = spanToNewline n str
in (c:str', rest)
-- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of
-- Haddock's 'T.Token'.
ghcToks :: [(Located L.Token, String)] -> [T.Token]
ghcToks = reverse . (\(_,ts,_) -> ts) . foldl' go (start, [], False)
where
start = mkRealSrcLoc (mkFastString "lexing") 1 1
go :: (RealSrcLoc, [T.Token], Bool)
-- ^ current position, tokens accumulated, currently in pragma (or not)
-> (Located L.Token, String)
-- ^ next token, its content
-> (RealSrcLoc, [T.Token], Bool)
-- ^ new position, new tokens accumulated, currently in pragma (or not)
go (pos, toks, in_prag) (L l tok, raw) =
( next_pos
, classifiedTok ++ maybeToList white ++ toks
, inPragma in_prag tok
)
where
(next_pos, white) = mkWhitespace pos l
classifiedTok = [ Token (classify' tok) raw rss
| RealSrcSpan rss <- [l]
, not (null raw)
]
classify' | in_prag = const TkPragma
| otherwise = classify
-- | Find the correct amount of whitespace between tokens.
mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token)
mkWhitespace prev spn =
case spn of
UnhelpfulSpan _ -> (prev,Nothing)
RealSrcSpan s | null wsstring -> (end, Nothing)
| otherwise -> (end, Just (Token TkSpace wsstring wsspan))
where
start = realSrcSpanStart s
end = realSrcSpanEnd s
wsspan = mkRealSrcSpan prev start
nls = srcLocLine start - srcLocLine prev
spaces = if nls == 0 then srcLocCol start - srcLocCol prev
else srcLocCol start - 1
wsstring = replicate nls '\n' ++ replicate spaces ' '
-- | Classify given tokens as appropriate Haskell token type.
classify :: L.Token -> TokenType
classify tok =
case tok of
ITas -> TkKeyword
ITcase -> TkKeyword
ITclass -> TkKeyword
ITdata -> TkKeyword
ITdefault -> TkKeyword
ITderiving -> TkKeyword
ITdo -> TkKeyword
ITelse -> TkKeyword
IThiding -> TkKeyword
ITforeign -> TkKeyword
ITif -> TkKeyword
ITimport -> TkKeyword
ITin -> TkKeyword
ITinfix -> TkKeyword
ITinfixl -> TkKeyword
ITinfixr -> TkKeyword
ITinstance -> TkKeyword
ITlet -> TkKeyword
ITmodule -> TkKeyword
ITnewtype -> TkKeyword
ITof -> TkKeyword
ITqualified -> TkKeyword
ITthen -> TkKeyword
ITtype -> TkKeyword
ITvia -> TkKeyword
ITwhere -> TkKeyword
ITforall {} -> TkKeyword
ITexport -> TkKeyword
ITlabel -> TkKeyword
ITdynamic -> TkKeyword
ITsafe -> TkKeyword
ITinterruptible -> TkKeyword
ITunsafe -> TkKeyword
ITstdcallconv -> TkKeyword
ITccallconv -> TkKeyword
ITcapiconv -> TkKeyword
ITprimcallconv -> TkKeyword
ITjavascriptcallconv -> TkKeyword
ITmdo -> TkKeyword
ITfamily -> TkKeyword
ITrole -> TkKeyword
ITgroup -> TkKeyword
ITby -> TkKeyword
ITusing -> TkKeyword
ITpattern -> TkKeyword
ITstatic -> TkKeyword
ITstock -> TkKeyword
ITanyclass -> TkKeyword
ITunit -> TkKeyword
ITsignature -> TkKeyword
ITdependency -> TkKeyword
ITrequires -> TkKeyword
ITinline_prag {} -> TkPragma
ITspec_prag {} -> TkPragma
ITspec_inline_prag {} -> TkPragma
ITsource_prag {} -> TkPragma
ITrules_prag {} -> TkPragma
ITwarning_prag {} -> TkPragma
ITdeprecated_prag {} -> TkPragma
ITline_prag {} -> TkPragma
ITcolumn_prag {} -> TkPragma
ITscc_prag {} -> TkPragma
ITgenerated_prag {} -> TkPragma
ITcore_prag {} -> TkPragma
ITunpack_prag {} -> TkPragma
ITnounpack_prag {} -> TkPragma
ITann_prag {} -> TkPragma
ITcomplete_prag {} -> TkPragma
ITclose_prag -> TkPragma
IToptions_prag {} -> TkPragma
ITinclude_prag {} -> TkPragma
ITlanguage_prag -> TkPragma
ITminimal_prag {} -> TkPragma
IToverlappable_prag {} -> TkPragma
IToverlapping_prag {} -> TkPragma
IToverlaps_prag {} -> TkPragma
ITincoherent_prag {} -> TkPragma
ITctype {} -> TkPragma
ITdotdot -> TkGlyph
ITcolon -> TkGlyph
ITdcolon {} -> TkGlyph
ITequal -> TkGlyph
ITlam -> TkGlyph
ITlcase -> TkGlyph
ITvbar -> TkGlyph
ITlarrow {} -> TkGlyph
ITrarrow {} -> TkGlyph
ITat -> TkGlyph
ITtilde -> TkGlyph
ITdarrow {} -> TkGlyph
ITminus -> TkGlyph
ITbang -> TkGlyph
ITdot -> TkOperator
ITtypeApp -> TkGlyph
ITbiglam -> TkGlyph
ITocurly -> TkSpecial
ITccurly -> TkSpecial
ITvocurly -> TkSpecial
ITvccurly -> TkSpecial
ITobrack -> TkSpecial
ITopabrack -> TkSpecial
ITcpabrack -> TkSpecial
ITcbrack -> TkSpecial
IToparen -> TkSpecial
ITcparen -> TkSpecial
IToubxparen -> TkSpecial
ITcubxparen -> TkSpecial
ITsemi -> TkSpecial
ITcomma -> TkSpecial
ITunderscore -> TkIdentifier
ITbackquote -> TkSpecial
ITsimpleQuote -> TkSpecial
ITvarid {} -> TkIdentifier
ITconid {} -> TkIdentifier
ITvarsym {} -> TkOperator
ITconsym {} -> TkOperator
ITqvarid {} -> TkIdentifier
ITqconid {} -> TkIdentifier
ITqvarsym {} -> TkOperator
ITqconsym {} -> TkOperator
ITdupipvarid {} -> TkUnknown
ITlabelvarid {} -> TkUnknown
ITchar {} -> TkChar
ITstring {} -> TkString
ITinteger {} -> TkNumber
ITrational {} -> TkNumber
ITprimchar {} -> TkChar
ITprimstring {} -> TkString
ITprimint {} -> TkNumber
ITprimword {} -> TkNumber
ITprimfloat {} -> TkNumber
ITprimdouble {} -> TkNumber
ITopenExpQuote {} -> TkSpecial
ITopenPatQuote -> TkSpecial
ITopenDecQuote -> TkSpecial
ITopenTypQuote -> TkSpecial
ITcloseQuote {} -> TkSpecial
ITopenTExpQuote {} -> TkSpecial
ITcloseTExpQuote -> TkSpecial
ITidEscape {} -> TkUnknown
ITparenEscape -> TkSpecial
ITidTyEscape {} -> TkUnknown
ITparenTyEscape -> TkSpecial
ITtyQuote -> TkSpecial
ITquasiQuote {} -> TkUnknown
ITqQuasiQuote {} -> TkUnknown
ITproc -> TkKeyword
ITrec -> TkKeyword
IToparenbar {} -> TkGlyph
ITcparenbar {} -> TkGlyph
ITlarrowtail {} -> TkGlyph
ITrarrowtail {} -> TkGlyph
ITLarrowtail {} -> TkGlyph
ITRarrowtail {} -> TkGlyph
ITunknown {} -> TkUnknown
ITeof -> TkUnknown
-- Line comments are only supposed to start with '--'. Starting with '#'
-- means that this was probably a CPP.
ITlineComment s
| isCPPline s -> TkCpp
| otherwise -> TkComment
ITdocCommentNext {} -> TkComment
ITdocCommentPrev {} -> TkComment
ITdocCommentNamed {} -> TkComment
ITdocSection {} -> TkComment
ITdocOptions {} -> TkComment
-- The lexer considers top-level pragmas as comments (see `pragState` in
-- the GHC lexer for more), so we have to manually reverse this. The
-- following is a hammer: it smashes _all_ pragma-like block comments into
-- pragmas.
ITblockComment c
| isPrefixOf "{-#" c
, isSuffixOf "#-}" c -> TkPragma
| otherwise -> TkComment
-- | Classify given tokens as beginning pragmas (or not).
inPragma :: Bool -- ^ currently in pragma
-> L.Token -- ^ current token
-> Bool -- ^ new information about whether we are in a pragma
inPragma _ ITclose_prag = False
inPragma True _ = True
inPragma False tok =
case tok of
ITinline_prag {} -> True
ITspec_prag {} -> True
ITspec_inline_prag {} -> True
ITsource_prag {} -> True
ITrules_prag {} -> True
ITwarning_prag {} -> True
ITdeprecated_prag {} -> True
ITline_prag {} -> True
ITcolumn_prag {} -> True
ITscc_prag {} -> True
ITgenerated_prag {} -> True
ITcore_prag {} -> True
ITunpack_prag {} -> True
ITnounpack_prag {} -> True
ITann_prag {} -> True
ITcomplete_prag {} -> True
IToptions_prag {} -> True
ITinclude_prag {} -> True
ITlanguage_prag -> True
ITminimal_prag {} -> True
IToverlappable_prag {} -> True
IToverlapping_prag {} -> True
IToverlaps_prag {} -> True
ITincoherent_prag {} -> True
ITctype {} -> True
_ -> False
|