From 4f75be94f45a0e92553eccefe56230c554333ce7 Mon Sep 17 00:00:00 2001
From: Alec Theriault <alec.theriault@gmail.com>
Date: Sun, 10 Dec 2017 12:22:21 -0800
Subject: Use the GHC lexer for the Hyperlinker backend (#714)

* Start changing to use GHC lexer

* better cpp

* Change SrcSpan to RealSrcSpan

* Remove error

* Try to stop too many open files

* wip

* wip

* Revert "wip"

This reverts commit b605510a195f26315e3d8ca90e6d95a6737553e1.

Conflicts:
	haddock-api/haddock-api.cabal
	haddock-api/src/Haddock/Interface.hs

* Remove pointless 'caching'

* Use dlist rather than lists when finding vars

* Use a map rather than list

* Delete bogus comment

* Rebase followup

Things now run using the GHC lexer. There are still

  - stray debug statements
  - unnecessary changes w.r.t. master

* Cleaned up differences w.r.t. current Haddock HEAD

Things are looking good. quasiquotes in particular look beautiful: the
TH ones (with Haskell source inside) colour/link their contents too!

Haven't yet begun to check for possible performance problems.

* Support CPP and top-level pragmas

The support for these is hackier - but no more hacky than the existing
support.

* Tests pass, CPP is better recognized

The tests were in some cases altered: I consider the new output to be more
correct than the old one....

* Fix shrinking of source without tabs in test

* Replace 'Position'/'Span' with GHC counterparts

Replaces 'Position' -> 'GHC.RealSrcLoc' and 'Span' -> 'GHC.RealSrcSpan'.

* Nits

* Forgot entry in .cabal

* Update changelog
---
 haddock-api/src/Haddock.hs                         |   2 +-
 .../src/Haddock/Backends/Hyperlinker/Ast.hs        |  19 +-
 .../src/Haddock/Backends/Hyperlinker/Parser.hs     | 534 +++++++++++++--------
 .../src/Haddock/Backends/Hyperlinker/Renderer.hs   |  14 +-
 .../src/Haddock/Backends/Hyperlinker/Types.hs      |  33 +-
 haddock-api/src/Haddock/Interface/Create.hs        |  24 +-
 6 files changed, 375 insertions(+), 251 deletions(-)

(limited to 'haddock-api/src')

diff --git a/haddock-api/src/Haddock.hs b/haddock-api/src/Haddock.hs
index f7fa52b3..3f5e5298 100644
--- a/haddock-api/src/Haddock.hs
+++ b/haddock-api/src/Haddock.hs
@@ -277,7 +277,7 @@ render dflags flags qual ifaces installedIfaces extSrcMap = do
       | Flag_HyperlinkedSource `elem` flags = Just hypSrcModuleUrlFormat
       | otherwise = srcModule
 
-    srcMap = mkSrcMap $ Map.union
+    srcMap = Map.union
       (Map.map SrcExternal extSrcMap)
       (Map.fromList [ (ifaceMod iface, SrcLocal) | iface <- ifaces ])
 
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs
index 57ff72ff..361bc15d 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs
@@ -12,6 +12,7 @@ import qualified Haddock.Syb as Syb
 import Haddock.Backends.Hyperlinker.Types
 
 import qualified GHC
+import qualified SrcLoc
 
 import Control.Applicative
 import Control.Monad (guard)
@@ -51,10 +52,10 @@ type DetailsMap = Map.Map Position (Span, TokenDetails)
 
 mkDetailsMap :: [(GHC.SrcSpan, TokenDetails)] -> DetailsMap
 mkDetailsMap xs =
-  Map.fromListWith select_details [ (start, (token_span, token_details))
+  Map.fromListWith select_details [ (start, (span, token_details))
                                   | (ghc_span, token_details) <- xs
-                                  , Just !token_span <- [ghcSrcSpanToSpan ghc_span]
-                                  , let start = spStart token_span
+                                  , GHC.RealSrcSpan span <- [ghc_span]
+                                  , let start = SrcLoc.realSrcSpanStart span
                                   ]
   where
     -- favour token details which appear earlier in the list
@@ -62,17 +63,11 @@ mkDetailsMap xs =
 
 lookupBySpan :: Span -> DetailsMap -> Maybe TokenDetails
 lookupBySpan span details = do
-  (_, (tok_span, tok_details)) <- Map.lookupLE (spStart span) details
-  guard (tok_span `containsSpan` span )
+  let pos = SrcLoc.realSrcSpanStart span
+  (_, (tok_span, tok_details)) <- Map.lookupLE pos details
+  guard (tok_span `SrcLoc.containsSpan` span)
   return tok_details
 
-ghcSrcSpanToSpan :: GHC.SrcSpan -> Maybe Span
-ghcSrcSpanToSpan (GHC.RealSrcSpan span) =
-  Just (Span { spStart = Position (GHC.srcSpanStartLine span) (GHC.srcSpanStartCol span)
-             , spEnd   = Position (GHC.srcSpanEndLine span) (GHC.srcSpanEndCol span)
-             })
-ghcSrcSpanToSpan _ = Nothing
-
 enrichToken :: Token -> DetailsMap -> Maybe TokenDetails
 enrichToken (Token typ _ spn) dm
     | typ `elem` [TkIdentifier, TkOperator] = lookupBySpan spn dm
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
index e4345602..da1555b8 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
@@ -1,11 +1,17 @@
 module Haddock.Backends.Hyperlinker.Parser (parse) where
 
+import Data.Either         ( isRight, isLeft )
+import Data.List           ( foldl', isPrefixOf, isSuffixOf )
+import Data.Maybe          ( maybeToList )
 
-import Data.Char
-import Data.List
-import Data.Maybe
+import GHC                 ( DynFlags, addSourceToTokens )
+import SrcLoc
+import FastString          ( mkFastString )
+import StringBuffer        ( stringToStringBuffer )
+import Lexer               ( Token(..) )
+import qualified Lexer as L
 
-import Haddock.Backends.Hyperlinker.Types
+import Haddock.Backends.Hyperlinker.Types as T
 
 
 -- | Turn source code string into a stream of more descriptive tokens.
@@ -13,202 +19,350 @@ import Haddock.Backends.Hyperlinker.Types
 -- Result should retain original file layout (including comments, whitespace,
 -- etc.), i.e. the following "law" should hold:
 --
--- @concat . map 'tkValue' . 'parse' = id@
-parse :: String -> [Token]
-parse = tokenize . tag . chunk
+-- prop> concat . map tkValue . parse = id
+--
+-- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v',
+-- characters, since GHC transforms those into ' ' and '\n')
+parse :: DynFlags -> FilePath -> String -> [T.Token]
+parse dflags fp s = ghcToks (processCPP dflags fp s)
+
 
--- | Split raw source string to more meaningful chunks.
+-- | Parse the source into tokens using the GHC lexer.
+--
+--   * CPP lines are removed and reinserted as line-comments
+--   * top-level file pragmas are parsed as block comments (see the
+--     'ITblockComment' case of 'classify' for more details)
 --
--- This is the initial stage of tokenization process. Each chunk is either
--- a comment (including comment delimiters), a whitespace string, preprocessor
--- macro (and all its content until the end of a line) or valid Haskell lexeme.
-chunk :: String -> [String]
-chunk [] = []
-chunk str@(c:_)
-    | isSpace c =
-        let (space, mcpp, rest) = spanSpaceOrCpp str
-        in [space] ++ maybeToList mcpp ++ chunk rest
-chunk str
-    | "--" `isPrefixOf` str = chunk' $ spanToNewline str
-    | "{-" `isPrefixOf` str = chunk' $ chunkComment 0 str
-    | otherwise = case lex' str of
-        (tok:_) -> chunk' tok
-        [] -> [str]
+processCPP :: DynFlags    -- ^ GHC's flags
+           -> FilePath    -- ^ source file name (for position information)
+           -> String      -- ^ source file contents
+           -> [(Located L.Token, String)]
+processCPP dflags fpath s = addSrc . go start . splitCPP $ s
   where
-    chunk' (c, rest) = c:(chunk rest)
+    start = mkRealSrcLoc (mkFastString fpath) 1 1
+    addSrc = addSourceToTokens start (stringToStringBuffer s)
 
--- | A bit better lexer then the default, i.e. handles DataKinds quotes
-lex' :: ReadS String
-lex' ('\'' : '\'' : rest)              = [("''", rest)]
-lex' str@('\'' : '\\' : _ : '\'' : _)  = lex str
-lex' str@('\'' : _ : '\'' : _)         = lex str
-lex' ('\'' : rest)                     = [("'", rest)]
-lex' str                               = lex str
+    -- Transform a list of Haskell/CPP lines into a list of tokens
+    go :: RealSrcLoc -> [Either String String] -> [Located L.Token]
+    go _   [] = []
+    go pos ls =
+      let (hLinesRight,  ls')  = span isRight ls
+          (cppLinesLeft, rest) = span isLeft ls'
+          
+          hSrc   = concat [ hLine   | Right hLine  <- hLinesRight  ]
+          cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ]
+      
+      in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of
 
--- | Split input to "first line" string and the rest of it.
---
--- Ideally, this should be done simply with @'break' (== '\n')@. However,
--- Haskell also allows line-unbreaking (or whatever it is called) so things
--- are not as simple and this function deals with that.
-spanToNewline :: String -> (String, String)
-spanToNewline [] = ([], [])
-spanToNewline ('\\':'\n':str) =
-    let (str', rest) = spanToNewline str
-    in ('\\':'\n':str', rest)
-spanToNewline str@('\n':_) = ("", str)
-spanToNewline (c:str) =
-    let (str', rest) = spanToNewline str
-    in (c:str', rest)
+           -- Stuff that fails to lex gets turned into comments
+           L.PFailed _ss _msg ->
+             let (src_pos, failed) = mkToken ITunknown pos hSrc
+                 (new_pos, cpp)    = mkToken ITlineComment src_pos cppSrc
+             in failed : cpp : go new_pos rest
 
--- | Split input to whitespace string, (optional) preprocessor directive and
--- the rest of it.
+           -- Successfully lexed 
+           L.POk ss toks ->
+             let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc
+             in toks ++ [cpp] ++ go new_pos rest
+ 
+    -- Manually make a token from a 'String', advancing the cursor position
+    mkToken tok start' str =
+      let end = foldl' advanceSrcLoc start' str
+      in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str))
+
+
+-- | Split apart the initial file into Haskell source lines ('Left' entries) and
+-- CPP lines ('Right' entries).
 --
--- Again, using something like @'span' 'isSpace'@ would be nice to chunk input
--- to whitespace. The problem is with /#/ symbol - if it is placed at the very
--- beginning of a line, it should be recognized as preprocessor macro. In any
--- other case, it is ordinary Haskell symbol and can be used to declare
--- operators. Hence, while dealing with whitespace we also check whether there
--- happens to be /#/ symbol just after a newline character - if that is the
--- case, we begin treating the whole line as preprocessor macro.
-spanSpaceOrCpp :: String -> (String, Maybe String, String)
-spanSpaceOrCpp ('\n':'#':str) =
-    let (str', rest) = spanToNewline str
-    in ("\n", Just $ '#':str', rest)
-spanSpaceOrCpp (c:str')
-    | isSpace c =
-        let (space, mcpp, rest) = spanSpaceOrCpp str'
-        in (c:space, mcpp, rest)
-spanSpaceOrCpp str = ("", Nothing, str)
-
--- | Split input to comment content (including delimiters) and the rest.
+-- All characters in the input are present in the output:
 --
--- Again, some more logic than simple 'span' is required because of Haskell
--- comment nesting policy.
-chunkComment :: Int -> String -> (String, String)
-chunkComment _ [] = ("", "")
-chunkComment depth ('{':'-':str) =
-    let (c, rest) = chunkComment (depth + 1) str
-    in ("{-" ++ c, rest)
-chunkComment depth ('-':'}':str)
-    | depth == 1 = ("-}", str)
-    | otherwise =
-        let (c, rest) = chunkComment (depth - 1) str
-        in ("-}" ++ c, rest)
-chunkComment depth (e:str) =
-    let (c, rest) = chunkComment depth str
-    in (e:c, rest)
-
--- | Assign source location for each chunk in given stream.
-tag :: [String] -> [(Span, String)]
-tag =
-    reverse . snd . foldl aux (Position 1 1, [])
-  where
-    aux (pos, cs) str =
-        let pos' = foldl move pos str
-        in (pos', (Span pos pos', str):cs)
-    move pos '\n' = pos { posRow = posRow pos + 1, posCol = 1 }
-    move pos _ = pos { posCol = posCol pos + 1 }
-
--- | Turn unrecognised chunk stream to more descriptive token stream.
-tokenize :: [(Span, String)] -> [Token]
-tokenize =
-    map aux
+-- prop> concat . map (either id id) . splitCPP = id
+splitCPP :: String -> [Either String String]
+splitCPP "" = []
+splitCPP s | isCPPline s = Left l : splitCPP rest
+           | otherwise =  Right l : splitCPP rest
   where
-    aux (sp, str) = Token
-        { tkType = classify str
-        , tkValue = str
-        , tkSpan = sp
-        }
+    ~(l, rest) = spanToNewline 0 s
 
--- | Classify given string as appropriate Haskell token.
+
+-- | Heuristic to decide if a line is going to be a CPP line. This should be a
+-- cheap operation since it is going to be run on every line being processed.
 --
--- This method is based on Haskell 98 Report lexical structure description:
--- https://www.haskell.org/onlinereport/lexemes.html
+-- Right now it just checks if the first non-whitespace character in the first
+-- five characters of the line is a '#':
 --
--- However, this is probably far from being perfect and most probably does not
--- handle correctly all corner cases.
-classify :: String -> TokenType
-classify str
-    | "--" `isPrefixOf` str = TkComment
-    | "{-#" `isPrefixOf` str = TkPragma
-    | "{-" `isPrefixOf` str = TkComment
-classify "''" = TkSpecial
-classify "'"  = TkSpecial
-classify str@(c:_)
-    | isSpace c = TkSpace
-    | isDigit c = TkNumber
-    | c `elem` special = TkSpecial
-    | str `elem` glyphs = TkGlyph
-    | all (`elem` symbols) str = TkOperator
-    | c == '#' = TkCpp
-    | c == '"' = TkString
-    | c == '\'' = TkChar
-classify str
-    | str `elem` keywords = TkKeyword
-    | isIdentifier str = TkIdentifier
-    | otherwise = TkUnknown
-
-keywords :: [String]
-keywords =
-    [ "as"
-    , "case"
-    , "class"
-    , "data"
-    , "default"
-    , "deriving"
-    , "do"
-    , "else"
-    , "hiding"
-    , "if"
-    , "import"
-    , "in"
-    , "infix"
-    , "infixl"
-    , "infixr"
-    , "instance"
-    , "let"
-    , "module"
-    , "newtype"
-    , "of"
-    , "qualified"
-    , "then"
-    , "type"
-    , "where"
-    , "forall"
-    , "family"
-    , "mdo"
-    ]
-
-glyphs :: [String]
-glyphs =
-    [ ".."
-    , ":"
-    , "::"
-    , "="
-    , "\\"
-    , "|"
-    , "<-"
-    , "->"
-    , "@"
-    , "~"
-    , "~#"
-    , "=>"
-    , "-"
-    , "!"
-    ]
-
-special :: [Char]
-special = "()[]{},;`"
-
--- TODO: Add support for any Unicode symbol or punctuation.
--- source: http://stackoverflow.com/questions/10548170/what-characters-are-permitted-for-haskell-operators
-symbols :: [Char]
-symbols = "!#$%&*+./<=>?@\\^|-~:"
-
-isIdentifier :: String -> Bool
-isIdentifier (s:str)
-    | (isLower' s || isUpper s) && all isAlphaNum' str = True
+-- >>> isCPPline "#define FOO 1"
+-- True
+--
+-- >>> isCPPline "\t\t  #ifdef GHC"
+-- True
+--
+-- >>> isCPPline "       #endif"
+-- False
+--
+isCPPline :: String -> Bool
+isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5
+
+
+-- | Split a "line" off the front of a string, supporting newline escapes.
+--
+-- By "line", we understand: the shortest substring ending in a '\n' that is not
+-- 
+--   1. immediately preceded by a '\\'
+--   2. not inside some (possibly nested) block comment
+--
+-- All characters in the input are present in the output:
+--
+-- prop> curry (++) . spanToNewLine 0 = id
+spanToNewline :: Int                 -- ^ open '{-'
+              -> String              -- ^ input
+              -> (String, String)
+spanToNewline _ [] = ([], [])
+spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
+spanToNewline n ('\\':'\n':str) =
+    let (str', rest) = spanToNewline n str
+    in ('\\':'\n':str', rest)
+spanToNewline n ('{':'-':str) =
+    let (str', rest) = spanToNewline (n+1) str
+    in ('{':'-':str', rest) 
+spanToNewline n ('-':'}':str) =
+    let (str', rest) = spanToNewline (n-1) str
+    in ('-':'}':str', rest) 
+spanToNewline n (c:str) =
+    let (str', rest) = spanToNewline n str
+    in (c:str', rest)
+
+
+-- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of
+-- Haddock's 'T.Token'.
+ghcToks :: [(Located L.Token, String)] -> [T.Token]
+ghcToks = reverse . snd . foldl' go (start, [])
   where
-    isLower' c = isLower c || c == '_'
-    isAlphaNum' c = isAlphaNum c || c == '_' || c == '\''
-isIdentifier _ = False
+    start = mkRealSrcLoc (mkFastString "lexing") 1 1
+
+    go :: (RealSrcLoc, [T.Token])    -- ^ current position, tokens accumulated
+       -> (Located L.Token, String)  -- ^ next token, its content
+       -> (RealSrcLoc, [T.Token])    -- ^ new position, new tokens accumulated
+    go (pos, toks) (L l tok, raw) = ( next_pos
+                                    , classifiedTok ++ maybeToList white ++ toks
+                                    )
+       where
+         (next_pos, white) = mkWhitespace pos l
+         classifiedTok = [ Token (classify tok) raw rss
+                         | RealSrcSpan rss <- [l]
+                         , not (null raw)
+                         ]
+
+-- | Find the correct amount of whitespace between tokens.
+mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token)
+mkWhitespace prev spn =
+  case spn of
+    UnhelpfulSpan _ -> (prev,Nothing)
+    RealSrcSpan s | null wsstring -> (end, Nothing)
+                  | otherwise -> (end, Just (Token TkSpace wsstring wsspan))
+      where
+        start = realSrcSpanStart s
+        end = realSrcSpanEnd s
+        wsspan = mkRealSrcSpan prev start
+        nls = srcLocLine start - srcLocLine prev
+        spaces = if nls == 0 then srcLocCol start - srcLocCol prev
+                             else srcLocCol start - 1
+        wsstring = replicate nls '\n' ++ replicate spaces ' '
+
+
+-- | Classify given string as appropriate Haskell token.
+classify :: L.Token -> TokenType
+classify tok =
+  case tok of
+    ITas                   -> TkKeyword
+    ITcase                 -> TkKeyword
+    ITclass                -> TkKeyword
+    ITdata                 -> TkKeyword
+    ITdefault              -> TkKeyword
+    ITderiving             -> TkKeyword
+    ITdo                   -> TkKeyword
+    ITelse                 -> TkKeyword
+    IThiding               -> TkKeyword
+    ITforeign              -> TkKeyword
+    ITif                   -> TkKeyword
+    ITimport               -> TkKeyword
+    ITin                   -> TkKeyword
+    ITinfix                -> TkKeyword
+    ITinfixl               -> TkKeyword
+    ITinfixr               -> TkKeyword
+    ITinstance             -> TkKeyword
+    ITlet                  -> TkKeyword
+    ITmodule               -> TkKeyword
+    ITnewtype              -> TkKeyword
+    ITof                   -> TkKeyword
+    ITqualified            -> TkKeyword
+    ITthen                 -> TkKeyword
+    ITtype                 -> TkKeyword
+    ITwhere                -> TkKeyword
+
+    ITforall {}            -> TkKeyword
+    ITexport               -> TkKeyword
+    ITlabel                -> TkKeyword
+    ITdynamic              -> TkKeyword
+    ITsafe                 -> TkKeyword
+    ITinterruptible        -> TkKeyword
+    ITunsafe               -> TkKeyword
+    ITstdcallconv          -> TkKeyword
+    ITccallconv            -> TkKeyword
+    ITcapiconv             -> TkKeyword
+    ITprimcallconv         -> TkKeyword
+    ITjavascriptcallconv   -> TkKeyword
+    ITmdo                  -> TkKeyword
+    ITfamily               -> TkKeyword
+    ITrole                 -> TkKeyword
+    ITgroup                -> TkKeyword
+    ITby                   -> TkKeyword
+    ITusing                -> TkKeyword
+    ITpattern              -> TkKeyword
+    ITstatic               -> TkKeyword
+    ITstock                -> TkKeyword
+    ITanyclass             -> TkKeyword
+
+    ITunit                 -> TkKeyword
+    ITsignature            -> TkKeyword
+    ITdependency           -> TkKeyword
+    ITrequires             -> TkKeyword
+
+    ITinline_prag       {} -> TkPragma
+    ITspec_prag         {} -> TkPragma
+    ITspec_inline_prag  {} -> TkPragma
+    ITsource_prag       {} -> TkPragma
+    ITrules_prag        {} -> TkPragma
+    ITwarning_prag      {} -> TkPragma
+    ITdeprecated_prag   {} -> TkPragma
+    ITline_prag            -> TkPragma
+    ITscc_prag          {} -> TkPragma
+    ITgenerated_prag    {} -> TkPragma
+    ITcore_prag         {} -> TkPragma
+    ITunpack_prag       {} -> TkPragma
+    ITnounpack_prag     {} -> TkPragma
+    ITann_prag          {} -> TkPragma
+    ITcomplete_prag     {} -> TkPragma
+    ITclose_prag           -> TkPragma
+    IToptions_prag      {} -> TkPragma
+    ITinclude_prag      {} -> TkPragma
+    ITlanguage_prag        -> TkPragma
+    ITvect_prag         {} -> TkPragma
+    ITvect_scalar_prag  {} -> TkPragma
+    ITnovect_prag       {} -> TkPragma
+    ITminimal_prag      {} -> TkPragma
+    IToverlappable_prag {} -> TkPragma
+    IToverlapping_prag  {} -> TkPragma
+    IToverlaps_prag     {} -> TkPragma
+    ITincoherent_prag   {} -> TkPragma
+    ITctype             {} -> TkPragma
+
+    ITdotdot               -> TkGlyph
+    ITcolon                -> TkGlyph
+    ITdcolon            {} -> TkGlyph
+    ITequal                -> TkGlyph
+    ITlam                  -> TkGlyph
+    ITlcase                -> TkGlyph
+    ITvbar                 -> TkGlyph
+    ITlarrow            {} -> TkGlyph
+    ITrarrow            {} -> TkGlyph
+    ITat                   -> TkGlyph
+    ITtilde                -> TkGlyph
+    ITtildehsh             -> TkGlyph
+    ITdarrow            {} -> TkGlyph
+    ITminus                -> TkGlyph
+    ITbang                 -> TkGlyph
+    ITdot                  -> TkOperator
+    ITtypeApp              -> TkGlyph
+
+    ITbiglam               -> TkGlyph
+
+    ITocurly               -> TkSpecial
+    ITccurly               -> TkSpecial
+    ITvocurly              -> TkSpecial
+    ITvccurly              -> TkSpecial
+    ITobrack               -> TkSpecial
+    ITopabrack             -> TkSpecial
+    ITcpabrack             -> TkSpecial
+    ITcbrack               -> TkSpecial
+    IToparen               -> TkSpecial
+    ITcparen               -> TkSpecial
+    IToubxparen            -> TkSpecial
+    ITcubxparen            -> TkSpecial
+    ITsemi                 -> TkSpecial
+    ITcomma                -> TkSpecial
+    ITunderscore           -> TkIdentifier
+    ITbackquote            -> TkSpecial
+    ITsimpleQuote          -> TkSpecial
+
+    ITvarid             {} -> TkIdentifier
+    ITconid             {} -> TkIdentifier
+    ITvarsym            {} -> TkOperator
+    ITconsym            {} -> TkOperator
+    ITqvarid            {} -> TkIdentifier
+    ITqconid            {} -> TkIdentifier
+    ITqvarsym           {} -> TkOperator
+    ITqconsym           {} -> TkOperator
+
+    ITdupipvarid        {} -> TkUnknown
+    ITlabelvarid        {} -> TkUnknown
+
+    ITchar              {} -> TkChar
+    ITstring            {} -> TkString
+    ITinteger           {} -> TkNumber
+    ITrational          {} -> TkNumber
+
+    ITprimchar          {} -> TkChar
+    ITprimstring        {} -> TkString
+    ITprimint           {} -> TkNumber
+    ITprimword          {} -> TkNumber
+    ITprimfloat         {} -> TkNumber
+    ITprimdouble        {} -> TkNumber
+
+    ITopenExpQuote      {} -> TkSpecial
+    ITopenPatQuote         -> TkSpecial
+    ITopenDecQuote         -> TkSpecial
+    ITopenTypQuote         -> TkSpecial
+    ITcloseQuote        {} -> TkSpecial
+    ITopenTExpQuote     {} -> TkSpecial
+    ITcloseTExpQuote       -> TkSpecial
+    ITidEscape          {} -> TkUnknown
+    ITparenEscape          -> TkSpecial
+    ITidTyEscape        {} -> TkUnknown
+    ITparenTyEscape        -> TkSpecial
+    ITtyQuote              -> TkSpecial
+    ITquasiQuote {}        -> TkUnknown
+    ITqQuasiQuote {}       -> TkUnknown
+
+    ITproc                 -> TkKeyword
+    ITrec                  -> TkKeyword
+    IToparenbar         {} -> TkGlyph
+    ITcparenbar         {} -> TkGlyph
+    ITlarrowtail        {} -> TkGlyph
+    ITrarrowtail        {} -> TkGlyph
+    ITLarrowtail        {} -> TkGlyph
+    ITRarrowtail        {} -> TkGlyph
+
+    ITunknown           {} -> TkUnknown
+    ITeof                  -> TkUnknown
+
+    -- Line comments are only supposed to start with '--'. Starting with '#'
+    -- means that this was probably a CPP.
+    ITlineComment s
+      | isCPPline s        -> TkCpp
+      | otherwise          -> TkComment
+
+    ITdocCommentNext    {} -> TkComment
+    ITdocCommentPrev    {} -> TkComment
+    ITdocCommentNamed   {} -> TkComment
+    ITdocSection        {} -> TkComment
+    ITdocOptions        {} -> TkComment
+
+    -- The lexer considers top-level pragmas as comments (see `pragState` in
+    -- the GHC lexer for more), so we have to manually reverse this. The
+    -- following is a hammer: it smashes _all_ pragma-like block comments into
+    -- pragmas.
+    ITblockComment c
+      | isPrefixOf "{-#" c
+      , isSuffixOf "#-}" c -> TkPragma
+      | otherwise          -> TkComment
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs
index 15793f0c..27bf7605 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs
@@ -82,7 +82,7 @@ header mcss mjs =
 
 tokenGroup :: SrcMap -> TokenGroup -> Html
 tokenGroup _ (GrpNormal tok@(Token { .. }))
-    | tkType == TkSpace = renderSpace (posRow . spStart $ tkSpan) tkValue
+    | tkType == TkSpace = renderSpace (GHC.srcSpanStartLine tkSpan) tkValue
     | otherwise = tokenSpan tok ! attrs
   where
     attrs = [ multiclass . tokenStyle $ tkType ]
@@ -155,7 +155,7 @@ internalHyperlink name content =
     Html.anchor content ! [ Html.href $ "#" ++ internalAnchorIdent name ]
 
 externalNameHyperlink :: SrcMap -> GHC.Name -> Html -> Html
-externalNameHyperlink (srcs, _) name content = case Map.lookup mdl srcs of
+externalNameHyperlink srcs name content = case Map.lookup mdl srcs of
     Just SrcLocal -> Html.anchor content !
         [ Html.href $ hypSrcModuleNameUrl mdl name ]
     Just (SrcExternal path) -> Html.anchor content !
@@ -165,12 +165,14 @@ externalNameHyperlink (srcs, _) name content = case Map.lookup mdl srcs of
     mdl = GHC.nameModule name
 
 externalModHyperlink :: SrcMap -> GHC.ModuleName -> Html -> Html
-externalModHyperlink (_, srcs) name content = case Map.lookup name srcs of
-    Just SrcLocal -> Html.anchor content !
+externalModHyperlink srcs name content =
+    let srcs' = Map.mapKeys GHC.moduleName srcs in
+    case Map.lookup name srcs' of
+      Just SrcLocal -> Html.anchor content !
         [ Html.href $ hypSrcModuleUrl' name ]
-    Just (SrcExternal path) -> Html.anchor content !
+      Just (SrcExternal path) -> Html.anchor content !
         [ Html.href $ path </> hypSrcModuleUrl' name ]
-    Nothing -> content
+      Nothing -> content
 
 
 renderSpace :: Int -> String -> Html
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs
index d8ae89e4..e377471e 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs
@@ -4,8 +4,6 @@ module Haddock.Backends.Hyperlinker.Types where
 import qualified GHC
 
 import Data.Map (Map)
-import qualified Data.Map as Map
-
 
 data Token = Token
     { tkType :: TokenType
@@ -14,23 +12,8 @@ data Token = Token
     }
     deriving (Show)
 
-data Position = Position
-    { posRow :: !Int
-    , posCol :: !Int
-    }
-    deriving (Eq, Ord, Show)
-
-data Span = Span
-    { spStart :: !Position
-    , spEnd   :: !Position
-    }
-    deriving (Show)
-
--- | Tests whether the first span "contains" the other span, meaning
--- that it covers at least as much source code. True where spans are equal.
-containsSpan :: Span -> Span -> Bool
-containsSpan s1 s2 =
-  spStart s1 <= spStart s2 && spEnd s1 >= spEnd s2
+type Position = GHC.RealSrcLoc
+type Span = GHC.RealSrcSpan
 
 data TokenType
     = TkIdentifier
@@ -80,15 +63,5 @@ data SrcPath
     | SrcLocal
 
 -- | Mapping from modules to cross-package source paths.
---
--- This mapping is actually a pair of maps instead of just one map. The reason
--- for this is because when hyperlinking modules in import lists we have no
--- 'GHC.Module' available. On the other hand, we can't just use map with
--- 'GHC.ModuleName' as indices because certain modules may have common name
--- but originate in different packages. Hence, we use both /rich/ and /poor/
--- versions, where the /poor/ is just projection of /rich/ one cached in pair
--- for better performance.
-type SrcMap = (Map GHC.Module SrcPath, Map GHC.ModuleName SrcPath)
+type SrcMap = Map GHC.Module SrcPath
 
-mkSrcMap :: Map GHC.Module SrcPath -> SrcMap
-mkSrcMap srcs = (srcs, Map.mapKeys GHC.moduleName srcs)
diff --git a/haddock-api/src/Haddock/Interface/Create.hs b/haddock-api/src/Haddock/Interface/Create.hs
index 2a56e87a..4309163f 100644
--- a/haddock-api/src/Haddock/Interface/Create.hs
+++ b/haddock-api/src/Haddock/Interface/Create.hs
@@ -20,7 +20,6 @@
 module Haddock.Interface.Create (createInterface) where
 
 import Documentation.Haddock.Doc (metaDocAppend)
-import Documentation.Haddock.Utf8 as Utf8
 import Haddock.Types
 import Haddock.Options
 import Haddock.GhcUtils
@@ -33,15 +32,14 @@ import Haddock.Backends.Hyperlinker.Parser as Hyperlinker
 
 import Data.Bifunctor
 import Data.Bitraversable
-import qualified Data.ByteString as BS
 import qualified Data.Map as M
 import Data.Map (Map)
 import Data.List
 import Data.Maybe
 import Data.Ord
 import Control.Applicative
-import Control.Exception (evaluate)
 import Control.Monad
+import Control.DeepSeq
 import Data.Traversable
 
 import Avail hiding (avail)
@@ -160,7 +158,7 @@ createInterface tm flags modMap instIfaceMap = do
 
   modWarn <- liftErrMsg (moduleWarning dflags gre warnings)
 
-  tokenizedSrc <- mkMaybeTokenizedSrc flags tm
+  tokenizedSrc <- mkMaybeTokenizedSrc dflags flags tm
 
   return $! Interface {
     ifaceMod               = mdl
@@ -1137,12 +1135,12 @@ seqList :: [a] -> ()
 seqList [] = ()
 seqList (x : xs) = x `seq` seqList xs
 
-mkMaybeTokenizedSrc :: [Flag] -> TypecheckedModule
+mkMaybeTokenizedSrc :: DynFlags -> [Flag] -> TypecheckedModule
                     -> ErrMsgGhc (Maybe [RichToken])
-mkMaybeTokenizedSrc flags tm
+mkMaybeTokenizedSrc dflags flags tm
     | Flag_HyperlinkedSource `elem` flags = case renamedSource tm of
         Just src -> do
-            tokens <- liftGhcToErrMsgGhc . liftIO $ mkTokenizedSrc summary src
+            tokens <- liftGhcToErrMsgGhc (liftIO (mkTokenizedSrc dflags summary src))
             return $ Just tokens
         Nothing -> do
             liftErrMsg . tell . pure $ concat
@@ -1155,12 +1153,14 @@ mkMaybeTokenizedSrc flags tm
   where
     summary = pm_mod_summary . tm_parsed_module $ tm
 
-mkTokenizedSrc :: ModSummary -> RenamedSource -> IO [RichToken]
-mkTokenizedSrc ms src = do
-  -- make sure to read the whole file at once otherwise
+mkTokenizedSrc :: DynFlags -> ModSummary -> RenamedSource -> IO [RichToken]
+mkTokenizedSrc dflags ms src = do
+  -- make sure to read the whole file at once otherwise 
   -- we run out of file descriptors (see #495)
-  rawSrc <- BS.readFile (msHsFilePath ms) >>= evaluate
-  return $ Hyperlinker.enrich src (Hyperlinker.parse (decodeUtf8 rawSrc))
+  file <- force <$> readFile (filepath)
+  return $ Hyperlinker.enrich src (Hyperlinker.parse dflags filepath file)
+  where
+    filepath = msHsFilePath ms
 
 -- | Find a stand-alone documentation comment by its name.
 findNamedDoc :: String -> [HsDecl GhcRn] -> ErrMsgM (Maybe HsDocString)
-- 
cgit v1.2.3