4 files changed, 476 insertions, 293 deletions
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs
index 56137f51..0ecf7109 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs
@@ -12,6 +12,7 @@ import qualified Haddock.Syb as Syb
 import Haddock.Backends.Hyperlinker.Types
 
 import qualified GHC
+import qualified SrcLoc
 import qualified Outputable as GHC
 
 import Control.Applicative
@@ -52,10 +53,10 @@ type DetailsMap = Map.Map Position (Span, TokenDetails)
 
 mkDetailsMap :: [(GHC.SrcSpan, TokenDetails)] -> DetailsMap
 mkDetailsMap xs =
-  Map.fromListWith select_details [ (start, (token_span, token_details))
+  Map.fromListWith select_details [ (start, (span, token_details))
                                   | (ghc_span, token_details) <- xs
-                                  , Just !token_span <- [ghcSrcSpanToSpan ghc_span]
-                                  , let start = spStart token_span
+                                  , GHC.RealSrcSpan span <- [ghc_span]
+                                  , let start = SrcLoc.realSrcSpanStart span
                                   ]
   where
     -- favour token details which appear earlier in the list
@@ -63,17 +64,11 @@ mkDetailsMap xs =
 
 lookupBySpan :: Span -> DetailsMap -> Maybe TokenDetails
 lookupBySpan span details = do
-  (_, (tok_span, tok_details)) <- Map.lookupLE (spStart span) details
-  guard (tok_span `containsSpan` span )
+  let pos = SrcLoc.realSrcSpanStart span
+  (_, (tok_span, tok_details)) <- Map.lookupLE pos details
+  guard (tok_span `SrcLoc.containsSpan` span)
   return tok_details
 
-ghcSrcSpanToSpan :: GHC.SrcSpan -> Maybe Span
-ghcSrcSpanToSpan (GHC.RealSrcSpan span) =
-  Just (Span { spStart = Position (GHC.srcSpanStartLine span) (GHC.srcSpanStartCol span)
-             , spEnd   = Position (GHC.srcSpanEndLine span) (GHC.srcSpanEndCol span)
-             })
-ghcSrcSpanToSpan _ = Nothing
-
 enrichToken :: Token -> DetailsMap -> Maybe TokenDetails
 enrichToken (Token typ _ spn) dm
     | typ `elem` [TkIdentifier, TkOperator] = lookupBySpan spn dm
@@ -99,9 +94,12 @@ variables =
 types :: GHC.RenamedSource -> LTokenDetails
 types = everythingInRenamedSource ty
   where
+    ty :: forall a. Data a => a -> [(GHC.SrcSpan, TokenDetails)]
     ty term = case cast term of
         (Just ((GHC.L sspan (GHC.HsTyVar _ _ name)) :: GHC.LHsType GHC.GhcRn)) ->
             pure (sspan, RtkType (GHC.unLoc name))
+        (Just ((GHC.L sspan (GHC.HsOpTy _ l name r)) :: GHC.LHsType GHC.GhcRn)) ->
+            (sspan, RtkType (GHC.unLoc name)):(ty l ++ ty r)
         _ -> empty
 
 -- | Obtain details map for identifier bindings.
@@ -117,6 +115,11 @@ binds = everythingInRenamedSource
     fun term = case cast term of
         (Just (GHC.FunBind _ (GHC.L sspan name) _ _ _ :: GHC.HsBind GHC.GhcRn)) ->
             pure (sspan, RtkBind name)
+        (Just (GHC.PatSynBind _ (GHC.PSB _ (GHC.L sspan name) args _ _))) ->
+            pure (sspan, RtkBind name) ++ everythingInRenamedSource patsyn_binds args
+        _ -> empty
+    patsyn_binds term = case cast term of
+        (Just (GHC.L sspan (name :: GHC.Name))) -> pure (sspan, RtkVar name)
         _ -> empty
     pat term = case cast term of
         (Just ((GHC.L sspan (GHC.VarPat _ name)) :: GHC.LPat GHC.GhcRn)) ->
@@ -142,6 +145,7 @@ decls :: GHC.RenamedSource -> LTokenDetails
 decls (group, _, _, _) = concatMap ($ group)
     [ concat . map typ . concat . map GHC.group_tyclds . GHC.hs_tyclds
     , everythingInRenamedSource fun . GHC.hs_valds
+    , everythingInRenamedSource fix . GHC.hs_fixds
     , everythingInRenamedSource (con `Syb.combine` ins)
     ]
   where
@@ -149,11 +153,16 @@ decls (group, _, _, _) = concatMap ($ group)
         GHC.DataDecl { tcdLName = name } -> pure . decl $ name
         GHC.SynDecl _ name _ _ _ -> pure . decl $ name
         GHC.FamDecl _ fam -> pure . decl $ GHC.fdLName fam
-        GHC.ClassDecl{..} -> [decl tcdLName] ++ concatMap sig tcdSigs
+        GHC.ClassDecl{..} ->
+          [decl tcdLName]
+            ++ concatMap sig tcdSigs
+            ++ concatMap tyfam tcdATs
         GHC.XTyClDecl {} -> GHC.panic "haddock:decls"
     fun term = case cast term of
         (Just (GHC.FunBind _ (GHC.L sspan name) _ _ _ :: GHC.HsBind GHC.GhcRn))
             | GHC.isExternalName name -> pure (sspan, RtkDecl name)
+        (Just (GHC.PatSynBind _ (GHC.PSB _ (GHC.L sspan name) _ _ _)))
+            | GHC.isExternalName name -> pure (sspan, RtkDecl name)
         _ -> empty
     con term = case cast term of
         (Just (cdcl :: GHC.ConDecl GHC.GhcRn)) ->
@@ -171,7 +180,17 @@ decls (group, _, _, _) = concatMap ($ group)
         Just (field :: GHC.ConDeclField GHC.GhcRn)
           -> map (decl . fmap GHC.extFieldOcc) $ GHC.cd_fld_names field
         Nothing -> empty
+    fix term = case cast term of
+        Just ((GHC.FixitySig _ names _) :: GHC.FixitySig GHC.GhcRn)
+          -> map (\(GHC.L sspan x) -> (sspan, RtkVar x)) names
+        Just ((GHC.XFixitySig {}) :: GHC.FixitySig GHC.GhcRn)
+          -> GHC.panic "haddock:decls"
+        Nothing -> empty
+    tyfam (GHC.L _ (GHC.FamilyDecl{..})) = [decl fdLName]
+    tyfam (GHC.L _ (GHC.XFamilyDecl {})) = GHC.panic "haddock:dels"
     sig (GHC.L _ (GHC.TypeSig _ names _)) = map decl names
+    sig (GHC.L _ (GHC.PatSynSig _ names _)) = map decl names
+    sig (GHC.L _ (GHC.ClassOpSig _ _ names _)) = map decl names
     sig _ = []
     decl (GHC.L sspan name) = (sspan, RtkDecl name)
     tyref (GHC.L sspan name) = (sspan, RtkType name)
@@ -190,10 +209,11 @@ imports src@(_, imps, _, _) =
         (Just (GHC.IEThingAll _ t)) -> pure $ typ $ GHC.ieLWrappedName t
         (Just (GHC.IEThingWith _ t _ vs _fls)) ->
           [typ $ GHC.ieLWrappedName t] ++ map (var . GHC.ieLWrappedName) vs
+        (Just (GHC.IEModuleContents _ m)) -> pure $ modu m
         _ -> empty
     typ (GHC.L sspan name) = (sspan, RtkType name)
     var (GHC.L sspan name) = (sspan, RtkVar name)
-    imp idecl | not . GHC.ideclImplicit $ idecl =
-        let (GHC.L sspan name) = GHC.ideclName idecl
-        in Just (sspan, RtkModule name)
-    imp _ = Nothing
+    modu (GHC.L sspan name) = (sspan, RtkModule name)
+    imp idecl
+      | not . GHC.ideclImplicit $ idecl = Just (modu (GHC.ideclName idecl))
+      | otherwise = Nothing
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
index e4345602..e7ecac73 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
@@ -1,11 +1,19 @@
 module Haddock.Backends.Hyperlinker.Parser (parse) where
 
+import Data.Either         ( isRight, isLeft )
+import Data.List           ( foldl', isPrefixOf, isSuffixOf )
+import Data.Maybe          ( maybeToList )
+import Data.Char           ( isSpace )
+import qualified Text.Read as R
 
-import Data.Char
-import Data.List
-import Data.Maybe
+import GHC                 ( DynFlags, addSourceToTokens )
+import SrcLoc
+import FastString          ( mkFastString )
+import StringBuffer        ( stringToStringBuffer )
+import Lexer               ( Token(..) )
+import qualified Lexer as L
 
-import Haddock.Backends.Hyperlinker.Types
+import Haddock.Backends.Hyperlinker.Types as T
 
 
 -- | Turn source code string into a stream of more descriptive tokens.
@@ -13,202 +21,419 @@ import Haddock.Backends.Hyperlinker.Types
 -- Result should retain original file layout (including comments, whitespace,
 -- etc.), i.e. the following "law" should hold:
 --
--- @concat . map 'tkValue' . 'parse' = id@
-parse :: String -> [Token]
-parse = tokenize . tag . chunk
+-- prop> concat . map tkValue . parse = id
+--
+-- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v',
+-- characters, since GHC transforms those into ' ' and '\n')
+parse :: DynFlags -> FilePath -> String -> [T.Token]
+parse dflags fp = ghcToks . processCPP dflags fp . filterCRLF
+  where
+    -- Remove CRLFs from source
+    filterCRLF :: String -> String
+    filterCRLF ('\r':'\n':cs) = '\n' : filterCRLF cs
+    filterCRLF (c:cs) = c : filterCRLF cs
+    filterCRLF [] = []
 
--- | Split raw source string to more meaningful chunks.
+-- | Parse the source into tokens using the GHC lexer.
 --
--- This is the initial stage of tokenization process. Each chunk is either
--- a comment (including comment delimiters), a whitespace string, preprocessor
--- macro (and all its content until the end of a line) or valid Haskell lexeme.
-chunk :: String -> [String]
-chunk [] = []
-chunk str@(c:_)
-    | isSpace c =
-        let (space, mcpp, rest) = spanSpaceOrCpp str
-        in [space] ++ maybeToList mcpp ++ chunk rest
-chunk str
-    | "--" `isPrefixOf` str = chunk' $ spanToNewline str
-    | "{-" `isPrefixOf` str = chunk' $ chunkComment 0 str
-    | otherwise = case lex' str of
-        (tok:_) -> chunk' tok
-        [] -> [str]
+--   * CPP lines are removed and reinserted as line-comments
+--   * top-level file pragmas are parsed as block comments (see the
+--     'ITblockComment' case of 'classify' for more details)
+--
+processCPP :: DynFlags    -- ^ GHC's flags
+           -> FilePath    -- ^ source file name (for position information)
+           -> String      -- ^ source file contents
+           -> [(Located L.Token, String)]
+processCPP dflags fpath s = addSrc . go start . splitCPP $ s
   where
-    chunk' (c, rest) = c:(chunk rest)
+    start = mkRealSrcLoc (mkFastString fpath) 1 1
+    addSrc = addSourceToTokens start (stringToStringBuffer s)
 
--- | A bit better lexer then the default, i.e. handles DataKinds quotes
-lex' :: ReadS String
-lex' ('\'' : '\'' : rest)              = [("''", rest)]
-lex' str@('\'' : '\\' : _ : '\'' : _)  = lex str
-lex' str@('\'' : _ : '\'' : _)         = lex str
-lex' ('\'' : rest)                     = [("'", rest)]
-lex' str                               = lex str
+    -- Transform a list of Haskell/CPP lines into a list of tokens
+    go :: RealSrcLoc -> [Either String String] -> [Located L.Token]
+    go _   [] = []
+    go pos ls =
+      let (hLinesRight,  ls')  = span isRight ls
+          (cppLinesLeft, rest) = span isLeft ls'
 
--- | Split input to "first line" string and the rest of it.
---
--- Ideally, this should be done simply with @'break' (== '\n')@. However,
--- Haskell also allows line-unbreaking (or whatever it is called) so things
--- are not as simple and this function deals with that.
-spanToNewline :: String -> (String, String)
-spanToNewline [] = ([], [])
-spanToNewline ('\\':'\n':str) =
-    let (str', rest) = spanToNewline str
-    in ('\\':'\n':str', rest)
-spanToNewline str@('\n':_) = ("", str)
-spanToNewline (c:str) =
-    let (str', rest) = spanToNewline str
-    in (c:str', rest)
+          hSrc   = concat [ hLine   | Right hLine  <- hLinesRight  ]
+          cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ]
+
+      in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of
 
--- | Split input to whitespace string, (optional) preprocessor directive and
--- the rest of it.
+           -- Stuff that fails to lex gets turned into comments
+           L.PFailed _ _ss _msg ->
+             let (src_pos, failed) = mkToken ITunknown pos hSrc
+                 (new_pos, cpp)    = mkToken ITlineComment src_pos cppSrc
+             in failed : cpp : go new_pos rest
+
+           -- Successfully lexed
+           L.POk ss toks ->
+             let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc
+             in toks ++ [cpp] ++ go new_pos rest
+
+    -- Manually make a token from a 'String', advancing the cursor position
+    mkToken tok start' str =
+      let end = foldl' advanceSrcLoc start' str
+      in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str))
+
+
+-- | Split apart the initial file into Haskell source lines ('Left' entries) and
+-- CPP lines ('Right' entries).
 --
--- Again, using something like @'span' 'isSpace'@ would be nice to chunk input
--- to whitespace. The problem is with /#/ symbol - if it is placed at the very
--- beginning of a line, it should be recognized as preprocessor macro. In any
--- other case, it is ordinary Haskell symbol and can be used to declare
--- operators. Hence, while dealing with whitespace we also check whether there
--- happens to be /#/ symbol just after a newline character - if that is the
--- case, we begin treating the whole line as preprocessor macro.
-spanSpaceOrCpp :: String -> (String, Maybe String, String)
-spanSpaceOrCpp ('\n':'#':str) =
-    let (str', rest) = spanToNewline str
-    in ("\n", Just $ '#':str', rest)
-spanSpaceOrCpp (c:str')
-    | isSpace c =
-        let (space, mcpp, rest) = spanSpaceOrCpp str'
-        in (c:space, mcpp, rest)
-spanSpaceOrCpp str = ("", Nothing, str)
-
--- | Split input to comment content (including delimiters) and the rest.
+-- All characters in the input are present in the output:
 --
--- Again, some more logic than simple 'span' is required because of Haskell
--- comment nesting policy.
-chunkComment :: Int -> String -> (String, String)
-chunkComment _ [] = ("", "")
-chunkComment depth ('{':'-':str) =
-    let (c, rest) = chunkComment (depth + 1) str
-    in ("{-" ++ c, rest)
-chunkComment depth ('-':'}':str)
-    | depth == 1 = ("-}", str)
-    | otherwise =
-        let (c, rest) = chunkComment (depth - 1) str
-        in ("-}" ++ c, rest)
-chunkComment depth (e:str) =
-    let (c, rest) = chunkComment depth str
-    in (e:c, rest)
-
--- | Assign source location for each chunk in given stream.
-tag :: [String] -> [(Span, String)]
-tag =
-    reverse . snd . foldl aux (Position 1 1, [])
+-- prop> concat . map (either id id) . splitCPP = id
+splitCPP :: String -> [Either String String]
+splitCPP "" = []
+splitCPP s | isCPPline s = Left l : splitCPP rest
+           | otherwise =  Right l : splitCPP rest
   where
-    aux (pos, cs) str =
-        let pos' = foldl move pos str
-        in (pos', (Span pos pos', str):cs)
-    move pos '\n' = pos { posRow = posRow pos + 1, posCol = 1 }
-    move pos _ = pos { posCol = posCol pos + 1 }
-
--- | Turn unrecognised chunk stream to more descriptive token stream.
-tokenize :: [(Span, String)] -> [Token]
-tokenize =
-    map aux
-  where
-    aux (sp, str) = Token
-        { tkType = classify str
-        , tkValue = str
-        , tkSpan = sp
-        }
+    ~(l, rest) = spanToNewline 0 s
+
 
--- | Classify given string as appropriate Haskell token.
+-- | Heuristic to decide if a line is going to be a CPP line. This should be a
+-- cheap operation since it is going to be run on every line being processed.
+--
+-- Right now it just checks if the first non-whitespace character in the first
+-- five characters of the line is a '#':
 --
--- This method is based on Haskell 98 Report lexical structure description:
--- https://www.haskell.org/onlinereport/lexemes.html
+-- >>> isCPPline "#define FOO 1"
+-- True
 --
--- However, this is probably far from being perfect and most probably does not
--- handle correctly all corner cases.
-classify :: String -> TokenType
-classify str
-    | "--" `isPrefixOf` str = TkComment
-    | "{-#" `isPrefixOf` str = TkPragma
-    | "{-" `isPrefixOf` str = TkComment
-classify "''" = TkSpecial
-classify "'"  = TkSpecial
-classify str@(c:_)
-    | isSpace c = TkSpace
-    | isDigit c = TkNumber
-    | c `elem` special = TkSpecial
-    | str `elem` glyphs = TkGlyph
-    | all (`elem` symbols) str = TkOperator
-    | c == '#' = TkCpp
-    | c == '"' = TkString
-    | c == '\'' = TkChar
-classify str
-    | str `elem` keywords = TkKeyword
-    | isIdentifier str = TkIdentifier
-    | otherwise = TkUnknown
-
-keywords :: [String]
-keywords =
-    [ "as"
-    , "case"
-    , "class"
-    , "data"
-    , "default"
-    , "deriving"
-    , "do"
-    , "else"
-    , "hiding"
-    , "if"
-    , "import"
-    , "in"
-    , "infix"
-    , "infixl"
-    , "infixr"
-    , "instance"
-    , "let"
-    , "module"
-    , "newtype"
-    , "of"
-    , "qualified"
-    , "then"
-    , "type"
-    , "where"
-    , "forall"
-    , "family"
-    , "mdo"
-    ]
-
-glyphs :: [String]
-glyphs =
-    [ ".."
-    , ":"
-    , "::"
-    , "="
-    , "\\"
-    , "|"
-    , "<-"
-    , "->"
-    , "@"
-    , "~"
-    , "~#"
-    , "=>"
-    , "-"
-    , "!"
-    ]
-
-special :: [Char]
-special = "()[]{},;`"
-
--- TODO: Add support for any Unicode symbol or punctuation.
--- source: http://stackoverflow.com/questions/10548170/what-characters-are-permitted-for-haskell-operators
-symbols :: [Char]
-symbols = "!#$%&*+./<=>?@\\^|-~:"
-
-isIdentifier :: String -> Bool
-isIdentifier (s:str)
-    | (isLower' s || isUpper s) && all isAlphaNum' str = True
+-- >>> isCPPline "\t\t  #ifdef GHC"
+-- True
+--
+-- >>> isCPPline "       #endif"
+-- False
+--
+isCPPline :: String -> Bool
+isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5
+
+
+-- | Split a "line" off the front of a string, hopefully without cutting tokens
+-- in half. I say "hopefully" because knowing what a token is requires lexing,
+-- yet lexing depends on this function.
+--
+-- All characters in the input are present in the output:
+--
+-- prop> curry (++) . spanToNewLine 0 = id
+spanToNewline :: Int                 -- ^ open '{-'
+              -> String              -- ^ input
+              -> (String, String)
+
+-- Base case and space characters
+spanToNewline _ "" = ("", "")
+spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
+spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
+spanToNewline n ('\\':'\n':str) =
+    let (str', rest) = spanToNewline n str
+    in ('\\':'\n':str', rest)
+
+-- Block comments
+spanToNewline n ('{':'-':str) =
+    let (str', rest) = spanToNewline (n+1) str
+    in ('{':'-':str', rest)
+spanToNewline n ('-':'}':str) =
+    let (str', rest) = spanToNewline (n-1) str
+    in ('-':'}':str', rest)
+
+-- When not in a block comment, try to lex a Haskell token
+spanToNewline 0 str@(c:_) | ((lexed, str') : _) <- R.lex str, not (isSpace c) =
+    if all (== '-') lexed && length lexed >= 2
+      -- A Haskell line comment
+      then case span (/= '\n') str' of
+             (str'', '\n':rest) -> (lexed ++ str'' ++ "\n", rest)
+             (_, _) -> (str, "") 
+
+      -- An actual Haskell token
+      else let (str'', rest) = spanToNewline 0 str'
+           in (lexed ++ str'', rest)
+
+-- In all other cases, advance one character at a time
+spanToNewline n (c:str) =
+    let (str', rest) = spanToNewline n str
+    in (c:str', rest)
+
+
+-- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of
+-- Haddock's 'T.Token'.
+ghcToks :: [(Located L.Token, String)] -> [T.Token]
+ghcToks = reverse . (\(_,ts,_) -> ts) . foldl' go (start, [], False)
   where
-    isLower' c = isLower c || c == '_'
-    isAlphaNum' c = isAlphaNum c || c == '_' || c == '\''
-isIdentifier _ = False
+    start = mkRealSrcLoc (mkFastString "lexing") 1 1
+
+    go :: (RealSrcLoc, [T.Token], Bool)
+       -- ^ current position, tokens accumulated, currently in pragma (or not)
+       
+       -> (Located L.Token, String)
+       -- ^ next token, its content
+       
+       -> (RealSrcLoc, [T.Token], Bool)
+       -- ^ new position, new tokens accumulated, currently in pragma (or not)
+
+    go (pos, toks, in_prag) (L l tok, raw) =
+        ( next_pos
+        , classifiedTok ++ maybeToList white ++ toks
+        , inPragma in_prag tok
+        )
+       where
+         (next_pos, white) = mkWhitespace pos l
+         
+         classifiedTok = [ Token (classify' tok) raw rss
+                         | RealSrcSpan rss <- [l]
+                         , not (null raw)
+                         ]
+         
+         classify' | in_prag = const TkPragma
+                   | otherwise = classify
+
+
+-- | Find the correct amount of whitespace between tokens.
+mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token)
+mkWhitespace prev spn =
+  case spn of
+    UnhelpfulSpan _ -> (prev,Nothing)
+    RealSrcSpan s | null wsstring -> (end, Nothing)
+                  | otherwise -> (end, Just (Token TkSpace wsstring wsspan))
+      where
+        start = realSrcSpanStart s
+        end = realSrcSpanEnd s
+        wsspan = mkRealSrcSpan prev start
+        nls = srcLocLine start - srcLocLine prev
+        spaces = if nls == 0 then srcLocCol start - srcLocCol prev
+                             else srcLocCol start - 1
+        wsstring = replicate nls '\n' ++ replicate spaces ' '
+
+
+-- | Classify given tokens as appropriate Haskell token type.
+classify :: L.Token -> TokenType
+classify tok =
+  case tok of
+    ITas                   -> TkKeyword
+    ITcase                 -> TkKeyword
+    ITclass                -> TkKeyword
+    ITdata                 -> TkKeyword
+    ITdefault              -> TkKeyword
+    ITderiving             -> TkKeyword
+    ITdo                   -> TkKeyword
+    ITelse                 -> TkKeyword
+    IThiding               -> TkKeyword
+    ITforeign              -> TkKeyword
+    ITif                   -> TkKeyword
+    ITimport               -> TkKeyword
+    ITin                   -> TkKeyword
+    ITinfix                -> TkKeyword
+    ITinfixl               -> TkKeyword
+    ITinfixr               -> TkKeyword
+    ITinstance             -> TkKeyword
+    ITlet                  -> TkKeyword
+    ITmodule               -> TkKeyword
+    ITnewtype              -> TkKeyword
+    ITof                   -> TkKeyword
+    ITqualified            -> TkKeyword
+    ITthen                 -> TkKeyword
+    ITtype                 -> TkKeyword
+    ITvia                  -> TkKeyword
+    ITwhere                -> TkKeyword
+
+    ITforall            {} -> TkKeyword
+    ITexport               -> TkKeyword
+    ITlabel                -> TkKeyword
+    ITdynamic              -> TkKeyword
+    ITsafe                 -> TkKeyword
+    ITinterruptible        -> TkKeyword
+    ITunsafe               -> TkKeyword
+    ITstdcallconv          -> TkKeyword
+    ITccallconv            -> TkKeyword
+    ITcapiconv             -> TkKeyword
+    ITprimcallconv         -> TkKeyword
+    ITjavascriptcallconv   -> TkKeyword
+    ITmdo                  -> TkKeyword
+    ITfamily               -> TkKeyword
+    ITrole                 -> TkKeyword
+    ITgroup                -> TkKeyword
+    ITby                   -> TkKeyword
+    ITusing                -> TkKeyword
+    ITpattern              -> TkKeyword
+    ITstatic               -> TkKeyword
+    ITstock                -> TkKeyword
+    ITanyclass             -> TkKeyword
+
+    ITunit                 -> TkKeyword
+    ITsignature            -> TkKeyword
+    ITdependency           -> TkKeyword
+    ITrequires             -> TkKeyword
+
+    ITinline_prag       {} -> TkPragma
+    ITspec_prag         {} -> TkPragma
+    ITspec_inline_prag  {} -> TkPragma
+    ITsource_prag       {} -> TkPragma
+    ITrules_prag        {} -> TkPragma
+    ITwarning_prag      {} -> TkPragma
+    ITdeprecated_prag   {} -> TkPragma
+    ITline_prag         {} -> TkPragma
+    ITcolumn_prag       {} -> TkPragma
+    ITscc_prag          {} -> TkPragma
+    ITgenerated_prag    {} -> TkPragma
+    ITcore_prag         {} -> TkPragma
+    ITunpack_prag       {} -> TkPragma
+    ITnounpack_prag     {} -> TkPragma
+    ITann_prag          {} -> TkPragma
+    ITcomplete_prag     {} -> TkPragma
+    ITclose_prag           -> TkPragma
+    IToptions_prag      {} -> TkPragma
+    ITinclude_prag      {} -> TkPragma
+    ITlanguage_prag        -> TkPragma
+    ITminimal_prag      {} -> TkPragma
+    IToverlappable_prag {} -> TkPragma
+    IToverlapping_prag  {} -> TkPragma
+    IToverlaps_prag     {} -> TkPragma
+    ITincoherent_prag   {} -> TkPragma
+    ITctype             {} -> TkPragma
+
+    ITdotdot               -> TkGlyph
+    ITcolon                -> TkGlyph
+    ITdcolon            {} -> TkGlyph
+    ITequal                -> TkGlyph
+    ITlam                  -> TkGlyph
+    ITlcase                -> TkGlyph
+    ITvbar                 -> TkGlyph
+    ITlarrow            {} -> TkGlyph
+    ITrarrow            {} -> TkGlyph
+    ITat                   -> TkGlyph
+    ITtilde                -> TkGlyph
+    ITdarrow            {} -> TkGlyph
+    ITminus                -> TkGlyph
+    ITbang                 -> TkGlyph
+    ITdot                  -> TkOperator
+    ITtypeApp              -> TkGlyph
+
+    ITbiglam               -> TkGlyph
+
+    ITocurly               -> TkSpecial
+    ITccurly               -> TkSpecial
+    ITvocurly              -> TkSpecial
+    ITvccurly              -> TkSpecial
+    ITobrack               -> TkSpecial
+    ITopabrack             -> TkSpecial
+    ITcpabrack             -> TkSpecial
+    ITcbrack               -> TkSpecial
+    IToparen               -> TkSpecial
+    ITcparen               -> TkSpecial
+    IToubxparen            -> TkSpecial
+    ITcubxparen            -> TkSpecial
+    ITsemi                 -> TkSpecial
+    ITcomma                -> TkSpecial
+    ITunderscore           -> TkIdentifier
+    ITbackquote            -> TkSpecial
+    ITsimpleQuote          -> TkSpecial
+
+    ITvarid             {} -> TkIdentifier
+    ITconid             {} -> TkIdentifier
+    ITvarsym            {} -> TkOperator
+    ITconsym            {} -> TkOperator
+    ITqvarid            {} -> TkIdentifier
+    ITqconid            {} -> TkIdentifier
+    ITqvarsym           {} -> TkOperator
+    ITqconsym           {} -> TkOperator
+
+    ITdupipvarid        {} -> TkUnknown
+    ITlabelvarid        {} -> TkUnknown
+
+    ITchar              {} -> TkChar
+    ITstring            {} -> TkString
+    ITinteger           {} -> TkNumber
+    ITrational          {} -> TkNumber
+
+    ITprimchar          {} -> TkChar
+    ITprimstring        {} -> TkString
+    ITprimint           {} -> TkNumber
+    ITprimword          {} -> TkNumber
+    ITprimfloat         {} -> TkNumber
+    ITprimdouble        {} -> TkNumber
+
+    ITopenExpQuote      {} -> TkSpecial
+    ITopenPatQuote         -> TkSpecial
+    ITopenDecQuote         -> TkSpecial
+    ITopenTypQuote         -> TkSpecial
+    ITcloseQuote        {} -> TkSpecial
+    ITopenTExpQuote     {} -> TkSpecial
+    ITcloseTExpQuote       -> TkSpecial
+    ITidEscape          {} -> TkUnknown
+    ITparenEscape          -> TkSpecial
+    ITidTyEscape        {} -> TkUnknown
+    ITparenTyEscape        -> TkSpecial
+    ITtyQuote              -> TkSpecial
+    ITquasiQuote        {} -> TkUnknown
+    ITqQuasiQuote       {} -> TkUnknown
+
+    ITproc                 -> TkKeyword
+    ITrec                  -> TkKeyword
+    IToparenbar         {} -> TkGlyph
+    ITcparenbar         {} -> TkGlyph
+    ITlarrowtail        {} -> TkGlyph
+    ITrarrowtail        {} -> TkGlyph
+    ITLarrowtail        {} -> TkGlyph
+    ITRarrowtail        {} -> TkGlyph
+
+    ITunknown           {} -> TkUnknown
+    ITeof                  -> TkUnknown
+
+    -- Line comments are only supposed to start with '--'. Starting with '#'
+    -- means that this was probably a CPP.
+    ITlineComment s
+      | isCPPline s        -> TkCpp
+      | otherwise          -> TkComment
+
+    ITdocCommentNext    {} -> TkComment
+    ITdocCommentPrev    {} -> TkComment
+    ITdocCommentNamed   {} -> TkComment
+    ITdocSection        {} -> TkComment
+    ITdocOptions        {} -> TkComment
+
+    -- The lexer considers top-level pragmas as comments (see `pragState` in
+    -- the GHC lexer for more), so we have to manually reverse this. The
+    -- following is a hammer: it smashes _all_ pragma-like block comments into
+    -- pragmas.
+    ITblockComment c
+      | isPrefixOf "{-#" c
+      , isSuffixOf "#-}" c -> TkPragma
+      | otherwise          -> TkComment
+
+-- | Classify given tokens as beginning pragmas (or not).
+inPragma :: Bool     -- ^ currently in pragma
+         -> L.Token  -- ^ current token
+         -> Bool     -- ^ new information about whether we are in a pragma
+inPragma _ ITclose_prag = False
+inPragma True _ = True
+inPragma False tok =
+  case tok of
+    ITinline_prag       {} -> True
+    ITspec_prag         {} -> True
+    ITspec_inline_prag  {} -> True
+    ITsource_prag       {} -> True
+    ITrules_prag        {} -> True
+    ITwarning_prag      {} -> True
+    ITdeprecated_prag   {} -> True
+    ITline_prag         {} -> True
+    ITcolumn_prag       {} -> True
+    ITscc_prag          {} -> True
+    ITgenerated_prag    {} -> True
+    ITcore_prag         {} -> True
+    ITunpack_prag       {} -> True
+    ITnounpack_prag     {} -> True
+    ITann_prag          {} -> True
+    ITcomplete_prag     {} -> True
+    IToptions_prag      {} -> True
+    ITinclude_prag      {} -> True
+    ITlanguage_prag        -> True
+    ITminimal_prag      {} -> True
+    IToverlappable_prag {} -> True
+    IToverlapping_prag  {} -> True
+    IToverlaps_prag     {} -> True
+    ITincoherent_prag   {} -> True
+    ITctype             {} -> True
+
+    _                      -> False
+
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs
index 15793f0c..d7ea70a6 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs
@@ -1,6 +1,5 @@
 {-# LANGUAGE RecordWildCards #-}
 
-
 module Haddock.Backends.Hyperlinker.Renderer (render) where
 
 
@@ -15,7 +14,6 @@ import System.FilePath.Posix ((</>))
 
 import Data.List
 import Data.Maybe
-import Data.Monoid
 import qualified Data.Map as Map
 
 import Text.XHtml (Html, HtmlAttr, (!))
@@ -29,36 +27,10 @@ render :: Maybe FilePath -> Maybe FilePath -> SrcMap -> [RichToken]
        -> Html
 render mcss mjs srcs tokens = header mcss mjs <> body srcs tokens
 
-
-data TokenGroup
-    = GrpNormal Token
-    | GrpRich TokenDetails [Token]
-
-
--- | Group consecutive tokens pointing to the same element.
---
--- We want to render qualified identifiers as one entity. For example,
--- @Bar.Baz.foo@ consists of 5 tokens (@Bar@, @.@, @Baz@, @.@, @foo@) but for
--- better user experience when highlighting and clicking links, these tokens
--- should be regarded as one identifier. Therefore, before rendering we must
--- group consecutive elements pointing to the same 'GHC.Name' (note that even
--- dot token has it if it is part of qualified name).
-groupTokens :: [RichToken] -> [TokenGroup]
-groupTokens [] = []
-groupTokens ((RichToken tok Nothing):rest) = (GrpNormal tok):(groupTokens rest)
-groupTokens ((RichToken tok (Just det)):rest) =
-    let (grp, rest') = span same rest
-    in (GrpRich det (tok:(map rtkToken grp))):(groupTokens rest')
-  where
-    same (RichToken _ (Just det')) = det == det'
-    same _ = False
-
-
 body :: SrcMap -> [RichToken] -> Html
-body srcs tokens =
-    Html.body . Html.pre $ hypsrc
+body srcs tokens = Html.body . Html.pre $ hypsrc
   where
-    hypsrc = mconcat . map (tokenGroup srcs) . groupTokens $ tokens
+    hypsrc = mconcat . map (richToken srcs) $ tokens
 
 
 header :: Maybe FilePath -> Maybe FilePath -> Html
@@ -79,29 +51,20 @@ header mcss mjs =
         , Html.src scriptFile
         ]
 
-
-tokenGroup :: SrcMap -> TokenGroup -> Html
-tokenGroup _ (GrpNormal tok@(Token { .. }))
-    | tkType == TkSpace = renderSpace (posRow . spStart $ tkSpan) tkValue
-    | otherwise = tokenSpan tok ! attrs
+-- | Given information about the source position of definitions, render a token
+richToken :: SrcMap -> RichToken -> Html
+richToken srcs (RichToken Token{..} details)
+    | tkType == TkSpace = renderSpace (GHC.srcSpanStartLine tkSpan) tkValue
+    | otherwise = linked content
   where
-    attrs = [ multiclass . tokenStyle $ tkType ]
-tokenGroup srcs (GrpRich det tokens) =
-    externalAnchor det . internalAnchor det . hyperlink srcs det $ content
-  where
-    content = mconcat . map (richToken det) $ tokens
-
-
-richToken :: TokenDetails -> Token -> Html
-richToken det tok =
-    tokenSpan tok ! [ multiclass style ]
-  where
-    style = (tokenStyle . tkType) tok ++ richTokenStyle det
-
-
-tokenSpan :: Token -> Html
-tokenSpan = Html.thespan . Html.toHtml . tkValue
+    content = tokenSpan ! [ multiclass style ]
+    tokenSpan = Html.thespan (Html.toHtml tkValue)
+    style = tokenStyle tkType ++ maybe [] richTokenStyle details
 
+    -- If we have name information, we can make links
+    linked = case details of
+      Just d -> externalAnchor d . internalAnchor d . hyperlink srcs d
+      Nothing -> id
 
 richTokenStyle :: TokenDetails -> [StyleClass]
 richTokenStyle (RtkVar _) = ["hs-var"]
@@ -155,7 +118,7 @@ internalHyperlink name content =
     Html.anchor content ! [ Html.href $ "#" ++ internalAnchorIdent name ]
 
 externalNameHyperlink :: SrcMap -> GHC.Name -> Html -> Html
-externalNameHyperlink (srcs, _) name content = case Map.lookup mdl srcs of
+externalNameHyperlink srcs name content = case Map.lookup mdl srcs of
     Just SrcLocal -> Html.anchor content !
         [ Html.href $ hypSrcModuleNameUrl mdl name ]
     Just (SrcExternal path) -> Html.anchor content !
@@ -165,12 +128,14 @@ externalNameHyperlink (srcs, _) name content = case Map.lookup mdl srcs of
     mdl = GHC.nameModule name
 
 externalModHyperlink :: SrcMap -> GHC.ModuleName -> Html -> Html
-externalModHyperlink (_, srcs) name content = case Map.lookup name srcs of
-    Just SrcLocal -> Html.anchor content !
+externalModHyperlink srcs name content =
+    let srcs' = Map.mapKeys GHC.moduleName srcs in
+    case Map.lookup name srcs' of
+      Just SrcLocal -> Html.anchor content !
         [ Html.href $ hypSrcModuleUrl' name ]
-    Just (SrcExternal path) -> Html.anchor content !
+      Just (SrcExternal path) -> Html.anchor content !
         [ Html.href $ path </> hypSrcModuleUrl' name ]
-    Nothing -> content
+      Nothing -> content
 
 
 renderSpace :: Int -> String -> Html
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs
index d8ae89e4..e377471e 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs
@@ -4,8 +4,6 @@ module Haddock.Backends.Hyperlinker.Types where
 import qualified GHC
 
 import Data.Map (Map)
-import qualified Data.Map as Map
-
 
 data Token = Token
     { tkType :: TokenType
@@ -14,23 +12,8 @@ data Token = Token
     }
     deriving (Show)
 
-data Position = Position
-    { posRow :: !Int
-    , posCol :: !Int
-    }
-    deriving (Eq, Ord, Show)
-
-data Span = Span
-    { spStart :: !Position
-    , spEnd   :: !Position
-    }
-    deriving (Show)
-
--- | Tests whether the first span "contains" the other span, meaning
--- that it covers at least as much source code. True where spans are equal.
-containsSpan :: Span -> Span -> Bool
-containsSpan s1 s2 =
-  spStart s1 <= spStart s2 && spEnd s1 >= spEnd s2
+type Position = GHC.RealSrcLoc
+type Span = GHC.RealSrcSpan
 
 data TokenType
     = TkIdentifier
@@ -80,15 +63,5 @@ data SrcPath
     | SrcLocal
 
 -- | Mapping from modules to cross-package source paths.
---
--- This mapping is actually a pair of maps instead of just one map. The reason
--- for this is because when hyperlinking modules in import lists we have no
--- 'GHC.Module' available. On the other hand, we can't just use map with
--- 'GHC.ModuleName' as indices because certain modules may have common name
--- but originate in different packages. Hence, we use both /rich/ and /poor/
--- versions, where the /poor/ is just projection of /rich/ one cached in pair
--- for better performance.
-type SrcMap = (Map GHC.Module SrcPath, Map GHC.ModuleName SrcPath)
+type SrcMap = Map GHC.Module SrcPath
 
-mkSrcMap :: Map GHC.Module SrcPath -> SrcMap
-mkSrcMap srcs = (srcs, Map.mapKeys GHC.moduleName srcs)