From 4f75be94f45a0e92553eccefe56230c554333ce7 Mon Sep 17 00:00:00 2001 From: Alec Theriault Date: Sun, 10 Dec 2017 12:22:21 -0800 Subject: Use the GHC lexer for the Hyperlinker backend (#714) * Start changing to use GHC lexer * better cpp * Change SrcSpan to RealSrcSpan * Remove error * Try to stop too many open files * wip * wip * Revert "wip" This reverts commit b605510a195f26315e3d8ca90e6d95a6737553e1. Conflicts: haddock-api/haddock-api.cabal haddock-api/src/Haddock/Interface.hs * Remove pointless 'caching' * Use dlist rather than lists when finding vars * Use a map rather than list * Delete bogus comment * Rebase followup Things now run using the GHC lexer. There are still - stray debug statements - unnecessary changes w.r.t. master * Cleaned up differences w.r.t. current Haddock HEAD Things are looking good. quasiquotes in particular look beautiful: the TH ones (with Haskell source inside) colour/link their contents too! Haven't yet begun to check for possible performance problems. * Support CPP and top-level pragmas The support for these is hackier - but no more hacky than the existing support. * Tests pass, CPP is better recognized The tests were in some cases altered: I consider the new output to be more correct than the old one.... * Fix shrinking of source without tabs in test * Replace 'Position'/'Span' with GHC counterparts Replaces 'Position' -> 'GHC.RealSrcLoc' and 'Span' -> 'GHC.RealSrcSpan'. 
* Nits * Forgot entry in .cabal * Update changelog --- haddock-api/src/Haddock.hs | 2 +- .../src/Haddock/Backends/Hyperlinker/Ast.hs | 19 +- .../src/Haddock/Backends/Hyperlinker/Parser.hs | 534 +++++++++++++-------- .../src/Haddock/Backends/Hyperlinker/Renderer.hs | 14 +- .../src/Haddock/Backends/Hyperlinker/Types.hs | 33 +- haddock-api/src/Haddock/Interface/Create.hs | 24 +- 6 files changed, 375 insertions(+), 251 deletions(-) (limited to 'haddock-api/src') diff --git a/haddock-api/src/Haddock.hs b/haddock-api/src/Haddock.hs index f7fa52b3..3f5e5298 100644 --- a/haddock-api/src/Haddock.hs +++ b/haddock-api/src/Haddock.hs @@ -277,7 +277,7 @@ render dflags flags qual ifaces installedIfaces extSrcMap = do | Flag_HyperlinkedSource `elem` flags = Just hypSrcModuleUrlFormat | otherwise = srcModule - srcMap = mkSrcMap $ Map.union + srcMap = Map.union (Map.map SrcExternal extSrcMap) (Map.fromList [ (ifaceMod iface, SrcLocal) | iface <- ifaces ]) diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs index 57ff72ff..361bc15d 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs @@ -12,6 +12,7 @@ import qualified Haddock.Syb as Syb import Haddock.Backends.Hyperlinker.Types import qualified GHC +import qualified SrcLoc import Control.Applicative import Control.Monad (guard) @@ -51,10 +52,10 @@ type DetailsMap = Map.Map Position (Span, TokenDetails) mkDetailsMap :: [(GHC.SrcSpan, TokenDetails)] -> DetailsMap mkDetailsMap xs = - Map.fromListWith select_details [ (start, (token_span, token_details)) + Map.fromListWith select_details [ (start, (span, token_details)) | (ghc_span, token_details) <- xs - , Just !token_span <- [ghcSrcSpanToSpan ghc_span] - , let start = spStart token_span + , GHC.RealSrcSpan span <- [ghc_span] + , let start = SrcLoc.realSrcSpanStart span ] where -- favour token details which appear earlier in the list @@ -62,17 +63,11 
@@ mkDetailsMap xs = lookupBySpan :: Span -> DetailsMap -> Maybe TokenDetails lookupBySpan span details = do - (_, (tok_span, tok_details)) <- Map.lookupLE (spStart span) details - guard (tok_span `containsSpan` span ) + let pos = SrcLoc.realSrcSpanStart span + (_, (tok_span, tok_details)) <- Map.lookupLE pos details + guard (tok_span `SrcLoc.containsSpan` span) return tok_details -ghcSrcSpanToSpan :: GHC.SrcSpan -> Maybe Span -ghcSrcSpanToSpan (GHC.RealSrcSpan span) = - Just (Span { spStart = Position (GHC.srcSpanStartLine span) (GHC.srcSpanStartCol span) - , spEnd = Position (GHC.srcSpanEndLine span) (GHC.srcSpanEndCol span) - }) -ghcSrcSpanToSpan _ = Nothing - enrichToken :: Token -> DetailsMap -> Maybe TokenDetails enrichToken (Token typ _ spn) dm | typ `elem` [TkIdentifier, TkOperator] = lookupBySpan spn dm diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs index e4345602..da1555b8 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs @@ -1,11 +1,17 @@ module Haddock.Backends.Hyperlinker.Parser (parse) where +import Data.Either ( isRight, isLeft ) +import Data.List ( foldl', isPrefixOf, isSuffixOf ) +import Data.Maybe ( maybeToList ) -import Data.Char -import Data.List -import Data.Maybe +import GHC ( DynFlags, addSourceToTokens ) +import SrcLoc +import FastString ( mkFastString ) +import StringBuffer ( stringToStringBuffer ) +import Lexer ( Token(..) ) +import qualified Lexer as L -import Haddock.Backends.Hyperlinker.Types +import Haddock.Backends.Hyperlinker.Types as T -- | Turn source code string into a stream of more descriptive tokens. @@ -13,202 +19,350 @@ import Haddock.Backends.Hyperlinker.Types -- Result should retain original file layout (including comments, whitespace, -- etc.), i.e. the following "law" should hold: -- --- @concat . map 'tkValue' . 
'parse' = id@ -parse :: String -> [Token] -parse = tokenize . tag . chunk +-- prop> concat . map tkValue . parse = id +-- +-- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v', +-- characters, since GHC transforms those into ' ' and '\n') +parse :: DynFlags -> FilePath -> String -> [T.Token] +parse dflags fp s = ghcToks (processCPP dflags fp s) + --- | Split raw source string to more meaningful chunks. +-- | Parse the source into tokens using the GHC lexer. +-- +-- * CPP lines are removed and reinserted as line-comments +-- * top-level file pragmas are parsed as block comments (see the +-- 'ITblockComment' case of 'classify' for more details) -- --- This is the initial stage of tokenization process. Each chunk is either --- a comment (including comment delimiters), a whitespace string, preprocessor --- macro (and all its content until the end of a line) or valid Haskell lexeme. -chunk :: String -> [String] -chunk [] = [] -chunk str@(c:_) - | isSpace c = - let (space, mcpp, rest) = spanSpaceOrCpp str - in [space] ++ maybeToList mcpp ++ chunk rest -chunk str - | "--" `isPrefixOf` str = chunk' $ spanToNewline str - | "{-" `isPrefixOf` str = chunk' $ chunkComment 0 str - | otherwise = case lex' str of - (tok:_) -> chunk' tok - [] -> [str] +processCPP :: DynFlags -- ^ GHC's flags + -> FilePath -- ^ source file name (for position information) + -> String -- ^ source file contents + -> [(Located L.Token, String)] +processCPP dflags fpath s = addSrc . go start . splitCPP $ s where - chunk' (c, rest) = c:(chunk rest) + start = mkRealSrcLoc (mkFastString fpath) 1 1 + addSrc = addSourceToTokens start (stringToStringBuffer s) --- | A bit better lexer then the default, i.e. 
handles DataKinds quotes -lex' :: ReadS String -lex' ('\'' : '\'' : rest) = [("''", rest)] -lex' str@('\'' : '\\' : _ : '\'' : _) = lex str -lex' str@('\'' : _ : '\'' : _) = lex str -lex' ('\'' : rest) = [("'", rest)] -lex' str = lex str + -- Transform a list of Haskell/CPP lines into a list of tokens + go :: RealSrcLoc -> [Either String String] -> [Located L.Token] + go _ [] = [] + go pos ls = + let (hLinesRight, ls') = span isRight ls + (cppLinesLeft, rest) = span isLeft ls' + + hSrc = concat [ hLine | Right hLine <- hLinesRight ] + cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ] + + in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of --- | Split input to "first line" string and the rest of it. --- --- Ideally, this should be done simply with @'break' (== '\n')@. However, --- Haskell also allows line-unbreaking (or whatever it is called) so things --- are not as simple and this function deals with that. -spanToNewline :: String -> (String, String) -spanToNewline [] = ([], []) -spanToNewline ('\\':'\n':str) = - let (str', rest) = spanToNewline str - in ('\\':'\n':str', rest) -spanToNewline str@('\n':_) = ("", str) -spanToNewline (c:str) = - let (str', rest) = spanToNewline str - in (c:str', rest) + -- Stuff that fails to lex gets turned into an unknown token + L.PFailed _ss _msg -> + let (src_pos, failed) = mkToken ITunknown pos hSrc + (new_pos, cpp) = mkToken ITlineComment src_pos cppSrc + in failed : cpp : go new_pos rest --- | Split input to whitespace string, (optional) preprocessor directive and --- the rest of it. 
+ -- Successfully lexed + L.POk ss toks -> + let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc + in toks ++ [cpp] ++ go new_pos rest + + -- Manually make a token from a 'String', advancing the cursor position + mkToken tok start' str = + let end = foldl' advanceSrcLoc start' str + in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str)) + + +-- | Split apart the initial file into Haskell source lines ('Right' entries) and +-- CPP lines ('Left' entries). -- --- Again, using something like @'span' 'isSpace'@ would be nice to chunk input --- to whitespace. The problem is with /#/ symbol - if it is placed at the very --- beginning of a line, it should be recognized as preprocessor macro. In any --- other case, it is ordinary Haskell symbol and can be used to declare --- operators. Hence, while dealing with whitespace we also check whether there --- happens to be /#/ symbol just after a newline character - if that is the --- case, we begin treating the whole line as preprocessor macro. -spanSpaceOrCpp :: String -> (String, Maybe String, String) -spanSpaceOrCpp ('\n':'#':str) = - let (str', rest) = spanToNewline str - in ("\n", Just $ '#':str', rest) -spanSpaceOrCpp (c:str') - | isSpace c = - let (space, mcpp, rest) = spanSpaceOrCpp str' - in (c:space, mcpp, rest) -spanSpaceOrCpp str = ("", Nothing, str) - --- | Split input to comment content (including delimiters) and the rest. +-- All characters in the input are present in the output: -- --- Again, some more logic than simple 'span' is required because of Haskell --- comment nesting policy. 
-chunkComment :: Int -> String -> (String, String) -chunkComment _ [] = ("", "") -chunkComment depth ('{':'-':str) = - let (c, rest) = chunkComment (depth + 1) str - in ("{-" ++ c, rest) -chunkComment depth ('-':'}':str) - | depth == 1 = ("-}", str) - | otherwise = - let (c, rest) = chunkComment (depth - 1) str - in ("-}" ++ c, rest) -chunkComment depth (e:str) = - let (c, rest) = chunkComment depth str - in (e:c, rest) - --- | Assign source location for each chunk in given stream. -tag :: [String] -> [(Span, String)] -tag = - reverse . snd . foldl aux (Position 1 1, []) - where - aux (pos, cs) str = - let pos' = foldl move pos str - in (pos', (Span pos pos', str):cs) - move pos '\n' = pos { posRow = posRow pos + 1, posCol = 1 } - move pos _ = pos { posCol = posCol pos + 1 } - --- | Turn unrecognised chunk stream to more descriptive token stream. -tokenize :: [(Span, String)] -> [Token] -tokenize = - map aux +-- prop> concat . map (either id id) . splitCPP = id +splitCPP :: String -> [Either String String] +splitCPP "" = [] +splitCPP s | isCPPline s = Left l : splitCPP rest + | otherwise = Right l : splitCPP rest where - aux (sp, str) = Token - { tkType = classify str - , tkValue = str - , tkSpan = sp - } + ~(l, rest) = spanToNewline 0 s --- | Classify given string as appropriate Haskell token. + +-- | Heuristic to decide if a line is going to be a CPP line. This should be a +-- cheap operation since it is going to be run on every line being processed. -- --- This method is based on Haskell 98 Report lexical structure description: --- https://www.haskell.org/onlinereport/lexemes.html +-- Right now it just checks if the first non-whitespace character in the first +-- five characters of the line is a '#': -- --- However, this is probably far from being perfect and most probably does not --- handle correctly all corner cases. 
-classify :: String -> TokenType -classify str - | "--" `isPrefixOf` str = TkComment - | "{-#" `isPrefixOf` str = TkPragma - | "{-" `isPrefixOf` str = TkComment -classify "''" = TkSpecial -classify "'" = TkSpecial -classify str@(c:_) - | isSpace c = TkSpace - | isDigit c = TkNumber - | c `elem` special = TkSpecial - | str `elem` glyphs = TkGlyph - | all (`elem` symbols) str = TkOperator - | c == '#' = TkCpp - | c == '"' = TkString - | c == '\'' = TkChar -classify str - | str `elem` keywords = TkKeyword - | isIdentifier str = TkIdentifier - | otherwise = TkUnknown - -keywords :: [String] -keywords = - [ "as" - , "case" - , "class" - , "data" - , "default" - , "deriving" - , "do" - , "else" - , "hiding" - , "if" - , "import" - , "in" - , "infix" - , "infixl" - , "infixr" - , "instance" - , "let" - , "module" - , "newtype" - , "of" - , "qualified" - , "then" - , "type" - , "where" - , "forall" - , "family" - , "mdo" - ] - -glyphs :: [String] -glyphs = - [ ".." - , ":" - , "::" - , "=" - , "\\" - , "|" - , "<-" - , "->" - , "@" - , "~" - , "~#" - , "=>" - , "-" - , "!" - ] - -special :: [Char] -special = "()[]{},;`" - --- TODO: Add support for any Unicode symbol or punctuation. --- source: http://stackoverflow.com/questions/10548170/what-characters-are-permitted-for-haskell-operators -symbols :: [Char] -symbols = "!#$%&*+./<=>?@\\^|-~:" - -isIdentifier :: String -> Bool -isIdentifier (s:str) - | (isLower' s || isUpper s) && all isAlphaNum' str = True +-- >>> isCPPline "#define FOO 1" +-- True +-- +-- >>> isCPPline "\t\t #ifdef GHC" +-- True +-- +-- >>> isCPPline " #endif" +-- False +-- +isCPPline :: String -> Bool +isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5 + + +-- | Split a "line" off the front of a string, supporting newline escapes. +-- +-- By "line", we understand: the shortest substring ending in a '\n' that is not +-- +-- 1. immediately preceded by a '\\' +-- 2. 
not inside some (possibly nested) block comment +-- +-- All characters in the input are present in the output: +-- +-- prop> uncurry (++) . spanToNewline 0 = id +spanToNewline :: Int -- ^ open '{-' + -> String -- ^ input + -> (String, String) +spanToNewline _ [] = ([], []) +spanToNewline n ('\n':str) | n <= 0 = ("\n", str) +spanToNewline n ('\\':'\n':str) = + let (str', rest) = spanToNewline n str + in ('\\':'\n':str', rest) +spanToNewline n ('{':'-':str) = + let (str', rest) = spanToNewline (n+1) str + in ('{':'-':str', rest) +spanToNewline n ('-':'}':str) = + let (str', rest) = spanToNewline (n-1) str + in ('-':'}':str', rest) +spanToNewline n (c:str) = + let (str', rest) = spanToNewline n str + in (c:str', rest) + + +-- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of +-- Haddock's 'T.Token'. +ghcToks :: [(Located L.Token, String)] -> [T.Token] +ghcToks = reverse . snd . foldl' go (start, []) where - isLower' c = isLower c || c == '_' - isAlphaNum' c = isAlphaNum c || c == '_' || c == '\'' -isIdentifier _ = False + start = mkRealSrcLoc (mkFastString "lexing") 1 1 + + go :: (RealSrcLoc, [T.Token]) -- ^ current position, tokens accumulated + -> (Located L.Token, String) -- ^ next token, its content + -> (RealSrcLoc, [T.Token]) -- ^ new position, new tokens accumulated + go (pos, toks) (L l tok, raw) = ( next_pos + , classifiedTok ++ maybeToList white ++ toks + ) + where + (next_pos, white) = mkWhitespace pos l + classifiedTok = [ Token (classify tok) raw rss + | RealSrcSpan rss <- [l] + , not (null raw) + ] + +-- | Find the correct amount of whitespace between tokens. 
+mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token) +mkWhitespace prev spn = + case spn of + UnhelpfulSpan _ -> (prev,Nothing) + RealSrcSpan s | null wsstring -> (end, Nothing) + | otherwise -> (end, Just (Token TkSpace wsstring wsspan)) + where + start = realSrcSpanStart s + end = realSrcSpanEnd s + wsspan = mkRealSrcSpan prev start + nls = srcLocLine start - srcLocLine prev + spaces = if nls == 0 then srcLocCol start - srcLocCol prev + else srcLocCol start - 1 + wsstring = replicate nls '\n' ++ replicate spaces ' ' + + +-- | Classify the given GHC lexer token as the appropriate Haddock token type. +classify :: L.Token -> TokenType +classify tok = + case tok of + ITas -> TkKeyword + ITcase -> TkKeyword + ITclass -> TkKeyword + ITdata -> TkKeyword + ITdefault -> TkKeyword + ITderiving -> TkKeyword + ITdo -> TkKeyword + ITelse -> TkKeyword + IThiding -> TkKeyword + ITforeign -> TkKeyword + ITif -> TkKeyword + ITimport -> TkKeyword + ITin -> TkKeyword + ITinfix -> TkKeyword + ITinfixl -> TkKeyword + ITinfixr -> TkKeyword + ITinstance -> TkKeyword + ITlet -> TkKeyword + ITmodule -> TkKeyword + ITnewtype -> TkKeyword + ITof -> TkKeyword + ITqualified -> TkKeyword + ITthen -> TkKeyword + ITtype -> TkKeyword + ITwhere -> TkKeyword + + ITforall {} -> TkKeyword + ITexport -> TkKeyword + ITlabel -> TkKeyword + ITdynamic -> TkKeyword + ITsafe -> TkKeyword + ITinterruptible -> TkKeyword + ITunsafe -> TkKeyword + ITstdcallconv -> TkKeyword + ITccallconv -> TkKeyword + ITcapiconv -> TkKeyword + ITprimcallconv -> TkKeyword + ITjavascriptcallconv -> TkKeyword + ITmdo -> TkKeyword + ITfamily -> TkKeyword + ITrole -> TkKeyword + ITgroup -> TkKeyword + ITby -> TkKeyword + ITusing -> TkKeyword + ITpattern -> TkKeyword + ITstatic -> TkKeyword + ITstock -> TkKeyword + ITanyclass -> TkKeyword + + ITunit -> TkKeyword + ITsignature -> TkKeyword + ITdependency -> TkKeyword + ITrequires -> TkKeyword + + ITinline_prag {} -> TkPragma + ITspec_prag {} -> TkPragma + ITspec_inline_prag {} -> 
TkPragma + ITsource_prag {} -> TkPragma + ITrules_prag {} -> TkPragma + ITwarning_prag {} -> TkPragma + ITdeprecated_prag {} -> TkPragma + ITline_prag -> TkPragma + ITscc_prag {} -> TkPragma + ITgenerated_prag {} -> TkPragma + ITcore_prag {} -> TkPragma + ITunpack_prag {} -> TkPragma + ITnounpack_prag {} -> TkPragma + ITann_prag {} -> TkPragma + ITcomplete_prag {} -> TkPragma + ITclose_prag -> TkPragma + IToptions_prag {} -> TkPragma + ITinclude_prag {} -> TkPragma + ITlanguage_prag -> TkPragma + ITvect_prag {} -> TkPragma + ITvect_scalar_prag {} -> TkPragma + ITnovect_prag {} -> TkPragma + ITminimal_prag {} -> TkPragma + IToverlappable_prag {} -> TkPragma + IToverlapping_prag {} -> TkPragma + IToverlaps_prag {} -> TkPragma + ITincoherent_prag {} -> TkPragma + ITctype {} -> TkPragma + + ITdotdot -> TkGlyph + ITcolon -> TkGlyph + ITdcolon {} -> TkGlyph + ITequal -> TkGlyph + ITlam -> TkGlyph + ITlcase -> TkGlyph + ITvbar -> TkGlyph + ITlarrow {} -> TkGlyph + ITrarrow {} -> TkGlyph + ITat -> TkGlyph + ITtilde -> TkGlyph + ITtildehsh -> TkGlyph + ITdarrow {} -> TkGlyph + ITminus -> TkGlyph + ITbang -> TkGlyph + ITdot -> TkOperator + ITtypeApp -> TkGlyph + + ITbiglam -> TkGlyph + + ITocurly -> TkSpecial + ITccurly -> TkSpecial + ITvocurly -> TkSpecial + ITvccurly -> TkSpecial + ITobrack -> TkSpecial + ITopabrack -> TkSpecial + ITcpabrack -> TkSpecial + ITcbrack -> TkSpecial + IToparen -> TkSpecial + ITcparen -> TkSpecial + IToubxparen -> TkSpecial + ITcubxparen -> TkSpecial + ITsemi -> TkSpecial + ITcomma -> TkSpecial + ITunderscore -> TkIdentifier + ITbackquote -> TkSpecial + ITsimpleQuote -> TkSpecial + + ITvarid {} -> TkIdentifier + ITconid {} -> TkIdentifier + ITvarsym {} -> TkOperator + ITconsym {} -> TkOperator + ITqvarid {} -> TkIdentifier + ITqconid {} -> TkIdentifier + ITqvarsym {} -> TkOperator + ITqconsym {} -> TkOperator + + ITdupipvarid {} -> TkUnknown + ITlabelvarid {} -> TkUnknown + + ITchar {} -> TkChar + ITstring {} -> TkString + ITinteger {} -> 
TkNumber + ITrational {} -> TkNumber + + ITprimchar {} -> TkChar + ITprimstring {} -> TkString + ITprimint {} -> TkNumber + ITprimword {} -> TkNumber + ITprimfloat {} -> TkNumber + ITprimdouble {} -> TkNumber + + ITopenExpQuote {} -> TkSpecial + ITopenPatQuote -> TkSpecial + ITopenDecQuote -> TkSpecial + ITopenTypQuote -> TkSpecial + ITcloseQuote {} -> TkSpecial + ITopenTExpQuote {} -> TkSpecial + ITcloseTExpQuote -> TkSpecial + ITidEscape {} -> TkUnknown + ITparenEscape -> TkSpecial + ITidTyEscape {} -> TkUnknown + ITparenTyEscape -> TkSpecial + ITtyQuote -> TkSpecial + ITquasiQuote {} -> TkUnknown + ITqQuasiQuote {} -> TkUnknown + + ITproc -> TkKeyword + ITrec -> TkKeyword + IToparenbar {} -> TkGlyph + ITcparenbar {} -> TkGlyph + ITlarrowtail {} -> TkGlyph + ITrarrowtail {} -> TkGlyph + ITLarrowtail {} -> TkGlyph + ITRarrowtail {} -> TkGlyph + + ITunknown {} -> TkUnknown + ITeof -> TkUnknown + + -- Line comments are only supposed to start with '--'. Starting with '#' + -- means that this was probably a CPP. + ITlineComment s + | isCPPline s -> TkCpp + | otherwise -> TkComment + + ITdocCommentNext {} -> TkComment + ITdocCommentPrev {} -> TkComment + ITdocCommentNamed {} -> TkComment + ITdocSection {} -> TkComment + ITdocOptions {} -> TkComment + + -- The lexer considers top-level pragmas as comments (see `pragState` in + -- the GHC lexer for more), so we have to manually reverse this. The + -- following is a hammer: it smashes _all_ pragma-like block comments into + -- pragmas. 
+ ITblockComment c + | isPrefixOf "{-#" c + , isSuffixOf "#-}" c -> TkPragma + | otherwise -> TkComment diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs index 15793f0c..27bf7605 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs @@ -82,7 +82,7 @@ header mcss mjs = tokenGroup :: SrcMap -> TokenGroup -> Html tokenGroup _ (GrpNormal tok@(Token { .. })) - | tkType == TkSpace = renderSpace (posRow . spStart $ tkSpan) tkValue + | tkType == TkSpace = renderSpace (GHC.srcSpanStartLine tkSpan) tkValue | otherwise = tokenSpan tok ! attrs where attrs = [ multiclass . tokenStyle $ tkType ] @@ -155,7 +155,7 @@ internalHyperlink name content = Html.anchor content ! [ Html.href $ "#" ++ internalAnchorIdent name ] externalNameHyperlink :: SrcMap -> GHC.Name -> Html -> Html -externalNameHyperlink (srcs, _) name content = case Map.lookup mdl srcs of +externalNameHyperlink srcs name content = case Map.lookup mdl srcs of Just SrcLocal -> Html.anchor content ! [ Html.href $ hypSrcModuleNameUrl mdl name ] Just (SrcExternal path) -> Html.anchor content ! @@ -165,12 +165,14 @@ externalNameHyperlink (srcs, _) name content = case Map.lookup mdl srcs of mdl = GHC.nameModule name externalModHyperlink :: SrcMap -> GHC.ModuleName -> Html -> Html -externalModHyperlink (_, srcs) name content = case Map.lookup name srcs of - Just SrcLocal -> Html.anchor content ! +externalModHyperlink srcs name content = + let srcs' = Map.mapKeys GHC.moduleName srcs in + case Map.lookup name srcs' of + Just SrcLocal -> Html.anchor content ! [ Html.href $ hypSrcModuleUrl' name ] - Just (SrcExternal path) -> Html.anchor content ! + Just (SrcExternal path) -> Html.anchor content ! 
[ Html.href $ path hypSrcModuleUrl' name ] - Nothing -> content + Nothing -> content renderSpace :: Int -> String -> Html diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs index d8ae89e4..e377471e 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs @@ -4,8 +4,6 @@ module Haddock.Backends.Hyperlinker.Types where import qualified GHC import Data.Map (Map) -import qualified Data.Map as Map - data Token = Token { tkType :: TokenType @@ -14,23 +12,8 @@ data Token = Token } deriving (Show) -data Position = Position - { posRow :: !Int - , posCol :: !Int - } - deriving (Eq, Ord, Show) - -data Span = Span - { spStart :: !Position - , spEnd :: !Position - } - deriving (Show) - --- | Tests whether the first span "contains" the other span, meaning --- that it covers at least as much source code. True where spans are equal. -containsSpan :: Span -> Span -> Bool -containsSpan s1 s2 = - spStart s1 <= spStart s2 && spEnd s1 >= spEnd s2 +type Position = GHC.RealSrcLoc +type Span = GHC.RealSrcSpan data TokenType = TkIdentifier @@ -80,15 +63,5 @@ data SrcPath | SrcLocal -- | Mapping from modules to cross-package source paths. --- --- This mapping is actually a pair of maps instead of just one map. The reason --- for this is because when hyperlinking modules in import lists we have no --- 'GHC.Module' available. On the other hand, we can't just use map with --- 'GHC.ModuleName' as indices because certain modules may have common name --- but originate in different packages. Hence, we use both /rich/ and /poor/ --- versions, where the /poor/ is just projection of /rich/ one cached in pair --- for better performance. 
-type SrcMap = (Map GHC.Module SrcPath, Map GHC.ModuleName SrcPath) +type SrcMap = Map GHC.Module SrcPath -mkSrcMap :: Map GHC.Module SrcPath -> SrcMap -mkSrcMap srcs = (srcs, Map.mapKeys GHC.moduleName srcs) diff --git a/haddock-api/src/Haddock/Interface/Create.hs b/haddock-api/src/Haddock/Interface/Create.hs index 2a56e87a..4309163f 100644 --- a/haddock-api/src/Haddock/Interface/Create.hs +++ b/haddock-api/src/Haddock/Interface/Create.hs @@ -20,7 +20,6 @@ module Haddock.Interface.Create (createInterface) where import Documentation.Haddock.Doc (metaDocAppend) -import Documentation.Haddock.Utf8 as Utf8 import Haddock.Types import Haddock.Options import Haddock.GhcUtils @@ -33,15 +32,14 @@ import Haddock.Backends.Hyperlinker.Parser as Hyperlinker import Data.Bifunctor import Data.Bitraversable -import qualified Data.ByteString as BS import qualified Data.Map as M import Data.Map (Map) import Data.List import Data.Maybe import Data.Ord import Control.Applicative -import Control.Exception (evaluate) import Control.Monad +import Control.DeepSeq import Data.Traversable import Avail hiding (avail) @@ -160,7 +158,7 @@ createInterface tm flags modMap instIfaceMap = do modWarn <- liftErrMsg (moduleWarning dflags gre warnings) - tokenizedSrc <- mkMaybeTokenizedSrc flags tm + tokenizedSrc <- mkMaybeTokenizedSrc dflags flags tm return $! Interface { ifaceMod = mdl @@ -1137,12 +1135,12 @@ seqList :: [a] -> () seqList [] = () seqList (x : xs) = x `seq` seqList xs -mkMaybeTokenizedSrc :: [Flag] -> TypecheckedModule +mkMaybeTokenizedSrc :: DynFlags -> [Flag] -> TypecheckedModule -> ErrMsgGhc (Maybe [RichToken]) -mkMaybeTokenizedSrc flags tm +mkMaybeTokenizedSrc dflags flags tm | Flag_HyperlinkedSource `elem` flags = case renamedSource tm of Just src -> do - tokens <- liftGhcToErrMsgGhc . liftIO $ mkTokenizedSrc summary src + tokens <- liftGhcToErrMsgGhc (liftIO (mkTokenizedSrc dflags summary src)) return $ Just tokens Nothing -> do liftErrMsg . tell . 
pure $ concat @@ -1155,12 +1153,14 @@ mkMaybeTokenizedSrc flags tm where summary = pm_mod_summary . tm_parsed_module $ tm -mkTokenizedSrc :: ModSummary -> RenamedSource -> IO [RichToken] -mkTokenizedSrc ms src = do - -- make sure to read the whole file at once otherwise +mkTokenizedSrc :: DynFlags -> ModSummary -> RenamedSource -> IO [RichToken] +mkTokenizedSrc dflags ms src = do + -- make sure to read the whole file at once otherwise -- we run out of file descriptors (see #495) - rawSrc <- BS.readFile (msHsFilePath ms) >>= evaluate - return $ Hyperlinker.enrich src (Hyperlinker.parse (decodeUtf8 rawSrc)) + file <- force <$> readFile (filepath) + return $ Hyperlinker.enrich src (Hyperlinker.parse dflags filepath file) + where + filepath = msHsFilePath ms -- | Find a stand-alone documentation comment by its name. findNamedDoc :: String -> [HsDecl GhcRn] -> ErrMsgM (Maybe HsDocString) -- cgit v1.2.3