diff options
author | Alec Theriault <alec.theriault@gmail.com> | 2017-12-10 12:22:21 -0800 |
---|---|---|
committer | Alexander Biehl <alexbiehl@gmail.com> | 2018-02-01 14:58:18 +0100 |
commit | 4f75be94f45a0e92553eccefe56230c554333ce7 (patch) | |
tree | b88a2dd52d4bcd001f423c490c14b4c3cbaaee0e | |
parent | 60e10eb876899165e9644013508361bf72048bdb (diff) |
Use the GHC lexer for the Hyperlinker backend (#714)
* Start changing to use GHC lexer
* better cpp
* Change SrcSpan to RealSrcSpan
* Remove error
* Try to stop too many open files
* wip
* wip
* Revert "wip"
This reverts commit b605510a195f26315e3d8ca90e6d95a6737553e1.
Conflicts:
haddock-api/haddock-api.cabal
haddock-api/src/Haddock/Interface.hs
* Remove pointless 'caching'
* Use dlist rather than lists when finding vars
* Use a map rather than list
* Delete bogus comment
* Rebase followup
Things now run using the GHC lexer. There are still
- stray debug statements
- unnecessary changes w.r.t. master
* Cleaned up differences w.r.t. current Haddock HEAD
Things are looking good. quasiquotes in particular look beautiful: the
TH ones (with Haskell source inside) colour/link their contents too!
Haven't yet begun to check for possible performance problems.
* Support CPP and top-level pragmas
The support for these is hackier - but no more hacky than the existing
support.
* Tests pass, CPP is better recognized
The tests were in some cases altered: I consider the new output to be more
correct than the old one....
* Fix shrinking of source without tabs in test
* Replace 'Position'/'Span' with GHC counterparts
Replaces 'Position' -> 'GHC.RealSrcLoc' and 'Span' -> 'GHC.RealSrcSpan'.
* Nits
* Forgot entry in .cabal
* Update changelog
-rw-r--r-- | CHANGES.md | 3 | ||||
-rw-r--r-- | haddock-api/haddock-api.cabal | 62 | ||||
-rw-r--r-- | haddock-api/src/Haddock.hs | 2 | ||||
-rw-r--r-- | haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs | 19 | ||||
-rw-r--r-- | haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs | 534 | ||||
-rw-r--r-- | haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs | 14 | ||||
-rw-r--r-- | haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs | 33 | ||||
-rw-r--r-- | haddock-api/src/Haddock/Interface/Create.hs | 24 | ||||
-rw-r--r-- | haddock-api/test/Haddock/Backends/Hyperlinker/ParserSpec.hs | 135 | ||||
-rw-r--r-- | hypsrc-test/ref/src/Identifiers.html | 8 |
10 files changed, 522 insertions, 312 deletions
@@ -19,6 +19,9 @@ * Fix: Generate constraint signatures for constructors exported as pattern synonyms (#663) + * The hyperlinker backend now uses the GHC lexer instead of a custom one. + This notably fixes rendering of quasiquotes. + * Overhaul Haddock's rendering of kind signatures so that invisible kind parameters are not printed (#681) diff --git a/haddock-api/haddock-api.cabal b/haddock-api/haddock-api.cabal index a91afd36..95af5ef5 100644 --- a/haddock-api/haddock-api.cabal +++ b/haddock-api/haddock-api.cabal @@ -124,24 +124,68 @@ test-suite spec test , src - -- NB: We only use a small subset of lib:haddock-api here, which - -- explains why this component has a smaller build-depends set other-modules: + Haddock + Haddock.Backends.Hoogle + Haddock.Backends.Hyperlinker + Haddock.Backends.Hyperlinker.Ast + Haddock.Backends.Hyperlinker.Renderer + Haddock.Backends.Hyperlinker.Utils + Haddock.Backends.LaTeX + Haddock.Backends.Xhtml + Haddock.Backends.Xhtml.Decl + Haddock.Backends.Xhtml.DocMarkup + Haddock.Backends.Xhtml.Layout + Haddock.Backends.Xhtml.Meta + Haddock.Backends.Xhtml.Names + Haddock.Backends.Xhtml.Themes + Haddock.Backends.Xhtml.Types + Haddock.Backends.Xhtml.Utils + Haddock.Convert + Haddock.Doc + Haddock.GhcUtils + Haddock.Interface + Haddock.Interface.AttachInstances + Haddock.Interface.Create + Haddock.Interface.Json + Haddock.Interface.LexParseRn + Haddock.Interface.ParseModuleHeader + Haddock.Interface.Rename + Haddock.Interface.Specialize + Haddock.InterfaceFile + Haddock.ModuleTree + Haddock.Options + Haddock.Parser + Haddock.Syb + Haddock.Types + Haddock.Utils + Haddock.Utils.Json + Haddock.Version + Paths_haddock_api Haddock.Backends.Hyperlinker.ParserSpec Haddock.Backends.Hyperlinker.Parser Haddock.Backends.Hyperlinker.Types - build-depends: - ghc ^>= 8.4 - , hspec ^>= 2.4.4 - , QuickCheck ^>= 2.10 + build-depends: Cabal ^>= 2.0.0 + , ghc ^>= 8.4 + , ghc-paths ^>= 0.1.0.9 + , haddock-library ^>= 1.4.6 + , xhtml ^>= 3000.2.2 + , hspec ^>= 2.4.4 + , QuickCheck ^>= 2.10 -- Versions for the dependencies below are transitively pinned by -- the non-reinstallable `ghc` package and hence need no version -- bounds - build-depends: - base - , containers + build-depends: base + , array + , bytestring + , containers + , deepseq + , directory + , filepath + , ghc-boot + , transformers build-tool-depends: hspec-discover:hspec-discover ^>= 2.4.4 diff --git a/haddock-api/src/Haddock.hs b/haddock-api/src/Haddock.hs index f7fa52b3..3f5e5298 100644 --- a/haddock-api/src/Haddock.hs +++ b/haddock-api/src/Haddock.hs @@ -277,7 +277,7 @@ render dflags flags qual ifaces installedIfaces extSrcMap = do | Flag_HyperlinkedSource `elem` flags = Just hypSrcModuleUrlFormat | otherwise = srcModule - srcMap = mkSrcMap $ Map.union + srcMap = Map.union (Map.map SrcExternal extSrcMap) (Map.fromList [ (ifaceMod iface, SrcLocal) | iface <- ifaces ]) diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs index 57ff72ff..361bc15d 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Ast.hs @@ -12,6 +12,7 @@ import qualified Haddock.Syb as Syb import Haddock.Backends.Hyperlinker.Types import qualified GHC +import qualified SrcLoc import Control.Applicative import Control.Monad (guard) @@ -51,10 +52,10 @@ type DetailsMap = Map.Map Position (Span, TokenDetails) mkDetailsMap :: [(GHC.SrcSpan, TokenDetails)] -> DetailsMap mkDetailsMap xs = - Map.fromListWith select_details [ (start, (token_span, token_details)) + Map.fromListWith select_details [ (start, (span, token_details)) | (ghc_span, token_details) <- xs - , Just !token_span <- [ghcSrcSpanToSpan ghc_span] - , let start = spStart token_span + , GHC.RealSrcSpan span <- [ghc_span] + , let start = SrcLoc.realSrcSpanStart span ] where -- favour token details which appear earlier in the list @@ -62,17 +63,11 @@ mkDetailsMap xs = lookupBySpan :: Span -> DetailsMap -> Maybe TokenDetails lookupBySpan span details = do - (_, (tok_span, tok_details)) <- Map.lookupLE (spStart span) details - guard (tok_span `containsSpan` span ) + let pos = SrcLoc.realSrcSpanStart span + (_, (tok_span, tok_details)) <- Map.lookupLE pos details + guard (tok_span `SrcLoc.containsSpan` span) return tok_details -ghcSrcSpanToSpan :: GHC.SrcSpan -> Maybe Span -ghcSrcSpanToSpan (GHC.RealSrcSpan span) = - Just (Span { spStart = Position (GHC.srcSpanStartLine span) (GHC.srcSpanStartCol span) - , spEnd = Position (GHC.srcSpanEndLine span) (GHC.srcSpanEndCol span) - }) -ghcSrcSpanToSpan _ = Nothing - enrichToken :: Token -> DetailsMap -> Maybe TokenDetails enrichToken (Token typ _ spn) dm | typ `elem` [TkIdentifier, TkOperator] = lookupBySpan spn dm diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs index e4345602..da1555b8 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs @@ -1,11 +1,17 @@ module Haddock.Backends.Hyperlinker.Parser (parse) where +import Data.Either ( isRight, isLeft ) +import Data.List ( foldl', isPrefixOf, isSuffixOf ) +import Data.Maybe ( maybeToList ) -import Data.Char -import Data.List -import Data.Maybe +import GHC ( DynFlags, addSourceToTokens ) +import SrcLoc +import FastString ( mkFastString ) +import StringBuffer ( stringToStringBuffer ) +import Lexer ( Token(..) ) +import qualified Lexer as L -import Haddock.Backends.Hyperlinker.Types +import Haddock.Backends.Hyperlinker.Types as T -- | Turn source code string into a stream of more descriptive tokens. @@ -13,202 +19,350 @@ import Haddock.Backends.Hyperlinker.Types -- Result should retain original file layout (including comments, whitespace, -- etc.), i.e. the following "law" should hold: -- --- @concat . map 'tkValue' . 'parse' = id@ -parse :: String -> [Token] -parse = tokenize . tag . chunk +-- prop> concat . map tkValue . parse = id +-- +-- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v', +-- characters, since GHC transforms those into ' ' and '\n') +parse :: DynFlags -> FilePath -> String -> [T.Token] +parse dflags fp s = ghcToks (processCPP dflags fp s) + --- | Split raw source string to more meaningful chunks. +-- | Parse the source into tokens using the GHC lexer. +-- +-- * CPP lines are removed and reinserted as line-comments +-- * top-level file pragmas are parsed as block comments (see the +-- 'ITblockComment' case of 'classify' for more details) -- --- This is the initial stage of tokenization process. Each chunk is either --- a comment (including comment delimiters), a whitespace string, preprocessor --- macro (and all its content until the end of a line) or valid Haskell lexeme. -chunk :: String -> [String] -chunk [] = [] -chunk str@(c:_) - | isSpace c = - let (space, mcpp, rest) = spanSpaceOrCpp str - in [space] ++ maybeToList mcpp ++ chunk rest -chunk str - | "--" `isPrefixOf` str = chunk' $ spanToNewline str - | "{-" `isPrefixOf` str = chunk' $ chunkComment 0 str - | otherwise = case lex' str of - (tok:_) -> chunk' tok - [] -> [str] +processCPP :: DynFlags -- ^ GHC's flags + -> FilePath -- ^ source file name (for position information) + -> String -- ^ source file contents + -> [(Located L.Token, String)] +processCPP dflags fpath s = addSrc . go start . splitCPP $ s where - chunk' (c, rest) = c:(chunk rest) + start = mkRealSrcLoc (mkFastString fpath) 1 1 + addSrc = addSourceToTokens start (stringToStringBuffer s) --- | A bit better lexer then the default, i.e. handles DataKinds quotes -lex' :: ReadS String -lex' ('\'' : '\'' : rest) = [("''", rest)] -lex' str@('\'' : '\\' : _ : '\'' : _) = lex str -lex' str@('\'' : _ : '\'' : _) = lex str -lex' ('\'' : rest) = [("'", rest)] -lex' str = lex str + -- Transform a list of Haskell/CPP lines into a list of tokens + go :: RealSrcLoc -> [Either String String] -> [Located L.Token] + go _ [] = [] + go pos ls = + let (hLinesRight, ls') = span isRight ls + (cppLinesLeft, rest) = span isLeft ls' + + hSrc = concat [ hLine | Right hLine <- hLinesRight ] + cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ] + + in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of --- | Split input to "first line" string and the rest of it. --- --- Ideally, this should be done simply with @'break' (== '\n')@. However, --- Haskell also allows line-unbreaking (or whatever it is called) so things --- are not as simple and this function deals with that. -spanToNewline :: String -> (String, String) -spanToNewline [] = ([], []) -spanToNewline ('\\':'\n':str) = - let (str', rest) = spanToNewline str - in ('\\':'\n':str', rest) -spanToNewline str@('\n':_) = ("", str) -spanToNewline (c:str) = - let (str', rest) = spanToNewline str - in (c:str', rest) + -- Stuff that fails to lex gets turned into comments + L.PFailed _ss _msg -> + let (src_pos, failed) = mkToken ITunknown pos hSrc + (new_pos, cpp) = mkToken ITlineComment src_pos cppSrc + in failed : cpp : go new_pos rest --- | Split input to whitespace string, (optional) preprocessor directive and --- the rest of it. + -- Successfully lexed + L.POk ss toks -> + let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc + in toks ++ [cpp] ++ go new_pos rest + + -- Manually make a token from a 'String', advancing the cursor position + mkToken tok start' str = + let end = foldl' advanceSrcLoc start' str + in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str)) + + +-- | Split apart the initial file into Haskell source lines ('Left' entries) and +-- CPP lines ('Right' entries). -- --- Again, using something like @'span' 'isSpace'@ would be nice to chunk input --- to whitespace. The problem is with /#/ symbol - if it is placed at the very --- beginning of a line, it should be recognized as preprocessor macro. In any --- other case, it is ordinary Haskell symbol and can be used to declare --- operators. Hence, while dealing with whitespace we also check whether there --- happens to be /#/ symbol just after a newline character - if that is the --- case, we begin treating the whole line as preprocessor macro. -spanSpaceOrCpp :: String -> (String, Maybe String, String) -spanSpaceOrCpp ('\n':'#':str) = - let (str', rest) = spanToNewline str - in ("\n", Just $ '#':str', rest) -spanSpaceOrCpp (c:str') - | isSpace c = - let (space, mcpp, rest) = spanSpaceOrCpp str' - in (c:space, mcpp, rest) -spanSpaceOrCpp str = ("", Nothing, str) - --- | Split input to comment content (including delimiters) and the rest. +-- All characters in the input are present in the output: -- --- Again, some more logic than simple 'span' is required because of Haskell --- comment nesting policy. -chunkComment :: Int -> String -> (String, String) -chunkComment _ [] = ("", "") -chunkComment depth ('{':'-':str) = - let (c, rest) = chunkComment (depth + 1) str - in ("{-" ++ c, rest) -chunkComment depth ('-':'}':str) - | depth == 1 = ("-}", str) - | otherwise = - let (c, rest) = chunkComment (depth - 1) str - in ("-}" ++ c, rest) -chunkComment depth (e:str) = - let (c, rest) = chunkComment depth str - in (e:c, rest) - --- | Assign source location for each chunk in given stream. -tag :: [String] -> [(Span, String)] -tag = - reverse . snd . foldl aux (Position 1 1, []) - where - aux (pos, cs) str = - let pos' = foldl move pos str - in (pos', (Span pos pos', str):cs) - move pos '\n' = pos { posRow = posRow pos + 1, posCol = 1 } - move pos _ = pos { posCol = posCol pos + 1 } - --- | Turn unrecognised chunk stream to more descriptive token stream. -tokenize :: [(Span, String)] -> [Token] -tokenize = - map aux +-- prop> concat . map (either id id) . splitCPP = id +splitCPP :: String -> [Either String String] +splitCPP "" = [] +splitCPP s | isCPPline s = Left l : splitCPP rest + | otherwise = Right l : splitCPP rest where - aux (sp, str) = Token - { tkType = classify str - , tkValue = str - , tkSpan = sp - } + ~(l, rest) = spanToNewline 0 s --- | Classify given string as appropriate Haskell token. + +-- | Heuristic to decide if a line is going to be a CPP line. This should be a +-- cheap operation since it is going to be run on every line being processed. -- --- This method is based on Haskell 98 Report lexical structure description: --- https://www.haskell.org/onlinereport/lexemes.html +-- Right now it just checks if the first non-whitespace character in the first +-- five characters of the line is a '#': -- --- However, this is probably far from being perfect and most probably does not --- handle correctly all corner cases. -classify :: String -> TokenType -classify str - | "--" `isPrefixOf` str = TkComment - | "{-#" `isPrefixOf` str = TkPragma - | "{-" `isPrefixOf` str = TkComment -classify "''" = TkSpecial -classify "'" = TkSpecial -classify str@(c:_) - | isSpace c = TkSpace - | isDigit c = TkNumber - | c `elem` special = TkSpecial - | str `elem` glyphs = TkGlyph - | all (`elem` symbols) str = TkOperator - | c == '#' = TkCpp - | c == '"' = TkString - | c == '\'' = TkChar -classify str - | str `elem` keywords = TkKeyword - | isIdentifier str = TkIdentifier - | otherwise = TkUnknown - -keywords :: [String] -keywords = - [ "as" - , "case" - , "class" - , "data" - , "default" - , "deriving" - , "do" - , "else" - , "hiding" - , "if" - , "import" - , "in" - , "infix" - , "infixl" - , "infixr" - , "instance" - , "let" - , "module" - , "newtype" - , "of" - , "qualified" - , "then" - , "type" - , "where" - , "forall" - , "family" - , "mdo" - ] - -glyphs :: [String] -glyphs = - [ ".." - , ":" - , "::" - , "=" - , "\\" - , "|" - , "<-" - , "->" - , "@" - , "~" - , "~#" - , "=>" - , "-" - , "!" - ] - -special :: [Char] -special = "()[]{},;`" - --- TODO: Add support for any Unicode symbol or punctuation. --- source: http://stackoverflow.com/questions/10548170/what-characters-are-permitted-for-haskell-operators -symbols :: [Char] -symbols = "!#$%&*+./<=>?@\\^|-~:" - -isIdentifier :: String -> Bool -isIdentifier (s:str) - | (isLower' s || isUpper s) && all isAlphaNum' str = True +-- >>> isCPPline "#define FOO 1" +-- True +-- +-- >>> isCPPline "\t\t #ifdef GHC" +-- True +-- +-- >>> isCPPline " #endif" +-- False +-- +isCPPline :: String -> Bool +isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5 + + +-- | Split a "line" off the front of a string, supporting newline escapes. +-- +-- By "line", we understand: the shortest substring ending in a '\n' that is not +-- +-- 1. immediately preceded by a '\\' +-- 2. not inside some (possibly nested) block comment +-- +-- All characters in the input are present in the output: +-- +-- prop> curry (++) . spanToNewLine 0 = id +spanToNewline :: Int -- ^ open '{-' + -> String -- ^ input + -> (String, String) +spanToNewline _ [] = ([], []) +spanToNewline n ('\n':str) | n <= 0 = ("\n", str) +spanToNewline n ('\\':'\n':str) = + let (str', rest) = spanToNewline n str + in ('\\':'\n':str', rest) +spanToNewline n ('{':'-':str) = + let (str', rest) = spanToNewline (n+1) str + in ('{':'-':str', rest) +spanToNewline n ('-':'}':str) = + let (str', rest) = spanToNewline (n-1) str + in ('-':'}':str', rest) +spanToNewline n (c:str) = + let (str', rest) = spanToNewline n str + in (c:str', rest) + + +-- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of +-- Haddock's 'T.Token'. +ghcToks :: [(Located L.Token, String)] -> [T.Token] +ghcToks = reverse . snd . foldl' go (start, []) where - isLower' c = isLower c || c == '_' - isAlphaNum' c = isAlphaNum c || c == '_' || c == '\'' -isIdentifier _ = False + start = mkRealSrcLoc (mkFastString "lexing") 1 1 + + go :: (RealSrcLoc, [T.Token]) -- ^ current position, tokens accumulated + -> (Located L.Token, String) -- ^ next token, its content + -> (RealSrcLoc, [T.Token]) -- ^ new position, new tokens accumulated + go (pos, toks) (L l tok, raw) = ( next_pos + , classifiedTok ++ maybeToList white ++ toks + ) + where + (next_pos, white) = mkWhitespace pos l + classifiedTok = [ Token (classify tok) raw rss + | RealSrcSpan rss <- [l] + , not (null raw) + ] + +-- | Find the correct amount of whitespace between tokens. +mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token) +mkWhitespace prev spn = + case spn of + UnhelpfulSpan _ -> (prev,Nothing) + RealSrcSpan s | null wsstring -> (end, Nothing) + | otherwise -> (end, Just (Token TkSpace wsstring wsspan)) + where + start = realSrcSpanStart s + end = realSrcSpanEnd s + wsspan = mkRealSrcSpan prev start + nls = srcLocLine start - srcLocLine prev + spaces = if nls == 0 then srcLocCol start - srcLocCol prev + else srcLocCol start - 1 + wsstring = replicate nls '\n' ++ replicate spaces ' ' + + +-- | Classify given string as appropriate Haskell token. +classify :: L.Token -> TokenType +classify tok = + case tok of + ITas -> TkKeyword + ITcase -> TkKeyword + ITclass -> TkKeyword + ITdata -> TkKeyword + ITdefault -> TkKeyword + ITderiving -> TkKeyword + ITdo -> TkKeyword + ITelse -> TkKeyword + IThiding -> TkKeyword + ITforeign -> TkKeyword + ITif -> TkKeyword + ITimport -> TkKeyword + ITin -> TkKeyword + ITinfix -> TkKeyword + ITinfixl -> TkKeyword + ITinfixr -> TkKeyword + ITinstance -> TkKeyword + ITlet -> TkKeyword + ITmodule -> TkKeyword + ITnewtype -> TkKeyword + ITof -> TkKeyword + ITqualified -> TkKeyword + ITthen -> TkKeyword + ITtype -> TkKeyword + ITwhere -> TkKeyword + + ITforall {} -> TkKeyword + ITexport -> TkKeyword + ITlabel -> TkKeyword + ITdynamic -> TkKeyword + ITsafe -> TkKeyword + ITinterruptible -> TkKeyword + ITunsafe -> TkKeyword + ITstdcallconv -> TkKeyword + ITccallconv -> TkKeyword + ITcapiconv -> TkKeyword + ITprimcallconv -> TkKeyword + ITjavascriptcallconv -> TkKeyword + ITmdo -> TkKeyword + ITfamily -> TkKeyword + ITrole -> TkKeyword + ITgroup -> TkKeyword + ITby -> TkKeyword + ITusing -> TkKeyword + ITpattern -> TkKeyword + ITstatic -> TkKeyword + ITstock -> TkKeyword + ITanyclass -> TkKeyword + + ITunit -> TkKeyword + ITsignature -> TkKeyword + ITdependency -> TkKeyword + ITrequires -> TkKeyword + + ITinline_prag {} -> TkPragma + ITspec_prag {} -> TkPragma + ITspec_inline_prag {} -> TkPragma + ITsource_prag {} -> TkPragma + ITrules_prag {} -> TkPragma + ITwarning_prag {} -> TkPragma + ITdeprecated_prag {} -> TkPragma + ITline_prag -> TkPragma + ITscc_prag {} -> TkPragma + ITgenerated_prag {} -> TkPragma + ITcore_prag {} -> TkPragma + ITunpack_prag {} -> TkPragma + ITnounpack_prag {} -> TkPragma + ITann_prag {} -> TkPragma + ITcomplete_prag {} -> TkPragma + ITclose_prag -> TkPragma + IToptions_prag {} -> TkPragma + ITinclude_prag {} -> TkPragma + ITlanguage_prag -> TkPragma + ITvect_prag {} -> TkPragma + ITvect_scalar_prag {} -> TkPragma + ITnovect_prag {} -> TkPragma + ITminimal_prag {} -> TkPragma + IToverlappable_prag {} -> TkPragma + IToverlapping_prag {} -> TkPragma + IToverlaps_prag {} -> TkPragma + ITincoherent_prag {} -> TkPragma + ITctype {} -> TkPragma + + ITdotdot -> TkGlyph + ITcolon -> TkGlyph + ITdcolon {} -> TkGlyph + ITequal -> TkGlyph + ITlam -> TkGlyph + ITlcase -> TkGlyph + ITvbar -> TkGlyph + ITlarrow {} -> TkGlyph + ITrarrow {} -> TkGlyph + ITat -> TkGlyph + ITtilde -> TkGlyph + ITtildehsh -> TkGlyph + ITdarrow {} -> TkGlyph + ITminus -> TkGlyph + ITbang -> TkGlyph + ITdot -> TkOperator + ITtypeApp -> TkGlyph + + ITbiglam -> TkGlyph + + ITocurly -> TkSpecial + ITccurly -> TkSpecial + ITvocurly -> TkSpecial + ITvccurly -> TkSpecial + ITobrack -> TkSpecial + ITopabrack -> TkSpecial + ITcpabrack -> TkSpecial + ITcbrack -> TkSpecial + IToparen -> TkSpecial + ITcparen -> TkSpecial + IToubxparen -> TkSpecial + ITcubxparen -> TkSpecial + ITsemi -> TkSpecial + ITcomma -> TkSpecial + ITunderscore -> TkIdentifier + ITbackquote -> TkSpecial + ITsimpleQuote -> TkSpecial + + ITvarid {} -> TkIdentifier + ITconid {} -> TkIdentifier + ITvarsym {} -> TkOperator + ITconsym {} -> TkOperator + ITqvarid {} -> TkIdentifier + ITqconid {} -> TkIdentifier + ITqvarsym {} -> TkOperator + ITqconsym {} -> TkOperator + + ITdupipvarid {} -> TkUnknown + ITlabelvarid {} -> TkUnknown + + ITchar {} -> TkChar + ITstring {} -> TkString + ITinteger {} -> TkNumber + ITrational {} -> TkNumber + + ITprimchar {} -> TkChar + ITprimstring {} -> TkString + ITprimint {} -> TkNumber + ITprimword {} -> TkNumber + ITprimfloat {} -> TkNumber + ITprimdouble {} -> TkNumber + + ITopenExpQuote {} -> TkSpecial + ITopenPatQuote -> TkSpecial + ITopenDecQuote -> TkSpecial + ITopenTypQuote -> TkSpecial + ITcloseQuote {} -> TkSpecial + ITopenTExpQuote {} -> TkSpecial + ITcloseTExpQuote -> TkSpecial + ITidEscape {} -> TkUnknown + ITparenEscape -> TkSpecial + ITidTyEscape {} -> TkUnknown + ITparenTyEscape -> TkSpecial + ITtyQuote -> TkSpecial + ITquasiQuote {} -> TkUnknown + ITqQuasiQuote {} -> TkUnknown + + ITproc -> TkKeyword + ITrec -> TkKeyword + IToparenbar {} -> TkGlyph + ITcparenbar {} -> TkGlyph + ITlarrowtail {} -> TkGlyph + ITrarrowtail {} -> TkGlyph + ITLarrowtail {} -> TkGlyph + ITRarrowtail {} -> TkGlyph + + ITunknown {} -> TkUnknown + ITeof -> TkUnknown + + -- Line comments are only supposed to start with '--'. Starting with '#' + -- means that this was probably a CPP. + ITlineComment s + | isCPPline s -> TkCpp + | otherwise -> TkComment + + ITdocCommentNext {} -> TkComment + ITdocCommentPrev {} -> TkComment + ITdocCommentNamed {} -> TkComment + ITdocSection {} -> TkComment + ITdocOptions {} -> TkComment + + -- The lexer considers top-level pragmas as comments (see `pragState` in + -- the GHC lexer for more), so we have to manually reverse this. The + -- following is a hammer: it smashes _all_ pragma-like block comments into + -- pragmas. + ITblockComment c + | isPrefixOf "{-#" c + , isSuffixOf "#-}" c -> TkPragma + | otherwise -> TkComment diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs index 15793f0c..27bf7605 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Renderer.hs @@ -82,7 +82,7 @@ header mcss mjs = tokenGroup :: SrcMap -> TokenGroup -> Html tokenGroup _ (GrpNormal tok@(Token { .. })) - | tkType == TkSpace = renderSpace (posRow . spStart $ tkSpan) tkValue + | tkType == TkSpace = renderSpace (GHC.srcSpanStartLine tkSpan) tkValue | otherwise = tokenSpan tok ! attrs where attrs = [ multiclass . tokenStyle $ tkType ] @@ -155,7 +155,7 @@ internalHyperlink name content = Html.anchor content ! [ Html.href $ "#" ++ internalAnchorIdent name ] externalNameHyperlink :: SrcMap -> GHC.Name -> Html -> Html -externalNameHyperlink (srcs, _) name content = case Map.lookup mdl srcs of +externalNameHyperlink srcs name content = case Map.lookup mdl srcs of Just SrcLocal -> Html.anchor content ! [ Html.href $ hypSrcModuleNameUrl mdl name ] Just (SrcExternal path) -> Html.anchor content ! @@ -165,12 +165,14 @@ externalNameHyperlink (srcs, _) name content = case Map.lookup mdl srcs of mdl = GHC.nameModule name externalModHyperlink :: SrcMap -> GHC.ModuleName -> Html -> Html -externalModHyperlink (_, srcs) name content = case Map.lookup name srcs of - Just SrcLocal -> Html.anchor content ! +externalModHyperlink srcs name content = + let srcs' = Map.mapKeys GHC.moduleName srcs in + case Map.lookup name srcs' of + Just SrcLocal -> Html.anchor content ! [ Html.href $ hypSrcModuleUrl' name ] - Just (SrcExternal path) -> Html.anchor content ! + Just (SrcExternal path) -> Html.anchor content ! [ Html.href $ path </> hypSrcModuleUrl' name ] - Nothing -> content + Nothing -> content renderSpace :: Int -> String -> Html diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs index d8ae89e4..e377471e 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Types.hs @@ -4,8 +4,6 @@ module Haddock.Backends.Hyperlinker.Types where import qualified GHC import Data.Map (Map) -import qualified Data.Map as Map - data Token = Token { tkType :: TokenType @@ -14,23 +12,8 @@ data Token = Token } deriving (Show) -data Position = Position - { posRow :: !Int - , posCol :: !Int - } - deriving (Eq, Ord, Show) - -data Span = Span - { spStart :: !Position - , spEnd :: !Position - } - deriving (Show) - --- | Tests whether the first span "contains" the other span, meaning --- that it covers at least as much source code. True where spans are equal. -containsSpan :: Span -> Span -> Bool -containsSpan s1 s2 = - spStart s1 <= spStart s2 && spEnd s1 >= spEnd s2 +type Position = GHC.RealSrcLoc +type Span = GHC.RealSrcSpan data TokenType = TkIdentifier @@ -80,15 +63,5 @@ data SrcPath | SrcLocal -- | Mapping from modules to cross-package source paths. --- --- This mapping is actually a pair of maps instead of just one map. The reason --- for this is because when hyperlinking modules in import lists we have no --- 'GHC.Module' available. On the other hand, we can't just use map with --- 'GHC.ModuleName' as indices because certain modules may have common name --- but originate in different packages. Hence, we use both /rich/ and /poor/ --- versions, where the /poor/ is just projection of /rich/ one cached in pair --- for better performance. -type SrcMap = (Map GHC.Module SrcPath, Map GHC.ModuleName SrcPath) +type SrcMap = Map GHC.Module SrcPath -mkSrcMap :: Map GHC.Module SrcPath -> SrcMap -mkSrcMap srcs = (srcs, Map.mapKeys GHC.moduleName srcs) diff --git a/haddock-api/src/Haddock/Interface/Create.hs b/haddock-api/src/Haddock/Interface/Create.hs index 2a56e87a..4309163f 100644 --- a/haddock-api/src/Haddock/Interface/Create.hs +++ b/haddock-api/src/Haddock/Interface/Create.hs @@ -20,7 +20,6 @@ module Haddock.Interface.Create (createInterface) where import Documentation.Haddock.Doc (metaDocAppend) -import Documentation.Haddock.Utf8 as Utf8 import Haddock.Types import Haddock.Options import Haddock.GhcUtils @@ -33,15 +32,14 @@ import Haddock.Backends.Hyperlinker.Parser as Hyperlinker import Data.Bifunctor import Data.Bitraversable -import qualified Data.ByteString as BS import qualified Data.Map as M import Data.Map (Map) import Data.List import Data.Maybe import Data.Ord import Control.Applicative -import Control.Exception (evaluate) import Control.Monad +import Control.DeepSeq import Data.Traversable import Avail hiding (avail) @@ -160,7 +158,7 @@ createInterface tm flags modMap instIfaceMap = do modWarn <- liftErrMsg (moduleWarning dflags gre warnings) - tokenizedSrc <- mkMaybeTokenizedSrc flags tm + tokenizedSrc <- mkMaybeTokenizedSrc dflags flags tm return $! Interface { ifaceMod = mdl @@ -1137,12 +1135,12 @@ seqList :: [a] -> () seqList [] = () seqList (x : xs) = x `seq` seqList xs -mkMaybeTokenizedSrc :: [Flag] -> TypecheckedModule +mkMaybeTokenizedSrc :: DynFlags -> [Flag] -> TypecheckedModule -> ErrMsgGhc (Maybe [RichToken]) -mkMaybeTokenizedSrc flags tm +mkMaybeTokenizedSrc dflags flags tm | Flag_HyperlinkedSource `elem` flags = case renamedSource tm of Just src -> do - tokens <- liftGhcToErrMsgGhc . liftIO $ mkTokenizedSrc summary src + tokens <- liftGhcToErrMsgGhc (liftIO (mkTokenizedSrc dflags summary src)) return $ Just tokens Nothing -> do liftErrMsg . tell . pure $ concat @@ -1155,12 +1153,14 @@ mkMaybeTokenizedSrc flags tm where summary = pm_mod_summary . tm_parsed_module $ tm -mkTokenizedSrc :: ModSummary -> RenamedSource -> IO [RichToken] -mkTokenizedSrc ms src = do - -- make sure to read the whole file at once otherwise +mkTokenizedSrc :: DynFlags -> ModSummary -> RenamedSource -> IO [RichToken] +mkTokenizedSrc dflags ms src = do + -- make sure to read the whole file at once otherwise -- we run out of file descriptors (see #495) - rawSrc <- BS.readFile (msHsFilePath ms) >>= evaluate - return $ Hyperlinker.enrich src (Hyperlinker.parse (decodeUtf8 rawSrc)) + file <- force <$> readFile (filepath) + return $ Hyperlinker.enrich src (Hyperlinker.parse dflags filepath file) + where + filepath = msHsFilePath ms -- | Find a stand-alone documentation comment by its name. findNamedDoc :: String -> [HsDecl GhcRn] -> ErrMsgM (Maybe HsDocString) diff --git a/haddock-api/test/Haddock/Backends/Hyperlinker/ParserSpec.hs b/haddock-api/test/Haddock/Backends/Hyperlinker/ParserSpec.hs index 8cd2690e..dcb30e41 100644 --- a/haddock-api/test/Haddock/Backends/Hyperlinker/ParserSpec.hs +++ b/haddock-api/test/Haddock/Backends/Hyperlinker/ParserSpec.hs @@ -4,95 +4,138 @@ module Haddock.Backends.Hyperlinker.ParserSpec (main, spec) where import Test.Hspec import Test.QuickCheck +import qualified GHC +import Control.Monad.IO.Class + +import Haddock (getGhcDirs) import Haddock.Backends.Hyperlinker.Parser import Haddock.Backends.Hyperlinker.Types +withDynFlags :: (GHC.DynFlags -> IO ()) -> IO () +withDynFlags cont = do + libDir <- fmap snd (getGhcDirs []) + GHC.runGhc (Just libDir) $ do + dflags <- GHC.getSessionDynFlags + liftIO $ cont dflags + main :: IO () main = hspec spec spec :: Spec -spec = do - describe "parse" parseSpec +spec = describe "parse" parseSpec -parseSpec :: Spec -parseSpec = do +-- | Defined for its instance of 'Arbitrary' +newtype NoTabs = NoTabs String deriving (Show, Eq) - it "is total" $ - property $ \src -> length (parse src) `shouldSatisfy` (>= 0) +noTabs :: String -> Bool +noTabs = all (\c -> c `notElem` "\r\t\f\v") - it "retains file layout" $ - property $ \src -> concatMap tkValue (parse src) == src +-- | Does not generate content with space characters other than ' ' and '\n' +instance Arbitrary NoTabs where + arbitrary = fmap NoTabs (arbitrary `suchThat` noTabs) + shrink (NoTabs src) = [ NoTabs shrunk | shrunk <- shrink src, noTabs shrunk ] - context "when parsing single-line comments" $ do - it "should ignore content until the end of line" $ - "-- some very simple comment\nidentifier" - `shouldParseTo` - [TkComment, TkSpace, TkIdentifier] +parseSpec :: Spec +parseSpec = around withDynFlags $ do - it "should allow endline escaping" $ - "-- first line\\\nsecond line\\\nand another one" - `shouldParseTo` - [TkComment] + it "is total" $ \dflags -> + property $ \src -> length (parse dflags "" src) `shouldSatisfy` (>= 0) - context "when parsing multi-line comments" $ do + it "retains file layout" $ \dflags -> + property $ \(NoTabs src) -> concatMap tkValue (parse dflags "" src) == src - it "should support nested comments" $ - "{- comment {- nested -} still comment -} {- next comment -}" - `shouldParseTo` - [TkComment, TkSpace, TkComment] + context "when parsing single-line comments" $ do + + it "should ignore content until the end of line" $ \dflags -> + shouldParseTo + "-- some very simple comment\nidentifier" + [TkComment, TkSpace, TkIdentifier] + dflags - it "should distinguish compiler pragma" $ - "{- comment -}{-# LANGUAGE GADTs #-}{- comment -}" - `shouldParseTo` - [TkComment, TkPragma, TkComment] + it "should allow endline escaping" $ \dflags -> + shouldParseTo + "#define first line\\\nsecond line\\\nand another one" + [TkCpp] + dflags - it "should recognize preprocessor directives" $ do - "\n#define foo bar" `shouldParseTo` [TkSpace, TkCpp] - "x # y" `shouldParseTo` - [TkIdentifier, TkSpace, TkOperator, TkSpace,TkIdentifier] + context "when parsing multi-line comments" $ do - it "should distinguish basic language constructs" $ do - "(* 2) <$> (\"abc\", foo)" `shouldParseTo` + it "should support nested comments" $ \dflags -> + shouldParseTo + "{- comment {- nested -} still comment -} {- next comment -}" + [TkComment, TkSpace, TkComment] + dflags + + it "should distinguish compiler pragma" $ \dflags -> + shouldParseTo + "{- comment -}{-# LANGUAGE GADTs #-}{- comment -}" + [TkComment, TkPragma, TkComment] + dflags + + it "should recognize preprocessor directives" $ \dflags -> do + shouldParseTo + "\n#define foo bar" + [TkSpace, TkCpp] + dflags + shouldParseTo + "x # y" + [TkIdentifier, TkSpace, TkOperator, TkSpace,TkIdentifier] + dflags + + it "should distinguish basic language constructs" $ \dflags -> do + + shouldParseTo + "(* 2) <$> (\"abc\", foo)" [ TkSpecial, TkOperator, TkSpace, TkNumber, TkSpecial , TkSpace, TkOperator, TkSpace , TkSpecial, TkString, TkSpecial, TkSpace, TkIdentifier, TkSpecial ] - "let foo' = foo in foo' + foo'" `shouldParseTo` + dflags + + shouldParseTo + "let foo' = foo in foo' + foo'" [ TkKeyword, TkSpace, TkIdentifier , TkSpace, TkGlyph, TkSpace , TkIdentifier, TkSpace, TkKeyword, TkSpace , TkIdentifier, TkSpace, TkOperator, TkSpace, TkIdentifier ] - "square x = y^2 where y = x" `shouldParseTo` + dflags + + shouldParseTo + "square x = y^2 where y = x" [ TkIdentifier, TkSpace, TkIdentifier , TkSpace, TkGlyph, TkSpace , TkIdentifier, TkOperator, TkNumber , TkSpace, TkKeyword, TkSpace , TkIdentifier, TkSpace, TkGlyph, TkSpace, TkIdentifier ] + dflags - it "should parse do-notation syntax" $ do - "do { foo <- getLine; putStrLn foo }" `shouldParseTo` + it "should parse do-notation syntax" $ \dflags -> do + shouldParseTo + "do { foo <- getLine; putStrLn foo }" [ TkKeyword, TkSpace, TkSpecial, TkSpace , TkIdentifier, TkSpace, TkGlyph, TkSpace , TkIdentifier, TkSpecial, TkSpace , TkIdentifier, TkSpace, TkIdentifier, TkSpace, TkSpecial ] - - unlines - [ "do" - , " foo <- getLine" - , " putStrLn foo" - ] `shouldParseTo` + dflags + + shouldParseTo + (unlines + [ "do" + , " foo <- getLine" + , " putStrLn foo" + ]) [ TkKeyword, TkSpace, TkIdentifier , TkSpace, TkGlyph, TkSpace, TkIdentifier, TkSpace , TkIdentifier, TkSpace, TkIdentifier, TkSpace ] - - -shouldParseTo :: String -> [TokenType] -> Expectation -str `shouldParseTo` tokens = map tkType (parse str) `shouldBe` tokens + dflags + where + shouldParseTo :: String -> [TokenType] -> GHC.DynFlags -> Expectation + shouldParseTo str tokens dflags = map tkType (parse dflags "" str) `shouldBe` tokens diff --git a/hypsrc-test/ref/src/Identifiers.html b/hypsrc-test/ref/src/Identifiers.html index f52db4ab..ce69ad37 100644 --- a/hypsrc-test/ref/src/Identifiers.html +++ b/hypsrc-test/ref/src/Identifiers.html @@ -827,11 +827,7 @@ > </span ><a href="Identifiers.html#norf" ><span class="hs-identifier hs-var" - >Identifiers</span - ><span class="hs-operator hs-var" - >.</span - ><span class="hs-identifier hs-var" - >norf</span + >Identifiers.norf</span ></a ><span > </span @@ -931,4 +927,4 @@ ></pre ></body ></html ->
\ No newline at end of file +> |