From 1b26460fb3b5df5215cc1e6715661cbc7c950085 Mon Sep 17 00:00:00 2001 From: Alec Theriault Date: Thu, 31 Jan 2019 01:37:25 -0800 Subject: Use `.hie` files for the Hyperlinker backend (#977) # Summary This is a large architectural change to the Hyperlinker. * extract link (and now also type) information from `.hie` instead of doing ad-hoc SYB traversals of the `RenamedSource`. Also adds a superb type-on-hover feature (#715). * re-engineer the lexer to avoid needless string conversions. By going directly through GHC's `P` monad and taking bytestring slices, we avoid a ton of allocation and have better handling of position pragmas and CPP. In terms of performance, the Haddock side of things has gotten _much_ more efficient. Unfortunately, much of this is cancelled out by the increased GHC workload for generating `.hie` files. For the full set of boot libs (including `ghc`-the-library) * the sum of total time went down by 9-10% overall * the sum of total allocations went down by 6-7% # Motivation Haddock is moving towards working entirely over `.hi` and `.hie` files. This change means we no longer need the `RenamedSource` from `TypecheckedModule` (something which is _not_ in `.hi` files). # Details Along the way a bunch of things were fixed: * Cross package (and other) links are now more reliable (#496) * The lexer tries to recover from errors on every line (instead of at CPP boundaries) * `LINE`/`COLUMN` pragmas are taken into account * filter out zero length tokens before rendering * avoid recomputing the `ModuleName`-based `SrcMap` * remove the last use of `Documentation.Haddock.Utf8` (see #998) * restructure temporary folder logic for `.hi`/`.hie` model --- haddock-api/src/Haddock/GhcUtils.hs | 134 +++++++++++++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 3 deletions(-) (limited to 'haddock-api/src/Haddock/GhcUtils.hs') diff --git a/haddock-api/src/Haddock/GhcUtils.hs b/haddock-api/src/Haddock/GhcUtils.hs index cdaf6ae4..a342de00 100644 --- a/haddock-api/src/Haddock/GhcUtils.hs +++ b/haddock-api/src/Haddock/GhcUtils.hs @@ -19,10 +19,12 @@ module Haddock.GhcUtils where import Control.Arrow +import Data.Char ( isSpace ) + import Haddock.Types( DocNameI ) import Exception -import Outputable +import Outputable ( Outputable, panic, showPpr ) import Name import NameSet import Module @@ -30,6 +32,14 @@ import HscTypes import GHC import Class import DynFlags +import SrcLoc ( advanceSrcLoc ) + +import StringBuffer ( StringBuffer ) +import qualified StringBuffer as S + +import Data.ByteString ( ByteString ) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Internal as BS moduleString :: Module -> String @@ -413,11 +423,129 @@ minimalDef n = do ------------------------------------------------------------------------------- -setObjectDir, setHiDir, setStubDir, setOutputDir :: String -> DynFlags -> DynFlags +setObjectDir, setHiDir, setHieDir, setStubDir, setOutputDir :: String -> DynFlags -> DynFlags setObjectDir f d = d{ objectDir = Just f} setHiDir f d = d{ hiDir = Just f} +setHieDir f d = d{ hieDir = Just f} setStubDir f d = d{ stubDir = Just f , includePaths = addGlobalInclude (includePaths d) [f] } -- -stubdir D adds an implicit -I D, so that gcc can find the _stub.h file -- \#included from the .hc file when compiling with -fvia-C. -setOutputDir f = setObjectDir f . setHiDir f . setStubDir f +setOutputDir f = setObjectDir f . setHiDir f . setHieDir f . setStubDir f + + +------------------------------------------------------------------------------- +-- * 'StringBuffer' and 'ByteString' +------------------------------------------------------------------------------- +-- We get away with a bunch of these functions because 'StringBuffer' and +-- 'ByteString' have almost exactly the same structure. + +-- | Convert a UTF-8 encoded 'ByteString' into a 'StringBuffer. This really +-- relies on the internals of both 'ByteString' and 'StringBuffer'. +-- +-- /O(n)/ (but optimized into a @memcpy@ by @bytestring@ under the hood) +stringBufferFromByteString :: ByteString -> StringBuffer +stringBufferFromByteString bs = + let BS.PS fp off len = bs <> BS.pack [0,0,0] + in S.StringBuffer { S.buf = fp, S.len = len - 3, S.cur = off } + +-- | Take the first @n@ /bytes/ of the 'StringBuffer' and put them in a +-- 'ByteString'. +-- +-- /O(1)/ +takeStringBuffer :: Int -> StringBuffer -> ByteString +takeStringBuffer !n !(S.StringBuffer fp _ cur) = BS.PS fp cur n + +-- | Return the prefix of the first 'StringBuffer' that /isn't/ in the second +-- 'StringBuffer'. **The behavior is undefined if the 'StringBuffers' use +-- separate buffers.** +-- +-- /O(1)/ +splitStringBuffer :: StringBuffer -> StringBuffer -> ByteString +splitStringBuffer buf1 buf2 = takeStringBuffer n buf1 + where n = S.byteDiff buf1 buf2 + +-- | Split the 'StringBuffer' at the next newline (or the end of the buffer). +-- Also: initial position is passed in and the updated position is returned. +-- +-- /O(n)/ (but /O(1)/ space) +spanLine :: RealSrcLoc -> StringBuffer -> (ByteString, RealSrcLoc, StringBuffer) +spanLine !loc !buf = go loc buf + where + + go !l !b + | not (S.atEnd b) + = case S.nextChar b of + ('\n', b') -> (splitStringBuffer buf b', advanceSrcLoc l '\n', b') + (c, b') -> go (advanceSrcLoc l c) b' + | otherwise + = (splitStringBuffer buf b, advanceSrcLoc l '\n', b) + +-- | Given a start position and a buffer with that start position, split the +-- buffer at an end position. +-- +-- /O(n)/ (but /O(1)/ space) +spanPosition :: RealSrcLoc -- ^ start of buffeer + -> RealSrcLoc -- ^ position until which to take + -> StringBuffer -- ^ buffer from which to take + -> (ByteString, StringBuffer) +spanPosition !start !end !buf = go start buf + where + + go !l !b + | l < end + , not (S.atEnd b) + , (c, b') <- S.nextChar b + = go (advanceSrcLoc l c) b' + | otherwise + = (splitStringBuffer buf b, b) + +-- | Try to parse a line of CPP from the from of the buffer. A \"line\" of CPP +-- consists of +-- +-- * at most 10 whitespace characters, including at least one newline +-- * a @#@ character +-- * keep parsing lines until you find a line not ending in @\\@. +-- +-- This is chock full of heuristics about what a line of CPP is. +-- +-- /O(n)/ (but /O(1)/ space) +tryCppLine :: RealSrcLoc -> StringBuffer -> Maybe (ByteString, RealSrcLoc, StringBuffer) +tryCppLine !loc !buf = spanSpace (S.prevChar buf '\n' == '\n') loc buf + where + + -- Keep consuming space characters until we hit either a @#@ or something + -- else. If we hit a @#@, start parsing CPP + spanSpace !seenNl !l !b + | S.atEnd b + = Nothing + | otherwise + = case S.nextChar b of + ('#' , b') | not (S.atEnd b') + , ('-', b'') <- S.nextChar b' + , ('}', _) <- S.nextChar b'' + -> Nothing -- Edge case exception for @#-}@ + | seenNl + -> Just (spanCppLine (advanceSrcLoc l '#') b') -- parse CPP + | otherwise + -> Nothing -- We didn't see a newline, so this can't be CPP! + + (c , b') | isSpace c -> spanSpace (seenNl || c == '\n') + (advanceSrcLoc l c) b' + | otherwise -> Nothing + + -- Consume a CPP line to its "end" (basically the first line that ends not + -- with a @\@ character) + spanCppLine !l !b + | S.atEnd b + = (splitStringBuffer buf b, l, b) + | otherwise + = case S.nextChar b of + ('\\', b') | not (S.atEnd b') + , ('\n', b'') <- S.nextChar b' + -> spanCppLine (advanceSrcLoc (advanceSrcLoc l '\\') '\n') b'' + + ('\n', b') -> (splitStringBuffer buf b', advanceSrcLoc l '\n', b') + + (c , b') -> spanCppLine (advanceSrcLoc l c) b' + -- cgit v1.2.3