From 1b26460fb3b5df5215cc1e6715661cbc7c950085 Mon Sep 17 00:00:00 2001
From: Alec Theriault
Date: Thu, 31 Jan 2019 01:37:25 -0800
Subject: Use `.hie` files for the Hyperlinker backend (#977)

# Summary

This is a large architectural change to the Hyperlinker:

* extract link (and now also type) information from `.hie` files instead of
  doing ad-hoc SYB traversals of the `RenamedSource`. This also adds a superb
  type-on-hover feature (#715).

* re-engineer the lexer to avoid needless string conversions. By going
  directly through GHC's `P` monad and taking bytestring slices (see the
  sketch just below), we avoid a ton of allocation and get better handling
  of position pragmas and CPP.
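To make the slicing point concrete: a strict `ByteString` slice is just a new
offset and length over the same underlying buffer, so extracting a token's
text allocates a constant-size view instead of a fresh `[Char]`. A minimal
standalone sketch using the plain `bytestring` API (illustrative only, not
Haddock's code):

```haskell
{-# LANGUAGE OverloadedStrings #-}
import qualified Data.ByteString as BS

-- 'BS.drop' and 'BS.take' are O(1) and share the source buffer, so a
-- token's text costs a small constant-size wrapper, not a copied string.
sliceToken :: Int            -- ^ byte offset of the token
           -> Int            -- ^ byte length of the token
           -> BS.ByteString  -- ^ the whole source file
           -> BS.ByteString
sliceToken off len = BS.take len . BS.drop off

main :: IO ()
main = print (sliceToken 7 3 "module Foo where")  -- prints "Foo"
```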
In terms of performance, the Haddock side of things has gotten _much_ more
efficient. Unfortunately, much of this is cancelled out by the increased GHC
workload for generating `.hie` files. For the full set of boot libs
(including `ghc`-the-library):

* the sum of total time went down by 9-10% overall
* the sum of total allocations went down by 6-7%

# Motivation

Haddock is moving towards working entirely over `.hi` and `.hie` files. This
change means we no longer need the `RenamedSource` from `TypecheckedModule`
(something which is _not_ in `.hi` files).

# Details

Along the way, a bunch of things were fixed:

* Cross-package (and other) links are now more reliable (#496)
* The lexer tries to recover from errors on every line (instead of only at
  CPP boundaries)
* `LINE`/`COLUMN` pragmas are taken into account (see the sketch after this
  list)
* Zero-length tokens are filtered out before rendering
* The `ModuleName`-based `SrcMap` is no longer recomputed
* The last use of `Documentation.Haddock.Utf8` is removed (see #998)
* The temporary-folder logic is restructured for the `.hi`/`.hie` model
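Of these, the `LINE`/`COLUMN` bookkeeping is worth spelling out. Here is a toy
model of the arithmetic, with a hypothetical `Pos` record standing in for
GHC's `RealSrcLoc`; it mirrors the `ITline_prag`/`ITcolumn_prag` cases in the
patch below:

```haskell
-- Hypothetical stand-in for GHC's 'RealSrcLoc'.
data Pos = Pos { posFile :: FilePath, posLine :: Int, posCol :: Int }
  deriving Show

-- After {-# LINE n "file" #-}, the *next* source line must report as
-- line n of "file". We therefore store n - 1: advancing past the
-- newline that ends the pragma line bumps the count back up to n.
applyLinePragma :: Int -> FilePath -> Pos -> Pos
applyLinePragma n file pos = pos { posFile = file, posLine = n - 1 }

-- {-# COLUMN c #-} only overrides the column.
applyColumnPragma :: Int -> Pos -> Pos
applyColumnPragma c pos = pos { posCol = c }
```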
---
 .../src/Haddock/Backends/Hyperlinker/Parser.hs | 386 ++++++++++-----------
 1 file changed, 190 insertions(+), 196 deletions(-)

diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
index f8494242..1d5576cc 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
@@ -1,213 +1,212 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
 module Haddock.Backends.Hyperlinker.Parser (parse) where
 
-import Data.Either ( isRight, isLeft )
-import Data.List ( foldl', isPrefixOf, isSuffixOf )
-import Data.Maybe ( maybeToList )
-import Data.Char ( isSpace )
-import qualified Text.Read as R
+import Control.Applicative ( Alternative(..) )
+import Data.List ( isPrefixOf, isSuffixOf )
 
-import GHC ( DynFlags, addSourceToTokens )
-import SrcLoc
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Char8 as BSC
+
+import GHC.LanguageExtensions.Type
+
+import BasicTypes ( IntegralLit(..) )
+import DynFlags
+import qualified EnumSet as E
+import ErrUtils ( emptyMessages )
 import FastString ( mkFastString )
-import StringBuffer ( stringToStringBuffer )
-import Lexer ( Token(..) )
-import qualified Lexer as L
+import Lexer ( P(..), ParseResult(..), PState(..), Token(..)
+             , mkPStatePure, lexer, mkParserFlags' )
+import Outputable ( showSDoc, panic )
+import SrcLoc
+import StringBuffer ( StringBuffer, atEnd )
 
 import Haddock.Backends.Hyperlinker.Types as T
-
+import Haddock.GhcUtils
 
 -- | Turn source code string into a stream of more descriptive tokens.
 --
--- Result should retain original file layout (including comments, whitespace,
--- etc.), i.e. the following "law" should hold:
---
--- prop> concat . map tkValue . parse = id
---
--- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v'
--- characters, since GHC transforms those into ' ' and '\n')
-parse :: DynFlags -> FilePath -> String -> [T.Token]
-parse dflags fp = ghcToks . processCPP dflags fp . filterCRLF
+-- Result should retain original file layout (including comments,
+-- whitespace, and CPP).
+parse
+  :: CompilerInfo   -- ^ underlying CC compiler (whatever expanded the CPP)
+  -> DynFlags       -- ^ flags for this module
+  -> FilePath       -- ^ path to the source of this module
+  -> BS.ByteString  -- ^ raw UTF-8 encoded source of this module
+  -> [T.Token]
+parse comp dflags fpath bs = case unP (go False []) initState of
+    POk _ toks -> reverse toks
+    PFailed _ ss errMsg -> panic $ "Hyperlinker parse error at " ++ show ss ++
+                                   ": " ++ showSDoc dflags errMsg
   where
-    -- Remove CRLFs from source
-    filterCRLF :: String -> String
-    filterCRLF ('\r':'\n':cs) = '\n' : filterCRLF cs
-    filterCRLF (c:cs) = c : filterCRLF cs
-    filterCRLF [] = []
--- | Parse the source into tokens using the GHC lexer.
+    initState = mkPStatePure pflags buf start
+    buf = stringBufferFromByteString bs
+    start = mkRealSrcLoc (mkFastString fpath) 1 1
+    needPragHack' = needPragHack comp dflags
+    pflags = mkParserFlags' (warningFlags dflags)
+                            (extensionFlags dflags)
+                            (thisPackage dflags)
+                            (safeImportsOn dflags)
+                            False -- lex Haddocks as comment tokens
+                            True  -- produce comment tokens
+                            False -- produce position pragmas tokens
+
+    go :: Bool       -- ^ are we currently in a pragma?
+       -> [T.Token]  -- ^ tokens accumulated so far (in reverse)
+       -> P [T.Token]
+    go inPrag toks = do
+      (b, _) <- getInput
+      if not (atEnd b)
+        then do
+          (newToks, inPrag') <- parseCppLine <|> parsePlainTok inPrag <|> unknownLine
+          go inPrag' (newToks ++ toks)
+        else
+          pure toks
+
+    -- | Like 'Lexer.lexer', but slower, with a better API, and filtering out
+    -- empty tokens
+    wrappedLexer :: P (RealLocated Lexer.Token)
+    wrappedLexer = Lexer.lexer False andThen
+      where andThen (L (RealSrcSpan s) t)
+              | srcSpanStartLine s /= srcSpanEndLine s ||
+                srcSpanStartCol s /= srcSpanEndCol s
+              = pure (L s t)
+            andThen (L (RealSrcSpan s) ITeof) = pure (L s ITeof)
+            andThen _ = wrappedLexer
+
+    -- | Try to parse a CPP line (can fail)
+    parseCppLine :: P ([T.Token], Bool)
+    parseCppLine = do
+      (b, l) <- getInput
+      case tryCppLine l b of
+        Just (cppBStr, l', b')
+          -> let cppTok = T.Token { tkType = TkCpp
+                                  , tkValue = cppBStr
+                                  , tkSpan = mkRealSrcSpan l l' }
+             in setInput (b', l') *> pure ([cppTok], False)
+        _ -> empty
+
+    -- | Try to parse a regular old token (can fail)
+    parsePlainTok :: Bool -> P ([T.Token], Bool)  -- return list is only ever 0-2 elements
+    parsePlainTok inPrag = do
+      (bInit, lInit) <- getInput
+      L sp tok <- Lexer.lexer False return
+      (bEnd, _) <- getInput
+      case sp of
+        UnhelpfulSpan _ -> pure ([], False) -- pretend the token never existed
+        RealSrcSpan rsp -> do
+          let typ = if inPrag then TkPragma else classify tok
+              RealSrcLoc lStart = srcSpanStart sp -- safe since @sp@ is real
+              (spaceBStr, bStart) = spanPosition lInit lStart bInit
+              inPragDef = inPragma inPrag tok
+
+          (bEnd', inPrag') <- case tok of
+
+            -- Update internal line + file position if this is a LINE pragma
+            ITline_prag _ -> tryOrElse (bEnd, inPragDef) $ do
+              L _ (ITinteger (IL { il_value = line })) <- wrappedLexer
+              L _ (ITstring _ file) <- wrappedLexer
+              L spF ITclose_prag <- wrappedLexer
+
+              let newLoc = mkRealSrcLoc file (fromIntegral line - 1) (srcSpanEndCol spF)
+              (bEnd'', _) <- getInput
+              setInput (bEnd'', newLoc)
+
+              pure (bEnd'', False)
+
+            -- Update internal column position if this is a COLUMN pragma
+            ITcolumn_prag _ -> tryOrElse (bEnd, inPragDef) $ do
+              L _ (ITinteger (IL { il_value = col })) <- wrappedLexer
+              L spF ITclose_prag <- wrappedLexer
+
+              let newLoc = mkRealSrcLoc (srcSpanFile spF) (srcSpanEndLine spF) (fromIntegral col)
+              (bEnd'', _) <- getInput
+              setInput (bEnd'', newLoc)
+
+              pure (bEnd'', False)
+
+            -- See 'needPragHack'
+            ITclose_prag{}
+              | needPragHack'
+              , '\n' `BSC.elem` spaceBStr
+              -> getInput >>= \(b,p) -> setInput (b, advanceSrcLoc p '\n') >> pure (bEnd, False)
+
+            _ -> pure (bEnd, inPragDef)
+
+          let tokBStr = splitStringBuffer bStart bEnd'
+              plainTok = T.Token { tkType = typ
+                                 , tkValue = tokBStr
+                                 , tkSpan = rsp }
+              spaceTok = T.Token { tkType = TkSpace
+                                 , tkValue = spaceBStr
+                                 , tkSpan = mkRealSrcSpan lInit lStart }
+
+          pure (plainTok : [ spaceTok | not (BS.null spaceBStr) ], inPrag')
+
+    -- | Parse whatever remains of the line as an unknown token (can't fail)
+    unknownLine :: P ([T.Token], Bool)
+    unknownLine = do
+      (b, l) <- getInput
+      let (unkBStr, l', b') = spanLine l b
+          unkTok = T.Token { tkType = TkUnknown
+                           , tkValue = unkBStr
+                           , tkSpan = mkRealSrcSpan l l' }
+      setInput (b', l')
+      pure ([unkTok], False)
+
+
+-- | This is really, really, /really/ gross. Problem: consider a Haskell
+-- file that looks like:
 --
---- * CPP lines are removed and reinserted as line-comments
---- * top-level file pragmas are parsed as block comments (see the
----   'ITblockComment' case of 'classify' for more details)
+-- @
+-- {-# LANGUAGE CPP #-}
+-- module SomeMod where
 --
-processCPP :: DynFlags   -- ^ GHC's flags
-           -> FilePath   -- ^ source file name (for position information)
-           -> String     -- ^ source file contents
-           -> [(Located L.Token, String)]
-processCPP dflags fpath s = addSrc . go start . splitCPP $ s
-  where
-    start = mkRealSrcLoc (mkFastString fpath) 1 1
-    addSrc = addSourceToTokens start (stringToStringBuffer s)
-
-    -- Transform a list of Haskell/CPP lines into a list of tokens
-    go :: RealSrcLoc -> [Either String String] -> [Located L.Token]
-    go _ [] = []
-    go pos ls =
-      let (hLinesRight, ls') = span isRight ls
-          (cppLinesLeft, rest) = span isLeft ls'
-
-          hSrc = concat [ hLine | Right hLine <- hLinesRight ]
-          cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ]
-
-      in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of
-
-           -- Stuff that fails to lex gets turned into comments
-           L.PFailed _ _ss _msg ->
-             let (src_pos, failed) = mkToken ITunknown pos hSrc
-                 (new_pos, cpp) = mkToken ITlineComment src_pos cppSrc
-             in failed : cpp : go new_pos rest
-
-           -- Successfully lexed
-           L.POk ss toks ->
-             let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc
-             in toks ++ [cpp] ++ go new_pos rest
-
-    -- Manually make a token from a 'String', advancing the cursor position
-    mkToken tok start' str =
-      let end = foldl' advanceSrcLoc start' str
-      in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str))
-
-
--- | Split apart the initial file into CPP lines ('Left' entries) and
--- Haskell source lines ('Right' entries).
+-- #define SIX 6
+--
+-- {-# INLINE foo
+-- #-}
+-- foo = 1
+-- @
 --
--- All characters in the input are present in the output:
+-- Clang's CPP replaces the @#define SIX 6@ line with an empty line (as it
+-- should), but gets confused about @#-}@. I'm guessing it /starts/ by
+-- parsing that as a pre-processor directive and, when that fails, just
+-- leaves the line alone. HOWEVER, it still adds an extra newline. =.=
 --
--- prop> concat . map (either id id) . splitCPP = id
-splitCPP :: String -> [Either String String]
-splitCPP "" = []
-splitCPP s | isCPPline s = Left l : splitCPP rest
-           | otherwise   = Right l : splitCPP rest
+-- This function makes sure that the Hyperlinker backend also adds that
+-- extra newline (or else our spans won't line up with GHC's anymore).
+needPragHack :: CompilerInfo -> DynFlags -> Bool
+needPragHack comp dflags = isCcClang && E.member Cpp (extensionFlags dflags)
   where
-    ~(l, rest) = spanToNewline 0 s
+    isCcClang = case comp of
+      GCC          -> False
+      Clang        -> True
+      AppleClang   -> True
+      AppleClang51 -> True
+      UnknownCC    -> False
 
+-- | Get the input
+getInput :: P (StringBuffer, RealSrcLoc)
+getInput = P $ \p@PState { buffer = buf, loc = srcLoc } -> POk p (buf, srcLoc)
 
--- | Heuristic to decide if a line is going to be a CPP line. This should be a
--- cheap operation since it is going to be run on every line being processed.
---
--- Right now it just checks if the first non-whitespace character in the first
--- five characters of the line is a '#':
---
--- >>> isCPPline "#define FOO 1"
--- True
---
--- >>> isCPPline "\t\t  #ifdef GHC"
--- True
---
--- >>> isCPPline "       #endif"
--- False
-isCPPline :: String -> Bool
-isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5
+-- | Set the input
+setInput :: (StringBuffer, RealSrcLoc) -> P ()
+setInput (buf, srcLoc) = P $ \p -> POk (p { buffer = buf, loc = srcLoc }) ()
 
--- | Split a "line" off the front of a string, hopefully without cutting tokens
--- in half. I say "hopefully" because knowing what a token is requires lexing,
--- yet lexing depends on this function.
---
--- All characters in the input are present in the output:
---
--- prop> uncurry (++) . spanToNewline 0 = id
-spanToNewline :: Int     -- ^ open '{-'
-              -> String  -- ^ input
-              -> (String, String)
-
--- Base case and space characters
-spanToNewline _ "" = ("", "")
-spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
-spanToNewline n ('\\':'\n':str) =
-  let (str', rest) = spanToNewline n str
-  in ('\\':'\n':str', rest)
-
--- Block comments
-spanToNewline n ('{':'-':str) =
-  let (str', rest) = spanToNewline (n+1) str
-  in ('{':'-':str', rest)
-spanToNewline n ('-':'}':str) =
-  let (str', rest) = spanToNewline (n-1) str
-  in ('-':'}':str', rest)
-
--- When not in a block comment, try to lex a Haskell token
-spanToNewline 0 str@(c:_) | ((lexed, str') : _) <- R.lex str, not (isSpace c) =
-  if all (== '-') lexed && length lexed >= 2
-    -- A Haskell line comment
-    then case span (/= '\n') str' of
-           (str'', '\n':rest) -> (lexed ++ str'' ++ "\n", rest)
-           (_, _) -> (str, "")
-
-    -- An actual Haskell token
-    else let (str'', rest) = spanToNewline 0 str'
-         in (lexed ++ str'', rest)
-
--- In all other cases, advance one character at a time
-spanToNewline n (c:str) =
-  let (str', rest) = spanToNewline n str
-  in (c:str', rest)
-
-
--- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of
--- Haddock's 'T.Token'.
-ghcToks :: [(Located L.Token, String)] -> [T.Token]
-ghcToks = reverse . (\(_,ts,_) -> ts) . foldl' go (start, [], False)
-  where
-    start = mkRealSrcLoc (mkFastString "lexing") 1 1
-
-    go :: (RealSrcLoc, [T.Token], Bool)
-       -- ^ current position, tokens accumulated, currently in pragma (or not)
-       -> (Located L.Token, String)
-       -- ^ next token, its content
-       -> (RealSrcLoc, [T.Token], Bool)
-       -- ^ new position, new tokens accumulated, currently in pragma (or not)
-    go (pos, toks, in_prag) (L l tok, raw) =
-      ( next_pos
-      , classifiedTok ++ maybeToList white ++ toks
-      , inPragma in_prag tok
-      )
-      where
-        (next_pos, white) = mkWhitespace pos l
-
-        classifiedTok = [ Token (classify' tok) raw rss
-                        | RealSrcSpan rss <- [l]
-                        , not (null raw)
-                        ]
-
-        classify' | in_prag = const TkPragma
-                  | otherwise = classify
-
-
--- | Find the correct amount of whitespace between tokens.
-mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token)
-mkWhitespace prev spn =
-  case spn of
-    UnhelpfulSpan _ -> (prev, Nothing)
-    RealSrcSpan s | null wsstring -> (end, Nothing)
-                  | otherwise -> (end, Just (Token TkSpace wsstring wsspan))
-      where
-        start = realSrcSpanStart s
-        end = realSrcSpanEnd s
-        wsspan = mkRealSrcSpan prev start
-        nls = srcLocLine start - srcLocLine prev
-        spaces = if nls == 0 then srcLocCol start - srcLocCol prev
-                 else srcLocCol start - 1
-        wsstring = replicate nls '\n' ++ replicate spaces ' '
+-- | Orphan instance that adds backtracking to 'P'
+instance Alternative P where
+  empty = P $ \_ -> PFailed (const emptyMessages) noSrcSpan "Alternative.empty"
+  P x <|> P y = P $ \s -> case x s of { p@POk{} -> p
+                                      ; _ -> y s }
 
+-- | Try a parser. If it fails, backtrack and return the pure value.
+tryOrElse :: a -> P a -> P a
+tryOrElse x p = p <|> pure x
 
 -- | Classify given tokens as appropriate Haskell token type.
-classify :: L.Token -> TokenType
+classify :: Lexer.Token -> TokenType
 classify tok =
   case tok of
     ITas -> TkKeyword
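The orphan `Alternative P` instance above is what gives the new lexer its
per-line error recovery: `<|>` reruns the second parser on the untouched
state whenever the first fails. The pattern is easiest to see on a toy
state-threading parser; this is an illustration only (GHC's real `P` is
continuation-based, and all names here are hypothetical):

```haskell
import Control.Applicative (Alternative (..))

-- A toy state-threading parser in the spirit of GHC's 'P'.
newtype Parser s a = Parser { runParser :: s -> Maybe (s, a) }

instance Functor (Parser s) where
  fmap f (Parser p) = Parser $ \s -> fmap (fmap f) (p s)

instance Applicative (Parser s) where
  pure x = Parser $ \s -> Just (s, x)
  Parser pf <*> Parser px = Parser $ \s ->
    case pf s of
      Nothing      -> Nothing
      Just (s', f) -> fmap (fmap f) (px s')

instance Alternative (Parser s) where
  empty = Parser $ \_ -> Nothing
  -- Backtracking: the right branch runs on the *original* state @s@,
  -- so a failed left branch leaves no trace.
  Parser x <|> Parser y = Parser $ \s ->
    case x s of
      ok@Just{} -> ok
      Nothing   -> y s

-- Same shape as 'tryOrElse' in the patch: try @p@; on failure,
-- backtrack and return the fallback value.
tryOrElse :: a -> Parser s a -> Parser s a
tryOrElse x p = p <|> pure x
```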
@@ -382,12 +381,7 @@ classify tok =
     ITunknown {} -> TkUnknown
     ITeof -> TkUnknown
 
-    -- Line comments are only supposed to start with '--'. Starting with '#'
-    -- means that this was probably a CPP.
-    ITlineComment s
-      | isCPPline s -> TkCpp
-      | otherwise -> TkComment
-
+    ITlineComment {} -> TkComment
     ITdocCommentNext {} -> TkComment
     ITdocCommentPrev {} -> TkComment
     ITdocCommentNamed {} -> TkComment
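This hunk is the payoff of recognizing CPP up front: since `parseCppLine`
consumes directives before the GHC lexer ever runs, a surviving
`ITlineComment` really is a comment, and the old `isCPPline` sniffing can go.
For reference, the first-non-blank-character heuristic looks like this over
`ByteString` (a toy restatement; the real `tryCppLine` used by `parseCppLine`
also returns the consumed text and the updated position):

```haskell
import qualified Data.ByteString.Char8 as BSC

-- A CPP directive line starts with '#' after optional indentation.
looksLikeCpp :: BSC.ByteString -> Bool
looksLikeCpp line =
  case BSC.uncons (BSC.dropWhile (`elem` " \t") line) of
    Just ('#', _) -> True
    _             -> False

-- >>> looksLikeCpp (BSC.pack "#define FOO 1")
-- True
-- >>> looksLikeCpp (BSC.pack "  foo = 1")
-- False
```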
@@ -404,9 +398,9 @@ classify tok =
       | otherwise -> TkComment
 
 -- | Classify given tokens as beginning pragmas (or not).
-inPragma :: Bool     -- ^ currently in pragma
-         -> L.Token  -- ^ current token
-         -> Bool     -- ^ new information about whether we are in a pragma
+inPragma :: Bool         -- ^ currently in pragma
+         -> Lexer.Token  -- ^ current token
+         -> Bool         -- ^ new information about whether we are in a pragma
 inPragma _ ITclose_prag = False
 inPragma True _ = True
 inPragma False tok =
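The `inPragma` flag is folded over the token stream by `parse`'s `go` loop
(and previously by `ghcToks`), so that everything between `{-#` and `#-}`
renders as `TkPragma`. A self-contained toy showing how the flag threads,
with a stand-in token type collapsing GHC's various pragma openers:

```haskell
-- Stand-ins for GHC's pragma-opening tokens and ITclose_prag.
data Tok = TOpenPrag | TClosePrag | TOther deriving Show

inPragma :: Bool -> Tok -> Bool
inPragma _     TClosePrag = False  -- a closer always ends the pragma
inPragma True  _          = True   -- stay inside until it closes
inPragma False TOpenPrag  = True   -- an opener starts one
inPragma False TOther     = False

-- The state before each token, threaded left to right.
pragmaStates :: [Tok] -> [Bool]
pragmaStates = scanl inPragma False

-- >>> pragmaStates [TOther, TOpenPrag, TOther, TClosePrag, TOther]
-- [False,False,True,True,False,False]
```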