From 4f75be94f45a0e92553eccefe56230c554333ce7 Mon Sep 17 00:00:00 2001 From: Alec Theriault Date: Sun, 10 Dec 2017 12:22:21 -0800 Subject: Use the GHC lexer for the Hyperlinker backend (#714) * Start changing to use GHC lexer * better cpp * Change SrcSpan to RealSrcSpan * Remove error * Try to stop too many open files * wip * wip * Revert "wip" This reverts commit b605510a195f26315e3d8ca90e6d95a6737553e1. Conflicts: haddock-api/haddock-api.cabal haddock-api/src/Haddock/Interface.hs * Remove pointless 'caching' * Use dlist rather than lists when finding vars * Use a map rather than list * Delete bogus comment * Rebase followup Things now run using the GHC lexer. There are still - stray debug statements - unnecessary changes w.r.t. master * Cleaned up differences w.r.t. current Haddock HEAD Things are looking good. quasiquotes in particular look beautiful: the TH ones (with Haskell source inside) colour/link their contents too! Haven't yet begun to check for possible performance problems. * Support CPP and top-level pragmas The support for these is hackier - but no more hacky than the existing support. * Tests pass, CPP is better recognized The tests were in some cases altered: I consider the new output to be more correct than the old one.... * Fix shrinking of source without tabs in test * Replace 'Position'/'Span' with GHC counterparts Replaces 'Position' -> 'GHC.RealSrcLoc' and 'Span' -> 'GHC.RealSrcSpan'. * Nits * Forgot entry in .cabal * Update changelog --- .../src/Haddock/Backends/Hyperlinker/Parser.hs | 534 +++++++++++++-------- 1 file changed, 344 insertions(+), 190 deletions(-) (limited to 'haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs') diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs index e4345602..da1555b8 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs @@ -1,11 +1,17 @@ module Haddock.Backends.Hyperlinker.Parser (parse) where +import Data.Either ( isRight, isLeft ) +import Data.List ( foldl', isPrefixOf, isSuffixOf ) +import Data.Maybe ( maybeToList ) -import Data.Char -import Data.List -import Data.Maybe +import GHC ( DynFlags, addSourceToTokens ) +import SrcLoc +import FastString ( mkFastString ) +import StringBuffer ( stringToStringBuffer ) +import Lexer ( Token(..) ) +import qualified Lexer as L -import Haddock.Backends.Hyperlinker.Types +import Haddock.Backends.Hyperlinker.Types as T -- | Turn source code string into a stream of more descriptive tokens. @@ -13,202 +19,350 @@ import Haddock.Backends.Hyperlinker.Types -- Result should retain original file layout (including comments, whitespace, -- etc.), i.e. the following "law" should hold: -- --- @concat . map 'tkValue' . 'parse' = id@ -parse :: String -> [Token] -parse = tokenize . tag . chunk +-- prop> concat . map tkValue . parse = id +-- +-- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v', +-- characters, since GHC transforms those into ' ' and '\n') +parse :: DynFlags -> FilePath -> String -> [T.Token] +parse dflags fp s = ghcToks (processCPP dflags fp s) + --- | Split raw source string to more meaningful chunks. +-- | Parse the source into tokens using the GHC lexer. +-- +-- * CPP lines are removed and reinserted as line-comments +-- * top-level file pragmas are parsed as block comments (see the +-- 'ITblockComment' case of 'classify' for more details) -- --- This is the initial stage of tokenization process. 
Each chunk is either --- a comment (including comment delimiters), a whitespace string, preprocessor --- macro (and all its content until the end of a line) or valid Haskell lexeme. -chunk :: String -> [String] -chunk [] = [] -chunk str@(c:_) - | isSpace c = - let (space, mcpp, rest) = spanSpaceOrCpp str - in [space] ++ maybeToList mcpp ++ chunk rest -chunk str - | "--" `isPrefixOf` str = chunk' $ spanToNewline str - | "{-" `isPrefixOf` str = chunk' $ chunkComment 0 str - | otherwise = case lex' str of - (tok:_) -> chunk' tok - [] -> [str] +processCPP :: DynFlags -- ^ GHC's flags + -> FilePath -- ^ source file name (for position information) + -> String -- ^ source file contents + -> [(Located L.Token, String)] +processCPP dflags fpath s = addSrc . go start . splitCPP $ s where - chunk' (c, rest) = c:(chunk rest) + start = mkRealSrcLoc (mkFastString fpath) 1 1 + addSrc = addSourceToTokens start (stringToStringBuffer s) --- | A bit better lexer then the default, i.e. handles DataKinds quotes -lex' :: ReadS String -lex' ('\'' : '\'' : rest) = [("''", rest)] -lex' str@('\'' : '\\' : _ : '\'' : _) = lex str -lex' str@('\'' : _ : '\'' : _) = lex str -lex' ('\'' : rest) = [("'", rest)] -lex' str = lex str + -- Transform a list of Haskell/CPP lines into a list of tokens + go :: RealSrcLoc -> [Either String String] -> [Located L.Token] + go _ [] = [] + go pos ls = + let (hLinesRight, ls') = span isRight ls + (cppLinesLeft, rest) = span isLeft ls' + + hSrc = concat [ hLine | Right hLine <- hLinesRight ] + cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ] + + in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of --- | Split input to "first line" string and the rest of it. --- --- Ideally, this should be done simply with @'break' (== '\n')@. However, --- Haskell also allows line-unbreaking (or whatever it is called) so things --- are not as simple and this function deals with that. -spanToNewline :: String -> (String, String) -spanToNewline [] = ([], []) -spanToNewline ('\\':'\n':str) = - let (str', rest) = spanToNewline str - in ('\\':'\n':str', rest) -spanToNewline str@('\n':_) = ("", str) -spanToNewline (c:str) = - let (str', rest) = spanToNewline str - in (c:str', rest) + -- Stuff that fails to lex gets turned into comments + L.PFailed _ss _msg -> + let (src_pos, failed) = mkToken ITunknown pos hSrc + (new_pos, cpp) = mkToken ITlineComment src_pos cppSrc + in failed : cpp : go new_pos rest --- | Split input to whitespace string, (optional) preprocessor directive and --- the rest of it. + -- Successfully lexed + L.POk ss toks -> + let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc + in toks ++ [cpp] ++ go new_pos rest + + -- Manually make a token from a 'String', advancing the cursor position + mkToken tok start' str = + let end = foldl' advanceSrcLoc start' str + in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str)) + + +-- | Split apart the initial file into Haskell source lines ('Left' entries) and +-- CPP lines ('Right' entries). -- --- Again, using something like @'span' 'isSpace'@ would be nice to chunk input --- to whitespace. The problem is with /#/ symbol - if it is placed at the very --- beginning of a line, it should be recognized as preprocessor macro. In any --- other case, it is ordinary Haskell symbol and can be used to declare --- operators. 
Hence, while dealing with whitespace we also check whether there --- happens to be /#/ symbol just after a newline character - if that is the --- case, we begin treating the whole line as preprocessor macro. -spanSpaceOrCpp :: String -> (String, Maybe String, String) -spanSpaceOrCpp ('\n':'#':str) = - let (str', rest) = spanToNewline str - in ("\n", Just $ '#':str', rest) -spanSpaceOrCpp (c:str') - | isSpace c = - let (space, mcpp, rest) = spanSpaceOrCpp str' - in (c:space, mcpp, rest) -spanSpaceOrCpp str = ("", Nothing, str) - --- | Split input to comment content (including delimiters) and the rest. +-- All characters in the input are present in the output: -- --- Again, some more logic than simple 'span' is required because of Haskell --- comment nesting policy. -chunkComment :: Int -> String -> (String, String) -chunkComment _ [] = ("", "") -chunkComment depth ('{':'-':str) = - let (c, rest) = chunkComment (depth + 1) str - in ("{-" ++ c, rest) -chunkComment depth ('-':'}':str) - | depth == 1 = ("-}", str) - | otherwise = - let (c, rest) = chunkComment (depth - 1) str - in ("-}" ++ c, rest) -chunkComment depth (e:str) = - let (c, rest) = chunkComment depth str - in (e:c, rest) - --- | Assign source location for each chunk in given stream. -tag :: [String] -> [(Span, String)] -tag = - reverse . snd . foldl aux (Position 1 1, []) - where - aux (pos, cs) str = - let pos' = foldl move pos str - in (pos', (Span pos pos', str):cs) - move pos '\n' = pos { posRow = posRow pos + 1, posCol = 1 } - move pos _ = pos { posCol = posCol pos + 1 } - --- | Turn unrecognised chunk stream to more descriptive token stream. -tokenize :: [(Span, String)] -> [Token] -tokenize = - map aux +-- prop> concat . map (either id id) . splitCPP = id +splitCPP :: String -> [Either String String] +splitCPP "" = [] +splitCPP s | isCPPline s = Left l : splitCPP rest + | otherwise = Right l : splitCPP rest where - aux (sp, str) = Token - { tkType = classify str - , tkValue = str - , tkSpan = sp - } + ~(l, rest) = spanToNewline 0 s --- | Classify given string as appropriate Haskell token. + +-- | Heuristic to decide if a line is going to be a CPP line. This should be a +-- cheap operation since it is going to be run on every line being processed. -- --- This method is based on Haskell 98 Report lexical structure description: --- https://www.haskell.org/onlinereport/lexemes.html +-- Right now it just checks if the first non-whitespace character in the first +-- five characters of the line is a '#': -- --- However, this is probably far from being perfect and most probably does not --- handle correctly all corner cases. 
-classify :: String -> TokenType -classify str - | "--" `isPrefixOf` str = TkComment - | "{-#" `isPrefixOf` str = TkPragma - | "{-" `isPrefixOf` str = TkComment -classify "''" = TkSpecial -classify "'" = TkSpecial -classify str@(c:_) - | isSpace c = TkSpace - | isDigit c = TkNumber - | c `elem` special = TkSpecial - | str `elem` glyphs = TkGlyph - | all (`elem` symbols) str = TkOperator - | c == '#' = TkCpp - | c == '"' = TkString - | c == '\'' = TkChar -classify str - | str `elem` keywords = TkKeyword - | isIdentifier str = TkIdentifier - | otherwise = TkUnknown - -keywords :: [String] -keywords = - [ "as" - , "case" - , "class" - , "data" - , "default" - , "deriving" - , "do" - , "else" - , "hiding" - , "if" - , "import" - , "in" - , "infix" - , "infixl" - , "infixr" - , "instance" - , "let" - , "module" - , "newtype" - , "of" - , "qualified" - , "then" - , "type" - , "where" - , "forall" - , "family" - , "mdo" - ] - -glyphs :: [String] -glyphs = - [ ".." - , ":" - , "::" - , "=" - , "\\" - , "|" - , "<-" - , "->" - , "@" - , "~" - , "~#" - , "=>" - , "-" - , "!" - ] - -special :: [Char] -special = "()[]{},;`" - --- TODO: Add support for any Unicode symbol or punctuation. --- source: http://stackoverflow.com/questions/10548170/what-characters-are-permitted-for-haskell-operators -symbols :: [Char] -symbols = "!#$%&*+./<=>?@\\^|-~:" - -isIdentifier :: String -> Bool -isIdentifier (s:str) - | (isLower' s || isUpper s) && all isAlphaNum' str = True +-- >>> isCPPline "#define FOO 1" +-- True +-- +-- >>> isCPPline "\t\t #ifdef GHC" +-- True +-- +-- >>> isCPPline " #endif" +-- False +-- +isCPPline :: String -> Bool +isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5 + + +-- | Split a "line" off the front of a string, supporting newline escapes. +-- +-- By "line", we understand: the shortest substring ending in a '\n' that is not +-- +-- 1. immediately preceded by a '\\' +-- 2. not inside some (possibly nested) block comment +-- +-- All characters in the input are present in the output: +-- +-- prop> curry (++) . spanToNewLine 0 = id +spanToNewline :: Int -- ^ open '{-' + -> String -- ^ input + -> (String, String) +spanToNewline _ [] = ([], []) +spanToNewline n ('\n':str) | n <= 0 = ("\n", str) +spanToNewline n ('\\':'\n':str) = + let (str', rest) = spanToNewline n str + in ('\\':'\n':str', rest) +spanToNewline n ('{':'-':str) = + let (str', rest) = spanToNewline (n+1) str + in ('{':'-':str', rest) +spanToNewline n ('-':'}':str) = + let (str', rest) = spanToNewline (n-1) str + in ('-':'}':str', rest) +spanToNewline n (c:str) = + let (str', rest) = spanToNewline n str + in (c:str', rest) + + +-- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of +-- Haddock's 'T.Token'. +ghcToks :: [(Located L.Token, String)] -> [T.Token] +ghcToks = reverse . snd . foldl' go (start, []) where - isLower' c = isLower c || c == '_' - isAlphaNum' c = isAlphaNum c || c == '_' || c == '\'' -isIdentifier _ = False + start = mkRealSrcLoc (mkFastString "lexing") 1 1 + + go :: (RealSrcLoc, [T.Token]) -- ^ current position, tokens accumulated + -> (Located L.Token, String) -- ^ next token, its content + -> (RealSrcLoc, [T.Token]) -- ^ new position, new tokens accumulated + go (pos, toks) (L l tok, raw) = ( next_pos + , classifiedTok ++ maybeToList white ++ toks + ) + where + (next_pos, white) = mkWhitespace pos l + classifiedTok = [ Token (classify tok) raw rss + | RealSrcSpan rss <- [l] + , not (null raw) + ] + +-- | Find the correct amount of whitespace between tokens. 
+mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token) +mkWhitespace prev spn = + case spn of + UnhelpfulSpan _ -> (prev,Nothing) + RealSrcSpan s | null wsstring -> (end, Nothing) + | otherwise -> (end, Just (Token TkSpace wsstring wsspan)) + where + start = realSrcSpanStart s + end = realSrcSpanEnd s + wsspan = mkRealSrcSpan prev start + nls = srcLocLine start - srcLocLine prev + spaces = if nls == 0 then srcLocCol start - srcLocCol prev + else srcLocCol start - 1 + wsstring = replicate nls '\n' ++ replicate spaces ' ' + + +-- | Classify given string as appropriate Haskell token. +classify :: L.Token -> TokenType +classify tok = + case tok of + ITas -> TkKeyword + ITcase -> TkKeyword + ITclass -> TkKeyword + ITdata -> TkKeyword + ITdefault -> TkKeyword + ITderiving -> TkKeyword + ITdo -> TkKeyword + ITelse -> TkKeyword + IThiding -> TkKeyword + ITforeign -> TkKeyword + ITif -> TkKeyword + ITimport -> TkKeyword + ITin -> TkKeyword + ITinfix -> TkKeyword + ITinfixl -> TkKeyword + ITinfixr -> TkKeyword + ITinstance -> TkKeyword + ITlet -> TkKeyword + ITmodule -> TkKeyword + ITnewtype -> TkKeyword + ITof -> TkKeyword + ITqualified -> TkKeyword + ITthen -> TkKeyword + ITtype -> TkKeyword + ITwhere -> TkKeyword + + ITforall {} -> TkKeyword + ITexport -> TkKeyword + ITlabel -> TkKeyword + ITdynamic -> TkKeyword + ITsafe -> TkKeyword + ITinterruptible -> TkKeyword + ITunsafe -> TkKeyword + ITstdcallconv -> TkKeyword + ITccallconv -> TkKeyword + ITcapiconv -> TkKeyword + ITprimcallconv -> TkKeyword + ITjavascriptcallconv -> TkKeyword + ITmdo -> TkKeyword + ITfamily -> TkKeyword + ITrole -> TkKeyword + ITgroup -> TkKeyword + ITby -> TkKeyword + ITusing -> TkKeyword + ITpattern -> TkKeyword + ITstatic -> TkKeyword + ITstock -> TkKeyword + ITanyclass -> TkKeyword + + ITunit -> TkKeyword + ITsignature -> TkKeyword + ITdependency -> TkKeyword + ITrequires -> TkKeyword + + ITinline_prag {} -> TkPragma + ITspec_prag {} -> TkPragma + ITspec_inline_prag {} -> TkPragma + ITsource_prag {} -> TkPragma + ITrules_prag {} -> TkPragma + ITwarning_prag {} -> TkPragma + ITdeprecated_prag {} -> TkPragma + ITline_prag -> TkPragma + ITscc_prag {} -> TkPragma + ITgenerated_prag {} -> TkPragma + ITcore_prag {} -> TkPragma + ITunpack_prag {} -> TkPragma + ITnounpack_prag {} -> TkPragma + ITann_prag {} -> TkPragma + ITcomplete_prag {} -> TkPragma + ITclose_prag -> TkPragma + IToptions_prag {} -> TkPragma + ITinclude_prag {} -> TkPragma + ITlanguage_prag -> TkPragma + ITvect_prag {} -> TkPragma + ITvect_scalar_prag {} -> TkPragma + ITnovect_prag {} -> TkPragma + ITminimal_prag {} -> TkPragma + IToverlappable_prag {} -> TkPragma + IToverlapping_prag {} -> TkPragma + IToverlaps_prag {} -> TkPragma + ITincoherent_prag {} -> TkPragma + ITctype {} -> TkPragma + + ITdotdot -> TkGlyph + ITcolon -> TkGlyph + ITdcolon {} -> TkGlyph + ITequal -> TkGlyph + ITlam -> TkGlyph + ITlcase -> TkGlyph + ITvbar -> TkGlyph + ITlarrow {} -> TkGlyph + ITrarrow {} -> TkGlyph + ITat -> TkGlyph + ITtilde -> TkGlyph + ITtildehsh -> TkGlyph + ITdarrow {} -> TkGlyph + ITminus -> TkGlyph + ITbang -> TkGlyph + ITdot -> TkOperator + ITtypeApp -> TkGlyph + + ITbiglam -> TkGlyph + + ITocurly -> TkSpecial + ITccurly -> TkSpecial + ITvocurly -> TkSpecial + ITvccurly -> TkSpecial + ITobrack -> TkSpecial + ITopabrack -> TkSpecial + ITcpabrack -> TkSpecial + ITcbrack -> TkSpecial + IToparen -> TkSpecial + ITcparen -> TkSpecial + IToubxparen -> TkSpecial + ITcubxparen -> TkSpecial + ITsemi -> TkSpecial + ITcomma -> TkSpecial + ITunderscore -> 
TkIdentifier + ITbackquote -> TkSpecial + ITsimpleQuote -> TkSpecial + + ITvarid {} -> TkIdentifier + ITconid {} -> TkIdentifier + ITvarsym {} -> TkOperator + ITconsym {} -> TkOperator + ITqvarid {} -> TkIdentifier + ITqconid {} -> TkIdentifier + ITqvarsym {} -> TkOperator + ITqconsym {} -> TkOperator + + ITdupipvarid {} -> TkUnknown + ITlabelvarid {} -> TkUnknown + + ITchar {} -> TkChar + ITstring {} -> TkString + ITinteger {} -> TkNumber + ITrational {} -> TkNumber + + ITprimchar {} -> TkChar + ITprimstring {} -> TkString + ITprimint {} -> TkNumber + ITprimword {} -> TkNumber + ITprimfloat {} -> TkNumber + ITprimdouble {} -> TkNumber + + ITopenExpQuote {} -> TkSpecial + ITopenPatQuote -> TkSpecial + ITopenDecQuote -> TkSpecial + ITopenTypQuote -> TkSpecial + ITcloseQuote {} -> TkSpecial + ITopenTExpQuote {} -> TkSpecial + ITcloseTExpQuote -> TkSpecial + ITidEscape {} -> TkUnknown + ITparenEscape -> TkSpecial + ITidTyEscape {} -> TkUnknown + ITparenTyEscape -> TkSpecial + ITtyQuote -> TkSpecial + ITquasiQuote {} -> TkUnknown + ITqQuasiQuote {} -> TkUnknown + + ITproc -> TkKeyword + ITrec -> TkKeyword + IToparenbar {} -> TkGlyph + ITcparenbar {} -> TkGlyph + ITlarrowtail {} -> TkGlyph + ITrarrowtail {} -> TkGlyph + ITLarrowtail {} -> TkGlyph + ITRarrowtail {} -> TkGlyph + + ITunknown {} -> TkUnknown + ITeof -> TkUnknown + + -- Line comments are only supposed to start with '--'. Starting with '#' + -- means that this was probably a CPP. + ITlineComment s + | isCPPline s -> TkCpp + | otherwise -> TkComment + + ITdocCommentNext {} -> TkComment + ITdocCommentPrev {} -> TkComment + ITdocCommentNamed {} -> TkComment + ITdocSection {} -> TkComment + ITdocOptions {} -> TkComment + + -- The lexer considers top-level pragmas as comments (see `pragState` in + -- the GHC lexer for more), so we have to manually reverse this. The + -- following is a hammer: it smashes _all_ pragma-like block comments into + -- pragmas. + ITblockComment c + | isPrefixOf "{-#" c + , isSuffixOf "#-}" c -> TkPragma + | otherwise -> TkComment -- cgit v1.2.3
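For readers who want to try the new entry point outside of Haddock itself, here is a minimal sketch (not part of the patch). It assumes the haddock-api internals (Haddock.Backends.Hyperlinker.Parser and .Types) are importable, that the ghc-paths package supplies GHC's libdir, and that Token still exposes the tkValue record field referenced by the module header's round-trip law; "Example.hs" is a hypothetical input file.

    import GHC (getSessionDynFlags, runGhc)
    import GHC.Paths (libdir)                          -- assumption: ghc-paths is available

    import Haddock.Backends.Hyperlinker.Parser (parse)
    import Haddock.Backends.Hyperlinker.Types (Token (..))

    main :: IO ()
    main = do
      let fp = "Example.hs"                            -- hypothetical input file
      src <- readFile fp

      -- 'parse' now takes real DynFlags (the GHC lexer's behaviour depends on
      -- which extensions are enabled), so fetch them from a throwaway session.
      dflags <- runGhc (Just libdir) getSessionDynFlags

      let toks = parse dflags fp src

      -- Round-trip law from the module header: the token values concatenate
      -- back to the input (modulo GHC's normalisation of '\r', '\t', '\f', '\v').
      print (concatMap tkValue toks == src)
      putStrLn ("number of tokens: " ++ show (length toks))

Threading DynFlags through is what lets the GHC lexer honour the extensions in effect for the file being hyperlinked (quasiquotes, MagicHash, and so on), which the old hand-rolled chunker could not do.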