aboutsummaryrefslogblamecommitdiff
path: root/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
blob: acb2c892eeb676dc4593302fd09bc7aeb344cb07 (plain) (tree)
1
2
3
4
5
6
7
8
                                                        

                                                             
                                      
 




                                                          
 
                                              
 
 



                                                                             



                                                                                





                                                             
 



                                                                   
  



                                                                          
       
                                                             
 




                                                                   
 
                                                                    
 
                                                                        
 
                                                               
                                  

                                                                         
 
                                

                                                                         
 






                                                                                
  
                                                         
  



                                                    
       
                                  
 

                                                                              
  
                                                                              
  











                                                              

                                                                               





                                                         


                                                 


                                                 
                 
                                              
                           
                                              
                           












                                                                               






                                                                              
                                                                    
       
                                                    












                                                                              
                                               
                                                        

                                                 


                                             
















                                                                            
                                                             

























                                       
                                       
                                       
                                       
































                                       
                                      
                                      









                                      
















                                      


                                        
                                        
























































                                          
                                       































                                                                              















                                                                          
                                  








                                  







                                   
module Haddock.Backends.Hyperlinker.Parser (parse) where

import Data.Either         ( isRight, isLeft )
import Data.List           ( foldl', isPrefixOf, isSuffixOf )
import Data.Maybe          ( maybeToList )
import Data.Char           ( isSpace )
import qualified Text.Read as R

import GHC                 ( DynFlags, addSourceToTokens )
import SrcLoc
import FastString          ( mkFastString )
import StringBuffer        ( stringToStringBuffer )
import Lexer               ( Token(..) )
import qualified Lexer as L

import Haddock.Backends.Hyperlinker.Types as T


-- | Turn source code string into a stream of more descriptive tokens.
--
-- Result should retain original file layout (including comments, whitespace,
-- etc.), i.e. the following "law" should hold:
--
-- prop> concat . map tkValue . parse = id
--
-- (In reality, this only holds for input not containing '\r', '\t', '\f', '\v',
-- characters, since GHC transforms those into ' ' and '\n')
parse :: DynFlags -> FilePath -> String -> [T.Token]
parse dflags fp = ghcToks . processCPP dflags fp . filterCRLF
  where
    -- Remove CRLFs from source
    filterCRLF :: String -> String
    filterCRLF ('\r':'\n':cs) = '\n' : filterCRLF cs
    filterCRLF (c:cs) = c : filterCRLF cs
    filterCRLF [] = []

-- | Parse the source into tokens using the GHC lexer.
--
--   * CPP lines are removed and reinserted as line-comments
--   * top-level file pragmas are parsed as block comments (see the
--     'ITblockComment' case of 'classify' for more details)
--
processCPP :: DynFlags    -- ^ GHC's flags
           -> FilePath    -- ^ source file name (for position information)
           -> String      -- ^ source file contents
           -> [(Located L.Token, String)]
processCPP dflags fpath s = addSrc . go start . splitCPP $ s
  where
    start = mkRealSrcLoc (mkFastString fpath) 1 1
    addSrc = addSourceToTokens start (stringToStringBuffer s)

    -- Transform a list of Haskell/CPP lines into a list of tokens
    go :: RealSrcLoc -> [Either String String] -> [Located L.Token]
    go _   [] = []
    go pos ls =
      let (hLinesRight,  ls')  = span isRight ls
          (cppLinesLeft, rest) = span isLeft ls'

          hSrc   = concat [ hLine   | Right hLine  <- hLinesRight  ]
          cppSrc = concat [ cppLine | Left cppLine <- cppLinesLeft ]

      in case L.lexTokenStream (stringToStringBuffer hSrc) pos dflags of

           -- Stuff that fails to lex gets turned into comments
           L.PFailed _ _ss _msg ->
             let (src_pos, failed) = mkToken ITunknown pos hSrc
                 (new_pos, cpp)    = mkToken ITlineComment src_pos cppSrc
             in failed : cpp : go new_pos rest

           -- Successfully lexed
           L.POk ss toks ->
             let (new_pos, cpp) = mkToken ITlineComment (L.loc ss) cppSrc
             in toks ++ [cpp] ++ go new_pos rest

    -- Manually make a token from a 'String', advancing the cursor position
    mkToken tok start' str =
      let end = foldl' advanceSrcLoc start' str
      in (end, L (RealSrcSpan $ mkRealSrcSpan start' end) (tok str))


-- | Split apart the initial file into Haskell source lines ('Left' entries) and
-- CPP lines ('Right' entries).
--
-- All characters in the input are present in the output:
--
-- prop> concat . map (either id id) . splitCPP = id
splitCPP :: String -> [Either String String]
splitCPP "" = []
splitCPP s | isCPPline s = Left l : splitCPP rest
           | otherwise =  Right l : splitCPP rest
  where
    ~(l, rest) = spanToNewline 0 s


-- | Heuristic to decide if a line is going to be a CPP line. This should be a
-- cheap operation since it is going to be run on every line being processed.
--
-- Right now it just checks if the first non-whitespace character in the first
-- five characters of the line is a '#':
--
-- >>> isCPPline "#define FOO 1"
-- True
--
-- >>> isCPPline "\t\t  #ifdef GHC"
-- True
--
-- >>> isCPPline "       #endif"
-- False
--
isCPPline :: String -> Bool
isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5


-- | Split a "line" off the front of a string, hopefully without cutting tokens
-- in half. I say "hopefully" because knowing what a token is requires lexing,
-- yet lexing depends on this function.
--
-- All characters in the input are present in the output:
--
-- prop> curry (++) . spanToNewLine 0 = id
spanToNewline :: Int                 -- ^ open '{-'
              -> String              -- ^ input
              -> (String, String)

-- Base case and space characters
spanToNewline _ "" = ("", "")
spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
spanToNewline n ('\\':'\n':str) =
    let (str', rest) = spanToNewline n str
    in ('\\':'\n':str', rest)

-- Block comments
spanToNewline n ('{':'-':str) =
    let (str', rest) = spanToNewline (n+1) str
    in ('{':'-':str', rest)
spanToNewline n ('-':'}':str) =
    let (str', rest) = spanToNewline (n-1) str
    in ('-':'}':str', rest)

-- When not in a block comment, try to lex a Haskell token
spanToNewline 0 str@(c:_) | ((lexed, str') : _) <- R.lex str, not (isSpace c) =
    if all (== '-') lexed && length lexed >= 2
      -- A Haskell line comment
      then case span (/= '\n') str' of
             (str'', '\n':rest) -> (lexed ++ str'' ++ "\n", rest)
             (_, _) -> (str, "") 

      -- An actual Haskell token
      else let (str'', rest) = spanToNewline 0 str'
           in (lexed ++ str'', rest)

-- In all other cases, advance one character at a time
spanToNewline n (c:str) =
    let (str', rest) = spanToNewline n str
    in (c:str', rest)


-- | Turn a list of GHC's 'L.Token' (and their source 'String') into a list of
-- Haddock's 'T.Token'.
ghcToks :: [(Located L.Token, String)] -> [T.Token]
ghcToks = reverse . (\(_,ts,_) -> ts) . foldl' go (start, [], False)
  where
    start = mkRealSrcLoc (mkFastString "lexing") 1 1

    go :: (RealSrcLoc, [T.Token], Bool)
       -- ^ current position, tokens accumulated, currently in pragma (or not)
       
       -> (Located L.Token, String)
       -- ^ next token, its content
       
       -> (RealSrcLoc, [T.Token], Bool)
       -- ^ new position, new tokens accumulated, currently in pragma (or not)

    go (pos, toks, in_prag) (L l tok, raw) =
        ( next_pos
        , classifiedTok ++ maybeToList white ++ toks
        , inPragma in_prag tok
        )
       where
         (next_pos, white) = mkWhitespace pos l
         
         classifiedTok = [ Token (classify' tok) raw rss
                         | RealSrcSpan rss <- [l]
                         , not (null raw)
                         ]
         
         classify' | in_prag = const TkPragma
                   | otherwise = classify


-- | Find the correct amount of whitespace between tokens.
mkWhitespace :: RealSrcLoc -> SrcSpan -> (RealSrcLoc, Maybe T.Token)
mkWhitespace prev spn =
  case spn of
    UnhelpfulSpan _ -> (prev,Nothing)
    RealSrcSpan s | null wsstring -> (end, Nothing)
                  | otherwise -> (end, Just (Token TkSpace wsstring wsspan))
      where
        start = realSrcSpanStart s
        end = realSrcSpanEnd s
        wsspan = mkRealSrcSpan prev start
        nls = srcLocLine start - srcLocLine prev
        spaces = if nls == 0 then srcLocCol start - srcLocCol prev
                             else srcLocCol start - 1
        wsstring = replicate nls '\n' ++ replicate spaces ' '


-- | Classify given tokens as appropriate Haskell token type.
classify :: L.Token -> TokenType
classify tok =
  case tok of
    ITas                   -> TkKeyword
    ITcase                 -> TkKeyword
    ITclass                -> TkKeyword
    ITdata                 -> TkKeyword
    ITdefault              -> TkKeyword
    ITderiving             -> TkKeyword
    ITdo                   -> TkKeyword
    ITelse                 -> TkKeyword
    IThiding               -> TkKeyword
    ITforeign              -> TkKeyword
    ITif                   -> TkKeyword
    ITimport               -> TkKeyword
    ITin                   -> TkKeyword
    ITinfix                -> TkKeyword
    ITinfixl               -> TkKeyword
    ITinfixr               -> TkKeyword
    ITinstance             -> TkKeyword
    ITlet                  -> TkKeyword
    ITmodule               -> TkKeyword
    ITnewtype              -> TkKeyword
    ITof                   -> TkKeyword
    ITqualified            -> TkKeyword
    ITthen                 -> TkKeyword
    ITtype                 -> TkKeyword
    ITvia                  -> TkKeyword
    ITwhere                -> TkKeyword

    ITforall            {} -> TkKeyword
    ITexport               -> TkKeyword
    ITlabel                -> TkKeyword
    ITdynamic              -> TkKeyword
    ITsafe                 -> TkKeyword
    ITinterruptible        -> TkKeyword
    ITunsafe               -> TkKeyword
    ITstdcallconv          -> TkKeyword
    ITccallconv            -> TkKeyword
    ITcapiconv             -> TkKeyword
    ITprimcallconv         -> TkKeyword
    ITjavascriptcallconv   -> TkKeyword
    ITmdo                  -> TkKeyword
    ITfamily               -> TkKeyword
    ITrole                 -> TkKeyword
    ITgroup                -> TkKeyword
    ITby                   -> TkKeyword
    ITusing                -> TkKeyword
    ITpattern              -> TkKeyword
    ITstatic               -> TkKeyword
    ITstock                -> TkKeyword
    ITanyclass             -> TkKeyword

    ITunit                 -> TkKeyword
    ITsignature            -> TkKeyword
    ITdependency           -> TkKeyword
    ITrequires             -> TkKeyword

    ITinline_prag       {} -> TkPragma
    ITspec_prag         {} -> TkPragma
    ITspec_inline_prag  {} -> TkPragma
    ITsource_prag       {} -> TkPragma
    ITrules_prag        {} -> TkPragma
    ITwarning_prag      {} -> TkPragma
    ITdeprecated_prag   {} -> TkPragma
    ITline_prag         {} -> TkPragma
    ITcolumn_prag       {} -> TkPragma
    ITscc_prag          {} -> TkPragma
    ITgenerated_prag    {} -> TkPragma
    ITcore_prag         {} -> TkPragma
    ITunpack_prag       {} -> TkPragma
    ITnounpack_prag     {} -> TkPragma
    ITann_prag          {} -> TkPragma
    ITcomplete_prag     {} -> TkPragma
    ITclose_prag           -> TkPragma
    IToptions_prag      {} -> TkPragma
    ITinclude_prag      {} -> TkPragma
    ITlanguage_prag        -> TkPragma
    ITminimal_prag      {} -> TkPragma
    IToverlappable_prag {} -> TkPragma
    IToverlapping_prag  {} -> TkPragma
    IToverlaps_prag     {} -> TkPragma
    ITincoherent_prag   {} -> TkPragma
    ITctype             {} -> TkPragma

    ITdotdot               -> TkGlyph
    ITcolon                -> TkGlyph
    ITdcolon            {} -> TkGlyph
    ITequal                -> TkGlyph
    ITlam                  -> TkGlyph
    ITlcase                -> TkGlyph
    ITvbar                 -> TkGlyph
    ITlarrow            {} -> TkGlyph
    ITrarrow            {} -> TkGlyph
    ITat                   -> TkGlyph
    ITtilde                -> TkGlyph
    ITdarrow            {} -> TkGlyph
    ITminus                -> TkGlyph
    ITbang                 -> TkGlyph
    ITdot                  -> TkOperator
    ITstar              {} -> TkOperator
    ITtypeApp              -> TkGlyph

    ITbiglam               -> TkGlyph

    ITocurly               -> TkSpecial
    ITccurly               -> TkSpecial
    ITvocurly              -> TkSpecial
    ITvccurly              -> TkSpecial
    ITobrack               -> TkSpecial
    ITopabrack             -> TkSpecial
    ITcpabrack             -> TkSpecial
    ITcbrack               -> TkSpecial
    IToparen               -> TkSpecial
    ITcparen               -> TkSpecial
    IToubxparen            -> TkSpecial
    ITcubxparen            -> TkSpecial
    ITsemi                 -> TkSpecial
    ITcomma                -> TkSpecial
    ITunderscore           -> TkIdentifier
    ITbackquote            -> TkSpecial
    ITsimpleQuote          -> TkSpecial

    ITvarid             {} -> TkIdentifier
    ITconid             {} -> TkIdentifier
    ITvarsym            {} -> TkOperator
    ITconsym            {} -> TkOperator
    ITqvarid            {} -> TkIdentifier
    ITqconid            {} -> TkIdentifier
    ITqvarsym           {} -> TkOperator
    ITqconsym           {} -> TkOperator

    ITdupipvarid        {} -> TkUnknown
    ITlabelvarid        {} -> TkUnknown

    ITchar              {} -> TkChar
    ITstring            {} -> TkString
    ITinteger           {} -> TkNumber
    ITrational          {} -> TkNumber

    ITprimchar          {} -> TkChar
    ITprimstring        {} -> TkString
    ITprimint           {} -> TkNumber
    ITprimword          {} -> TkNumber
    ITprimfloat         {} -> TkNumber
    ITprimdouble        {} -> TkNumber

    ITopenExpQuote      {} -> TkSpecial
    ITopenPatQuote         -> TkSpecial
    ITopenDecQuote         -> TkSpecial
    ITopenTypQuote         -> TkSpecial
    ITcloseQuote        {} -> TkSpecial
    ITopenTExpQuote     {} -> TkSpecial
    ITcloseTExpQuote       -> TkSpecial
    ITidEscape          {} -> TkUnknown
    ITparenEscape          -> TkSpecial
    ITidTyEscape        {} -> TkUnknown
    ITparenTyEscape        -> TkSpecial
    ITtyQuote              -> TkSpecial
    ITquasiQuote        {} -> TkUnknown
    ITqQuasiQuote       {} -> TkUnknown

    ITproc                 -> TkKeyword
    ITrec                  -> TkKeyword
    IToparenbar         {} -> TkGlyph
    ITcparenbar         {} -> TkGlyph
    ITlarrowtail        {} -> TkGlyph
    ITrarrowtail        {} -> TkGlyph
    ITLarrowtail        {} -> TkGlyph
    ITRarrowtail        {} -> TkGlyph

    ITunknown           {} -> TkUnknown
    ITeof                  -> TkUnknown

    -- Line comments are only supposed to start with '--'. Starting with '#'
    -- means that this was probably a CPP.
    ITlineComment s
      | isCPPline s        -> TkCpp
      | otherwise          -> TkComment

    ITdocCommentNext    {} -> TkComment
    ITdocCommentPrev    {} -> TkComment
    ITdocCommentNamed   {} -> TkComment
    ITdocSection        {} -> TkComment
    ITdocOptions        {} -> TkComment

    -- The lexer considers top-level pragmas as comments (see `pragState` in
    -- the GHC lexer for more), so we have to manually reverse this. The
    -- following is a hammer: it smashes _all_ pragma-like block comments into
    -- pragmas.
    ITblockComment c
      | isPrefixOf "{-#" c
      , isSuffixOf "#-}" c -> TkPragma
      | otherwise          -> TkComment

-- | Classify given tokens as beginning pragmas (or not).
inPragma :: Bool     -- ^ currently in pragma
         -> L.Token  -- ^ current token
         -> Bool     -- ^ new information about whether we are in a pragma
inPragma _ ITclose_prag = False
inPragma True _ = True
inPragma False tok =
  case tok of
    ITinline_prag       {} -> True
    ITspec_prag         {} -> True
    ITspec_inline_prag  {} -> True
    ITsource_prag       {} -> True
    ITrules_prag        {} -> True
    ITwarning_prag      {} -> True
    ITdeprecated_prag   {} -> True
    ITline_prag         {} -> True
    ITcolumn_prag       {} -> True
    ITscc_prag          {} -> True
    ITgenerated_prag    {} -> True
    ITcore_prag         {} -> True
    ITunpack_prag       {} -> True
    ITnounpack_prag     {} -> True
    ITann_prag          {} -> True
    ITcomplete_prag     {} -> True
    IToptions_prag      {} -> True
    ITinclude_prag      {} -> True
    ITlanguage_prag        -> True
    ITminimal_prag      {} -> True
    IToverlappable_prag {} -> True
    IToverlapping_prag  {} -> True
    IToverlaps_prag     {} -> True
    ITincoherent_prag   {} -> True
    ITctype             {} -> True

    _                      -> False