{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Haddock.Backends.Hyperlinker.Parser (parse) where

import Control.Applicative ( Alternative(..) )
import Data.List           ( isPrefixOf, isSuffixOf )

import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as BSC

import GHC.LanguageExtensions.Type

import BasicTypes          ( IntegralLit(..) )
import DynFlags
import qualified EnumSet as E
import ErrUtils            ( emptyMessages )
import FastString          ( mkFastString )
import Lexer               ( P(..), ParseResult(..), PState(..), Token(..)
                           , mkPStatePure, lexer, mkParserFlags' )
import Outputable          ( showSDoc, panic )
import SrcLoc
import StringBuffer        ( StringBuffer, atEnd )

import Haddock.Backends.Hyperlinker.Types as T
import Haddock.GhcUtils

-- | Turn source code string into a stream of more descriptive tokens.
--
-- Result should retain original file layout (including comments,
-- whitespace, and CPP).
parse
  :: CompilerInfo  -- ^ Underlying CC compiler (whatever expanded CPP)
  -> DynFlags      -- ^ Flags for this module
  -> FilePath      -- ^ Path to the source of this module
  -> BS.ByteString -- ^ Raw UTF-8 encoded source of this module
  -> [T.Token]
parse comp dflags fpath bs = case unP (go False []) initState of
    POk _ toks -> reverse toks
    PFailed _ ss errMsg -> panic $ "Hyperlinker parse error at " ++ show ss ++
                                   ": " ++ showSDoc dflags errMsg
  where

    initState = mkPStatePure pflags buf start
    buf = stringBufferFromByteString bs
    start = mkRealSrcLoc (mkFastString fpath) 1 1
    needPragHack' = needPragHack comp dflags
    pflags = mkParserFlags' (warningFlags dflags)
                            (extensionFlags dflags)
                            (thisPackage dflags)
                            (safeImportsOn dflags)
                            False -- lex Haddocks as comment tokens
                            True  -- produce comment tokens
                            False -- produce position pragmas tokens

    go :: Bool        -- ^ are we currently in a pragma?
       -> [T.Token]   -- ^ tokens accumulated so far (in reverse)
       -> P [T.Token]
    go inPrag toks = do
      (b, _) <- getInput
      if not (atEnd b)
        then do
          (newToks, inPrag') <- parseCppLine <|> parsePlainTok inPrag <|> unknownLine
          go inPrag' (newToks ++ toks)
        else
          pure toks

    -- | Like 'Lexer.lexer', but slower, with a better API, and filtering out empty tokens
    wrappedLexer :: P (RealLocated Lexer.Token)
    wrappedLexer = Lexer.lexer False andThen
      where andThen (L (RealSrcSpan s) t)
              | srcSpanStartLine s /= srcSpanEndLine s ||
                srcSpanStartCol s /= srcSpanEndCol s
              = pure (L s t)
            andThen (L (RealSrcSpan s) ITeof) = pure (L s ITeof)
            andThen _ = wrappedLexer

    -- | Try to parse a CPP line (can fail)
    parseCppLine :: P ([T.Token], Bool)
    parseCppLine = do
      (b, l) <- getInput
      case tryCppLine l b of
        Just (cppBStr, l', b')
             -> let cppTok = T.Token { tkType = TkCpp
                                     , tkValue = cppBStr
                                     , tkSpan = mkRealSrcSpan l l' }
                in setInput (b', l') *> pure ([cppTok], False)
        _    -> empty

    -- | Try to parse a regular old token (can fail)
    parsePlainTok :: Bool -> P ([T.Token], Bool)  -- return list is only ever 0-2 elements
    parsePlainTok inPrag = do
      (bInit, lInit) <- getInput
      L sp tok <- Lexer.lexer False return
      (bEnd, _) <- getInput
      case sp of
        UnhelpfulSpan _ -> pure ([], False) -- pretend the token never existed
        RealSrcSpan rsp -> do
          let typ = if inPrag then TkPragma else classify tok
              RealSrcLoc lStart = srcSpanStart sp -- safe since @sp@ is real
              (spaceBStr, bStart) = spanPosition lInit lStart bInit
              inPragDef = inPragma inPrag tok

          (bEnd', inPrag') <- case tok of

            -- Update internal line + file position if this is a LINE pragma
            ITline_prag _ -> tryOrElse (bEnd, inPragDef) $ do
              L _ (ITinteger (IL { il_value = line })) <- wrappedLexer
              L _ (ITstring _ file)                    <- wrappedLexer
              L spF ITclose_prag                       <- wrappedLexer

              let newLoc = mkRealSrcLoc file (fromIntegral line - 1) (srcSpanEndCol spF)
              (bEnd'', _) <- getInput
              setInput (bEnd'', newLoc)

              pure (bEnd'', False)

            -- Update internal column position if this is a COLUMN pragma
            ITcolumn_prag _ -> tryOrElse (bEnd, inPragDef) $ do
              L _ (ITinteger (IL { il_value = col }))  <- wrappedLexer
              L spF ITclose_prag                       <- wrappedLexer

              let newLoc = mkRealSrcLoc (srcSpanFile spF) (srcSpanEndLine spF) (fromIntegral col)
              (bEnd'', _) <- getInput
              setInput (bEnd'', newLoc)

              pure (bEnd'', False)

            -- See 'needPragHack'
            ITclose_prag{}
              | needPragHack'
              , '\n' `BSC.elem` spaceBStr
              -> getInput >>= \(b,p) -> setInput (b,advanceSrcLoc p '\n') >> pure (bEnd, False)

            _ -> pure (bEnd, inPragDef)

          let tokBStr = splitStringBuffer bStart bEnd'
              plainTok = T.Token { tkType = typ
                                 , tkValue = tokBStr
                                 , tkSpan = rsp }
              spaceTok = T.Token { tkType = TkSpace
                                 , tkValue = spaceBStr
                                 , tkSpan = mkRealSrcSpan lInit lStart }

          pure (plainTok : [ spaceTok | not (BS.null spaceBStr) ], inPrag')

    -- | Parse whatever remains of the line as an unknown token (can't fail)
    unknownLine :: P ([T.Token], Bool)
    unknownLine = do
      (b, l) <- getInput
      let (unkBStr, l', b') = spanLine l b
          unkTok = T.Token { tkType = TkUnknown
                           , tkValue = unkBStr
                           , tkSpan = mkRealSrcSpan l l' }
      setInput (b', l')
      pure ([unkTok], False)


-- | This is really, really, /really/ gross. Problem: consider a Haskell
-- file that looks like:
--
-- @
-- {-# LANGUAGE CPP #-}
-- module SomeMod where
--
-- #define SIX 6
--
-- {-# INLINE foo
--   #-}
-- foo = 1
-- @
--
-- Clang's CPP replaces the @#define SIX 6@ line with an empty line (as it
-- should), but get confused about @#-}@. I'm guessing it /starts/ by
-- parsing that as a pre-processor directive and, when it fails to, it just
-- leaves the line alone. HOWEVER, it still adds an extra newline. =.=
--
-- This function makes sure that the Hyperlinker backend also adds that
-- extra newline (or else our spans won't line up with GHC's anymore).
needPragHack :: CompilerInfo -> DynFlags -> Bool
needPragHack comp dflags = isCcClang && E.member Cpp (extensionFlags dflags)
  where
    isCcClang = case comp of
      GCC -> False
      Clang -> True
      AppleClang -> True
      AppleClang51 -> True
      UnknownCC -> False

-- | Get the input
getInput :: P (StringBuffer, RealSrcLoc)
getInput = P $ \p @ PState { buffer = buf, loc = srcLoc } -> POk p (buf, srcLoc)

-- | Set the input
setInput :: (StringBuffer, RealSrcLoc) -> P ()
setInput (buf, srcLoc) = P $ \p -> POk (p { buffer = buf, loc = srcLoc }) ()


-- | Orphan instance that adds backtracking to 'P'
instance Alternative P where
  empty = P $ \_ -> PFailed (const emptyMessages) noSrcSpan "Alterative.empty"
  P x <|> P y = P $ \s -> case x s of { p@POk{} -> p
                                      ; _ -> y s }

-- | Try a parser. If it fails, backtrack and return the pure value.
tryOrElse :: a -> P a -> P a
tryOrElse x p = p <|> pure x

-- | Classify given tokens as appropriate Haskell token type.
classify :: Lexer.Token -> TokenType
classify tok =
  case tok of
    ITas                   -> TkKeyword
    ITcase                 -> TkKeyword
    ITclass                -> TkKeyword
    ITdata                 -> TkKeyword
    ITdefault              -> TkKeyword
    ITderiving             -> TkKeyword
    ITdo                   -> TkKeyword
    ITelse                 -> TkKeyword
    IThiding               -> TkKeyword
    ITforeign              -> TkKeyword
    ITif                   -> TkKeyword
    ITimport               -> TkKeyword
    ITin                   -> TkKeyword
    ITinfix                -> TkKeyword
    ITinfixl               -> TkKeyword
    ITinfixr               -> TkKeyword
    ITinstance             -> TkKeyword
    ITlet                  -> TkKeyword
    ITmodule               -> TkKeyword
    ITnewtype              -> TkKeyword
    ITof                   -> TkKeyword
    ITqualified            -> TkKeyword
    ITthen                 -> TkKeyword
    ITtype                 -> TkKeyword
    ITvia                  -> TkKeyword
    ITwhere                -> TkKeyword

    ITforall            {} -> TkKeyword
    ITexport               -> TkKeyword
    ITlabel                -> TkKeyword
    ITdynamic              -> TkKeyword
    ITsafe                 -> TkKeyword
    ITinterruptible        -> TkKeyword
    ITunsafe               -> TkKeyword
    ITstdcallconv          -> TkKeyword
    ITccallconv            -> TkKeyword
    ITcapiconv             -> TkKeyword
    ITprimcallconv         -> TkKeyword
    ITjavascriptcallconv   -> TkKeyword
    ITmdo                  -> TkKeyword
    ITfamily               -> TkKeyword
    ITrole                 -> TkKeyword
    ITgroup                -> TkKeyword
    ITby                   -> TkKeyword
    ITusing                -> TkKeyword
    ITpattern              -> TkKeyword
    ITstatic               -> TkKeyword
    ITstock                -> TkKeyword
    ITanyclass             -> TkKeyword

    ITunit                 -> TkKeyword
    ITsignature            -> TkKeyword
    ITdependency           -> TkKeyword
    ITrequires             -> TkKeyword

    ITinline_prag       {} -> TkPragma
    ITspec_prag         {} -> TkPragma
    ITspec_inline_prag  {} -> TkPragma
    ITsource_prag       {} -> TkPragma
    ITrules_prag        {} -> TkPragma
    ITwarning_prag      {} -> TkPragma
    ITdeprecated_prag   {} -> TkPragma
    ITline_prag         {} -> TkPragma
    ITcolumn_prag       {} -> TkPragma
    ITscc_prag          {} -> TkPragma
    ITgenerated_prag    {} -> TkPragma
    ITcore_prag         {} -> TkPragma
    ITunpack_prag       {} -> TkPragma
    ITnounpack_prag     {} -> TkPragma
    ITann_prag          {} -> TkPragma
    ITcomplete_prag     {} -> TkPragma
    ITclose_prag           -> TkPragma
    IToptions_prag      {} -> TkPragma
    ITinclude_prag      {} -> TkPragma
    ITlanguage_prag        -> TkPragma
    ITminimal_prag      {} -> TkPragma
    IToverlappable_prag {} -> TkPragma
    IToverlapping_prag  {} -> TkPragma
    IToverlaps_prag     {} -> TkPragma
    ITincoherent_prag   {} -> TkPragma
    ITctype             {} -> TkPragma

    ITdotdot               -> TkGlyph
    ITcolon                -> TkGlyph
    ITdcolon            {} -> TkGlyph
    ITequal                -> TkGlyph
    ITlam                  -> TkGlyph
    ITlcase                -> TkGlyph
    ITvbar                 -> TkGlyph
    ITlarrow            {} -> TkGlyph
    ITrarrow            {} -> TkGlyph
    ITat                   -> TkGlyph
    ITtilde                -> TkGlyph
    ITdarrow            {} -> TkGlyph
    ITminus                -> TkGlyph
    ITbang                 -> TkGlyph
    ITdot                  -> TkOperator
    ITstar              {} -> TkOperator
    ITtypeApp              -> TkGlyph

    ITbiglam               -> TkGlyph

    ITocurly               -> TkSpecial
    ITccurly               -> TkSpecial
    ITvocurly              -> TkSpecial
    ITvccurly              -> TkSpecial
    ITobrack               -> TkSpecial
    ITopabrack             -> TkSpecial
    ITcpabrack             -> TkSpecial
    ITcbrack               -> TkSpecial
    IToparen               -> TkSpecial
    ITcparen               -> TkSpecial
    IToubxparen            -> TkSpecial
    ITcubxparen            -> TkSpecial
    ITsemi                 -> TkSpecial
    ITcomma                -> TkSpecial
    ITunderscore           -> TkIdentifier
    ITbackquote            -> TkSpecial
    ITsimpleQuote          -> TkSpecial

    ITvarid             {} -> TkIdentifier
    ITconid             {} -> TkIdentifier
    ITvarsym            {} -> TkOperator
    ITconsym            {} -> TkOperator
    ITqvarid            {} -> TkIdentifier
    ITqconid            {} -> TkIdentifier
    ITqvarsym           {} -> TkOperator
    ITqconsym           {} -> TkOperator

    ITdupipvarid        {} -> TkUnknown
    ITlabelvarid        {} -> TkUnknown

    ITchar              {} -> TkChar
    ITstring            {} -> TkString
    ITinteger           {} -> TkNumber
    ITrational          {} -> TkNumber

    ITprimchar          {} -> TkChar
    ITprimstring        {} -> TkString
    ITprimint           {} -> TkNumber
    ITprimword          {} -> TkNumber
    ITprimfloat         {} -> TkNumber
    ITprimdouble        {} -> TkNumber

    ITopenExpQuote      {} -> TkSpecial
    ITopenPatQuote         -> TkSpecial
    ITopenDecQuote         -> TkSpecial
    ITopenTypQuote         -> TkSpecial
    ITcloseQuote        {} -> TkSpecial
    ITopenTExpQuote     {} -> TkSpecial
    ITcloseTExpQuote       -> TkSpecial
    ITidEscape          {} -> TkUnknown
    ITparenEscape          -> TkSpecial
    ITidTyEscape        {} -> TkUnknown
    ITparenTyEscape        -> TkSpecial
    ITtyQuote              -> TkSpecial
    ITquasiQuote        {} -> TkUnknown
    ITqQuasiQuote       {} -> TkUnknown

    ITproc                 -> TkKeyword
    ITrec                  -> TkKeyword
    IToparenbar         {} -> TkGlyph
    ITcparenbar         {} -> TkGlyph
    ITlarrowtail        {} -> TkGlyph
    ITrarrowtail        {} -> TkGlyph
    ITLarrowtail        {} -> TkGlyph
    ITRarrowtail        {} -> TkGlyph

    ITcomment_line_prag    -> TkUnknown
    ITunknown           {} -> TkUnknown
    ITeof                  -> TkUnknown

    ITlineComment       {} -> TkComment
    ITdocCommentNext    {} -> TkComment
    ITdocCommentPrev    {} -> TkComment
    ITdocCommentNamed   {} -> TkComment
    ITdocSection        {} -> TkComment
    ITdocOptions        {} -> TkComment

    -- The lexer considers top-level pragmas as comments (see `pragState` in
    -- the GHC lexer for more), so we have to manually reverse this. The
    -- following is a hammer: it smashes _all_ pragma-like block comments into
    -- pragmas.
    ITblockComment c
      | isPrefixOf "{-#" c
      , isSuffixOf "#-}" c -> TkPragma
      | otherwise          -> TkComment

-- | Classify given tokens as beginning pragmas (or not).
inPragma :: Bool           -- ^ currently in pragma
         -> Lexer.Token    -- ^ current token
         -> Bool           -- ^ new information about whether we are in a pragma
inPragma _ ITclose_prag = False
inPragma True _ = True
inPragma False tok =
  case tok of
    ITinline_prag       {} -> True
    ITspec_prag         {} -> True
    ITspec_inline_prag  {} -> True
    ITsource_prag       {} -> True
    ITrules_prag        {} -> True
    ITwarning_prag      {} -> True
    ITdeprecated_prag   {} -> True
    ITline_prag         {} -> True
    ITcolumn_prag       {} -> True
    ITscc_prag          {} -> True
    ITgenerated_prag    {} -> True
    ITcore_prag         {} -> True
    ITunpack_prag       {} -> True
    ITnounpack_prag     {} -> True
    ITann_prag          {} -> True
    ITcomplete_prag     {} -> True
    IToptions_prag      {} -> True
    ITinclude_prag      {} -> True
    ITlanguage_prag        -> True
    ITminimal_prag      {} -> True
    IToverlappable_prag {} -> True
    IToverlapping_prag  {} -> True
    IToverlaps_prag     {} -> True
    ITincoherent_prag   {} -> True
    ITctype             {} -> True

    _                      -> False