--
-- Haddock - A Haskell Documentation Tool
--
-- (c) Simon Marlow 2002
--
-- This file was modified and integrated into GHC by David Waern 2006.
-- Then moved back into Haddock by Isaac Dupree in 2009 :-)
--

{
{-# LANGUAGE BangPatterns #-}   -- Generated by Alex
{-# OPTIONS -Wwarn -w #-}
-- The above warning supression flag is a temporary kludge.
-- While working on this module you are encouraged to remove it and fix
-- any warnings in the module. See
--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
-- for details

module Haddock.Lex (
	Token(..),
	LToken,
	tokenise
 ) where

import Lexer hiding (Token)
import Parser ( parseIdentifier )
import StringBuffer
import RdrName
import SrcLoc
import DynFlags
import FastString

import qualified Data.Bits
import Data.Char
import Data.Word (Word8)
import Numeric
import System.IO.Unsafe
import Debug.Trace
}

$ws    = $white # \n
$digit = [0-9]
$hexdigit = [0-9a-fA-F]
$special =  [\"\@]
$alphanum = [A-Za-z0-9]
$ident    = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:]

:-

-- beginning of a paragraph
<0,para> {
 $ws* \n		;
 $ws* \>		{ begin birdtrack }
 $ws* prop \> .* \n	{ strtoken TokProperty `andBegin` property}
 $ws* \>\>\>            { strtoken TokExamplePrompt `andBegin` exampleexpr }
 $ws* [\*\-]		{ token TokBullet `andBegin` string }
 $ws* \[		{ token TokDefStart `andBegin` def }
 $ws* \( $digit+ \) 	{ token TokNumber `andBegin` string }
 $ws* $digit+ \. 	{ token TokNumber `andBegin` string }
 $ws*			{ begin string }		
}

-- beginning of a line
<line> {
  $ws* \>		{ begin birdtrack }
  $ws* \>\>\>		{ strtoken TokExamplePrompt `andBegin` exampleexpr }
  $ws* \n		{ token TokPara `andBegin` para }
  -- Here, we really want to be able to say
  -- $ws* (\n | <eof>) 	{ token TokPara `andBegin` para}
  -- because otherwise a trailing line of whitespace will result in 
  -- a spurious TokString at the end of a docstring.  We don't have <eof>,
  -- though (NOW I realise what it was for :-).  To get around this, we always
  -- append \n to the end of a docstring.
  () 			{ begin string }
}

<birdtrack> .*	\n?	{ strtokenNL TokBirdTrack `andBegin` line }

<property> ()           { token TokPara `andBegin` para }

<example> {
  $ws*	\n		{ token TokPara `andBegin` para }
  $ws* \>\>\>	        { strtoken TokExamplePrompt `andBegin` exampleexpr }
  ()			{ begin exampleresult }
}

<exampleexpr> .* \n	{ strtokenNL TokExampleExpression `andBegin` example }

<exampleresult> .* \n	{ strtokenNL TokExampleResult `andBegin` example }

<string,def> {
  $special			{ strtoken $ \s -> TokSpecial (head s) }
  \<\< [^\>]* \>\>              { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
  \< [^\>]* \>			{ strtoken $ \s -> TokURL (init (tail s)) }
  \# [^\#]* \#			{ strtoken $ \s -> TokAName (init (tail s)) }
  \/ [^\/]* \/                  { strtoken $ \s -> TokEmphasis (init (tail s)) }
  [\'\`] $ident+ [\'\`]		{ ident }
  \\ .				{ strtoken (TokString . tail) }
  "&#" $digit+ \;		{ strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
  "&#" [xX] $hexdigit+ \;	{ strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
  -- allow special characters through if they don't fit one of the previous
  -- patterns.
  [\/\'\`\<\#\&\\]			{ strtoken TokString }
  [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
  [^ $special \/ \< \# \n \'\` \& \\ \]]+    { strtoken TokString }
}

<def> {
  \]				{ token TokDefEnd `andBegin` string }
}

-- ']' doesn't have any special meaning outside of the [...] at the beginning
-- of a definition paragraph.
<string> {
  \]				{ strtoken TokString }
}

{
-- | A located token
type LToken = (Token, AlexPosn)

data Token
  = TokPara
  | TokNumber
  | TokBullet
  | TokDefStart
  | TokDefEnd
  | TokSpecial Char
  | TokIdent RdrName
  | TokString String
  | TokURL String
  | TokPic String
  | TokEmphasis String
  | TokAName String
  | TokBirdTrack String
  | TokProperty String
  | TokExamplePrompt String
  | TokExampleExpression String
  | TokExampleResult String
--  deriving Show

tokenPos :: LToken -> (Int, Int)
tokenPos t = let AlexPn _ line col = snd t in (line, col)

-- -----------------------------------------------------------------------------
-- Alex support stuff

-- XXX: copied the posn wrapper code from Alex to make this lexer work
-- with both Alex 2.x and Alex 3.x.  However, we are not using the
-- Unicode/UTF-8 support in Alex 3.x, and Unicode documentation will
-- probably get mangled.

-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
utf8Encode :: Char -> [Word8]
utf8Encode = map fromIntegral . go . ord
 where
  go oc
   | oc <= 0x7f       = [oc]

   | oc <= 0x7ff      = [ 0xc0 + (oc `Data.Bits.shiftR` 6)
                        , 0x80 + oc Data.Bits..&. 0x3f
                        ]

   | oc <= 0xffff     = [ 0xe0 + (oc `Data.Bits.shiftR` 12)
                        , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
                        , 0x80 + oc Data.Bits..&. 0x3f
                        ]
   | otherwise        = [ 0xf0 + (oc `Data.Bits.shiftR` 18)
                        , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f)
                        , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
                        , 0x80 + oc Data.Bits..&. 0x3f
                        ]

type Byte = Word8

type AlexInput = (AlexPosn,     -- current position,
                  Char,         -- previous char
                  [Byte],       -- pending bytes on current char
                  String)       -- current input string

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (p,c,bs,s) = c

alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)
alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s))
alexGetByte (p,c,[],[]) = Nothing
alexGetByte (p,_,[],(c:s))  = let p' = alexMove p c
                                  (b:bs) = utf8Encode c
                              in p' `seq`  Just (b, (p', c, bs, s))

data AlexPosn = AlexPn !Int !Int !Int
        deriving (Eq,Show)

alexMove :: AlexPosn -> Char -> AlexPosn
alexMove (AlexPn a l c) '\t' = AlexPn (a+1)  l     (((c+7) `div` 8)*8+1)
alexMove (AlexPn a l c) '\n' = AlexPn (a+1) (l+1)   1
alexMove (AlexPn a l c) _    = AlexPn (a+1)  l     (c+1)

type StartCode = Int
type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> DynFlags -> [LToken]

tokenise :: DynFlags -> String -> (Int, Int) -> [LToken]
tokenise dflags str (line, col) = go (posn,'\n',[],eofHack str) para
  where posn = AlexPn 0 line col
        go inp@(pos,_,_,str) sc =
          case alexScan inp sc of
                AlexEOF -> []
                AlexError _ -> []
                AlexSkip  inp' len     -> go inp' sc
                AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) dflags

-- NB. we add a final \n to the string, (see comment in the beginning of line
-- production above).
eofHack str = str++"\n"

andBegin  :: Action -> StartCode -> Action
andBegin act new_sc = \pos str _ cont dflags -> act pos str new_sc cont dflags

token :: Token -> Action
token t = \pos _ sc cont _ -> (t, pos) : cont sc

strtoken, strtokenNL :: (String -> Token) -> Action
strtoken t = \pos str sc cont _ -> (t str, pos) : cont sc
strtokenNL t = \pos str sc cont _ -> (t (filter (/= '\r') str), pos) : cont sc
-- ^ We only want LF line endings in our internal doc string format, so we
-- filter out all CRs.

begin :: StartCode -> Action
begin sc = \_ _ _ cont _ -> cont sc

-- -----------------------------------------------------------------------------
-- Lex a string as a Haskell identifier

ident :: Action
ident pos str sc cont dflags = 
  case parseIdent dflags loc id of
	Just names -> (TokIdent names, pos) : cont sc
	Nothing -> (TokString str, pos) : cont sc
 where id = init (tail str)
       -- TODO: Get the real filename here. Maybe we should just be
       --       using GHC SrcLoc's ourself?
       filename = mkFastString "<unknown file>"
       loc = case pos of
             AlexPn _ line col ->
                 mkRealSrcLoc filename line col

parseIdent :: DynFlags -> RealSrcLoc -> String -> Maybe RdrName
parseIdent dflags loc str0 = 
  let buffer = stringToStringBuffer str0
      pstate = mkPState dflags buffer loc
      result = unP parseIdentifier pstate 
  in case result of 
       POk _ name -> Just (unLoc name)
       _ -> Nothing
}