-- -- Haddock - A Haskell Documentation Tool -- -- (c) Simon Marlow 2002 -- -- This file was modified and integrated into GHC by David Waern 2006. -- Then moved back into Haddock by Isaac Dupree in 2009 :-) -- { {-# LANGUAGE BangPatterns #-} -- Generated by Alex {-# OPTIONS -Wwarn -w #-} -- The above warning supression flag is a temporary kludge. -- While working on this module you are encouraged to remove it and fix -- any warnings in the module. See -- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings -- for details module Haddock.Lex ( Token(..), LToken, tokenise ) where import Lexer hiding (Token) import Parser ( parseIdentifier ) import StringBuffer import RdrName import SrcLoc import DynFlags import FastString import qualified Data.Bits import Data.Char import Data.Word (Word8) import Numeric import System.IO.Unsafe import Debug.Trace } $ws = $white # \n $digit = [0-9] $hexdigit = [0-9a-fA-F] $special = [\"\@] $alphanum = [A-Za-z0-9] $ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:] :- -- beginning of a paragraph <0,para> { $ws* \n ; $ws* \> { begin birdtrack } $ws* prop \> .* \n { strtoken TokProperty `andBegin` property} $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } $ws* [\*\-] { token TokBullet `andBegin` string } $ws* \[ { token TokDefStart `andBegin` def } $ws* $ $digit+ $ { token TokNumber `andBegin` string } $ws* $digit+ \. { token TokNumber `andBegin` string } $ws* { begin string } } -- beginning of a line { $ws* \> { begin birdtrack } $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } $ws* \n { token TokPara `andBegin` para } -- Here, we really want to be able to say -- $ws* (\n | ) { token TokPara `andBegin` para} -- because otherwise a trailing line of whitespace will result in -- a spurious TokString at the end of a docstring. We don't have , -- though (NOW I realise what it was for :-). To get around this, we always -- append \n to the end of a docstring. () { begin string } } .* \n? { strtokenNL TokBirdTrack `andBegin` line } () { token TokPara `andBegin` para } { $ws* \n { token TokPara `andBegin` para } $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } () { begin exampleresult } } .* \n { strtokenNL TokExampleExpression `andBegin` example } .* \n { strtokenNL TokExampleResult `andBegin` example } { $special { strtoken $ \s -> TokSpecial (head s) } \<\< [^\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) } \< [^\>]* \> { strtoken $ \s -> TokURL (init (tail s)) } \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) } \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) } [\'\`] $ident+ [\'\`] { ident } \\ . { strtoken (TokString . tail) } "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] } "&#" [xX] $hexdigit+ \; { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] } -- allow special characters through if they don't fit one of the previous -- patterns. [\/\'\`\<\#\&\\] { strtoken TokString } [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line } [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString } } { \] { token TokDefEnd `andBegin` string } } -- ']' doesn't have any special meaning outside of the [...] at the beginning -- of a definition paragraph. { \] { strtoken TokString } } { -- | A located token type LToken = (Token, AlexPosn) data Token = TokPara | TokNumber | TokBullet | TokDefStart | TokDefEnd | TokSpecial Char | TokIdent RdrName | TokString String | TokURL String | TokPic String | TokEmphasis String | TokAName String | TokBirdTrack String | TokProperty String | TokExamplePrompt String | TokExampleExpression String | TokExampleResult String -- deriving Show tokenPos :: LToken -> (Int, Int) tokenPos t = let AlexPn _ line col = snd t in (line, col) -- ----------------------------------------------------------------------------- -- Alex support stuff -- XXX: copied the posn wrapper code from Alex to make this lexer work -- with both Alex 2.x and Alex 3.x. However, we are not using the -- Unicode/UTF-8 support in Alex 3.x, and Unicode documentation will -- probably get mangled. -- | Encode a Haskell String to a list of Word8 values, in UTF8 format. utf8Encode :: Char -> [Word8] utf8Encode = map fromIntegral . go . ord where go oc | oc <= 0x7f = [oc] | oc <= 0x7ff = [ 0xc0 + (oc `Data.Bits.shiftR` 6) , 0x80 + oc Data.Bits..&. 0x3f ] | oc <= 0xffff = [ 0xe0 + (oc `Data.Bits.shiftR` 12) , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f) , 0x80 + oc Data.Bits..&. 0x3f ] | otherwise = [ 0xf0 + (oc `Data.Bits.shiftR` 18) , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f) , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f) , 0x80 + oc Data.Bits..&. 0x3f ] type Byte = Word8 type AlexInput = (AlexPosn, -- current position, Char, -- previous char [Byte], -- pending bytes on current char String) -- current input string alexInputPrevChar :: AlexInput -> Char alexInputPrevChar (p,c,bs,s) = c alexGetByte :: AlexInput -> Maybe (Byte,AlexInput) alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s)) alexGetByte (p,c,[],[]) = Nothing alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c (b:bs) = utf8Encode c in p' `seq` Just (b, (p', c, bs, s)) data AlexPosn = AlexPn !Int !Int !Int deriving (Eq,Show) alexMove :: AlexPosn -> Char -> AlexPosn alexMove (AlexPn a l c) '\t' = AlexPn (a+1) l (((c+7) `div` 8)*8+1) alexMove (AlexPn a l c) '\n' = AlexPn (a+1) (l+1) 1 alexMove (AlexPn a l c) _ = AlexPn (a+1) l (c+1) type StartCode = Int type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> DynFlags -> [LToken] tokenise :: DynFlags -> String -> (Int, Int) -> [LToken] tokenise dflags str (line, col) = go (posn,'\n',[],eofHack str) para where posn = AlexPn 0 line col go inp@(pos,_,_,str) sc = case alexScan inp sc of AlexEOF -> [] AlexError _ -> [] AlexSkip inp' len -> go inp' sc AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) dflags -- NB. we add a final \n to the string, (see comment in the beginning of line -- production above). eofHack str = str++"\n" andBegin :: Action -> StartCode -> Action andBegin act new_sc = \pos str _ cont dflags -> act pos str new_sc cont dflags token :: Token -> Action token t = \pos _ sc cont _ -> (t, pos) : cont sc strtoken, strtokenNL :: (String -> Token) -> Action strtoken t = \pos str sc cont _ -> (t str, pos) : cont sc strtokenNL t = \pos str sc cont _ -> (t (filter (/= '\r') str), pos) : cont sc -- ^ We only want LF line endings in our internal doc string format, so we -- filter out all CRs. begin :: StartCode -> Action begin sc = \_ _ _ cont _ -> cont sc -- ----------------------------------------------------------------------------- -- Lex a string as a Haskell identifier ident :: Action ident pos str sc cont dflags = case parseIdent dflags loc id of Just names -> (TokIdent names, pos) : cont sc Nothing -> (TokString str, pos) : cont sc where id = init (tail str) -- TODO: Get the real filename here. Maybe we should just be -- using GHC SrcLoc's ourself? filename = mkFastString "" loc = case pos of AlexPn _ line col -> mkRealSrcLoc filename line col parseIdent :: DynFlags -> RealSrcLoc -> String -> Maybe RdrName parseIdent dflags loc str0 = let buffer = stringToStringBuffer str0 pstate = mkPState dflags buffer loc result = unP parseIdentifier pstate in case result of POk _ name -> Just (unLoc name) _ -> Nothing }