aboutsummaryrefslogtreecommitdiff
path: root/src/Haddock/Lex.x
diff options
context:
space:
mode:
Diffstat (limited to 'src/Haddock/Lex.x')
-rw-r--r--src/Haddock/Lex.x172
1 files changed, 172 insertions, 0 deletions
diff --git a/src/Haddock/Lex.x b/src/Haddock/Lex.x
new file mode 100644
index 00000000..04b9f0c3
--- /dev/null
+++ b/src/Haddock/Lex.x
@@ -0,0 +1,172 @@
+--
+-- Haddock - A Haskell Documentation Tool
+--
+-- (c) Simon Marlow 2002
+--
+-- This file was modified and integrated into GHC by David Waern 2006.
+-- Then moved back into Haddock by Isaac Dupree in 2009 :-)
+--
+
+{
+{-# OPTIONS -Wwarn -w #-}
+-- The above warning supression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and fix
+-- any warnings in the module. See
+-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
+-- for details
+
+module Haddock.Lex (
+ Token(..),
+ tokenise
+ ) where
+
+import Lexer hiding (Token)
+import Parser ( parseIdentifier )
+import StringBuffer
+import RdrName
+import SrcLoc
+import DynFlags
+
+import Data.Char
+import Numeric
+import System.IO.Unsafe
+}
+
+$ws = $white # \n
+$digit = [0-9]
+$hexdigit = [0-9a-fA-F]
+$special = [\"\@]
+$alphanum = [A-Za-z0-9]
+$ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~]
+
+:-
+
+-- beginning of a paragraph
+<0,para> {
+ $ws* \n ;
+ $ws* \> { begin birdtrack }
+ $ws* [\*\-] { token TokBullet `andBegin` string }
+ $ws* \[ { token TokDefStart `andBegin` def }
+ $ws* \( $digit+ \) { token TokNumber `andBegin` string }
+ $ws* { begin string }
+}
+
+-- beginning of a line
+<line> {
+ $ws* \> { begin birdtrack }
+ $ws* \n { token TokPara `andBegin` para }
+ -- Here, we really want to be able to say
+ -- $ws* (\n | <eof>) { token TokPara `andBegin` para}
+ -- because otherwise a trailing line of whitespace will result in
+ -- a spurious TokString at the end of a docstring. We don't have <eof>,
+ -- though (NOW I realise what it was for :-). To get around this, we always
+ -- append \n to the end of a docstring.
+ () { begin string }
+}
+
+<birdtrack> .* \n? { strtokenNL TokBirdTrack `andBegin` line }
+
+<string,def> {
+ $special { strtoken $ \s -> TokSpecial (head s) }
+ \<\<.*\>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
+ \<.*\> { strtoken $ \s -> TokURL (init (tail s)) }
+ \#.*\# { strtoken $ \s -> TokAName (init (tail s)) }
+ \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) }
+ [\'\`] $ident+ [\'\`] { ident }
+ \\ . { strtoken (TokString . tail) }
+ "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
+ "&#" [xX] $hexdigit+ \; { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
+ -- allow special characters through if they don't fit one of the previous
+ -- patterns.
+ [\/\'\`\<\#\&\\] { strtoken TokString }
+ [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
+ [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString }
+}
+
+<def> {
+ \] { token TokDefEnd `andBegin` string }
+}
+
+-- ']' doesn't have any special meaning outside of the [...] at the beginning
+-- of a definition paragraph.
+<string> {
+ \] { strtoken TokString }
+}
+
+{
+data Token
+ = TokPara
+ | TokNumber
+ | TokBullet
+ | TokDefStart
+ | TokDefEnd
+ | TokSpecial Char
+ | TokIdent [RdrName]
+ | TokString String
+ | TokURL String
+ | TokPic String
+ | TokEmphasis String
+ | TokAName String
+ | TokBirdTrack String
+-- deriving Show
+
+-- -----------------------------------------------------------------------------
+-- Alex support stuff
+
+type StartCode = Int
+type Action = String -> StartCode -> (StartCode -> [Token]) -> [Token]
+
+type AlexInput = (Char,String)
+
+alexGetChar (_, []) = Nothing
+alexGetChar (_, c:cs) = Just (c, (c,cs))
+
+alexInputPrevChar (c,_) = c
+
+tokenise :: String -> [Token]
+tokenise str = let toks = go ('\n', eofHack str) para in {-trace (show toks)-} toks
+ where go inp@(_,str) sc =
+ case alexScan inp sc of
+ AlexEOF -> []
+ AlexError _ -> error "lexical error"
+ AlexSkip inp' _ -> go inp' sc
+ AlexToken inp' len act -> act (take len str) sc (\sc -> go inp' sc)
+
+-- NB. we add a final \n to the string, (see comment in the beginning of line
+-- production above).
+eofHack str = str++"\n"
+
+andBegin :: Action -> StartCode -> Action
+andBegin act new_sc = \str _ cont -> act str new_sc cont
+
+token :: Token -> Action
+token t = \_ sc cont -> t : cont sc
+
+strtoken, strtokenNL :: (String -> Token) -> Action
+strtoken t = \str sc cont -> t str : cont sc
+strtokenNL t = \str sc cont -> t (filter (/= '\r') str) : cont sc
+-- ^ We only want LF line endings in our internal doc string format, so we
+-- filter out all CRs.
+
+begin :: StartCode -> Action
+begin sc = \_ _ cont -> cont sc
+
+-- -----------------------------------------------------------------------------
+-- Lex a string as a Haskell identifier
+
+ident :: Action
+ident str sc cont =
+ case strToHsQNames id of
+ Just names -> TokIdent names : cont sc
+ Nothing -> TokString str : cont sc
+ where id = init (tail str)
+
+strToHsQNames :: String -> Maybe [RdrName]
+strToHsQNames str0 =
+ let buffer = unsafePerformIO (stringToStringBuffer str0)
+ pstate = mkPState buffer noSrcLoc defaultDynFlags
+ result = unP parseIdentifier pstate
+ in case result of
+ POk _ name -> Just [unLoc name]
+ _ -> Nothing
+}