Move doc parsing/lexing into Haddock for ghc>=6.11

author: Isaac Dupree <id@isaac.cedarswampstudios.org> 2009-08-23 03:08:03 +0000
committer: Isaac Dupree <id@isaac.cedarswampstudios.org> 2009-08-23 03:08:03 +0000
commit: 70945271f1f4deab363c1acfef3ce51a1b7f617d (patch)
tree: 81d518f2ca4dde1a1c72072b8a67290de2952388 /src/Haddock/Interface/Lex.x
parent: 9dc98d20af5bbcb8bff7624b3d8c4d840ed6bb4e (diff)
1 files changed, 171 insertions, 0 deletions
diff --git a/src/Haddock/Interface/Lex.x b/src/Haddock/Interface/Lex.x
new file mode 100644
index 00000000..5f93084a
--- /dev/null
+++ b/src/Haddock/Interface/Lex.x
@@ -0,0 +1,171 @@
+--
+-- Haddock - A Haskell Documentation Tool
+--
+-- (c) Simon Marlow 2002
+--
+-- This file was modified and integrated into GHC by David Waern 2006
+--
+
+{
+{-# OPTIONS -Wwarn -w #-}
+-- The above warning supression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and fix
+-- any warnings in the module. See
+--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
+-- for details
+
+module Haddock.Interface.Lex (
+	Token(..),
+	tokenise
+ ) where
+
+import Lexer hiding (Token)
+import Parser ( parseIdentifier )
+import StringBuffer
+import RdrName
+import SrcLoc
+import DynFlags
+
+import Data.Char
+import Numeric
+import System.IO.Unsafe
+}
+
+$ws    = $white # \n
+$digit = [0-9]
+$hexdigit = [0-9a-fA-F]
+$special =  [\"\@]
+$alphanum = [A-Za-z0-9]
+$ident    = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~]
+
+:-
+
+-- beginning of a paragraph
+<0,para> {
+ $ws* \n		;
+ $ws* \>		{ begin birdtrack }
+ $ws* [\*\-]		{ token TokBullet `andBegin` string }
+ $ws* \[		{ token TokDefStart `andBegin` def }
+ $ws* \( $digit+ \) 	{ token TokNumber `andBegin` string }
+ $ws*			{ begin string }		
+}
+
+-- beginning of a line
+<line> {
+  $ws* \>		{ begin birdtrack }
+  $ws* \n		{ token TokPara `andBegin` para }
+  -- Here, we really want to be able to say
+  -- $ws* (\n | <eof>) 	{ token TokPara `andBegin` para}
+  -- because otherwise a trailing line of whitespace will result in 
+  -- a spurious TokString at the end of a docstring.  We don't have <eof>,
+  -- though (NOW I realise what it was for :-).  To get around this, we always
+  -- append \n to the end of a docstring.
+  () 			{ begin string }
+}
+
+<birdtrack> .*	\n?	{ strtokenNL TokBirdTrack `andBegin` line }
+
+<string,def> {
+  $special			{ strtoken $ \s -> TokSpecial (head s) }
+  \<\<.*\>\>                    { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
+  \<.*\>			{ strtoken $ \s -> TokURL (init (tail s)) }
+  \#.*\#			{ strtoken $ \s -> TokAName (init (tail s)) }
+  \/ [^\/]* \/                  { strtoken $ \s -> TokEmphasis (init (tail s)) }
+  [\'\`] $ident+ [\'\`]		{ ident }
+  \\ .				{ strtoken (TokString . tail) }
+  "&#" $digit+ \;		{ strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
+  "&#" [xX] $hexdigit+ \;	{ strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
+  -- allow special characters through if they don't fit one of the previous
+  -- patterns.
+  [\/\'\`\<\#\&\\]			{ strtoken TokString }
+  [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
+  [^ $special \/ \< \# \n \'\` \& \\ \]]+    { strtoken TokString }
+}
+
+<def> {
+  \]				{ token TokDefEnd `andBegin` string }
+}
+
+-- ']' doesn't have any special meaning outside of the [...] at the beginning
+-- of a definition paragraph.
+<string> {
+  \]				{ strtoken TokString }
+}
+
+{
+data Token
+  = TokPara
+  | TokNumber
+  | TokBullet
+  | TokDefStart
+  | TokDefEnd
+  | TokSpecial Char
+  | TokIdent [RdrName]
+  | TokString String
+  | TokURL String
+  | TokPic String
+  | TokEmphasis String
+  | TokAName String
+  | TokBirdTrack String
+--  deriving Show
+
+-- -----------------------------------------------------------------------------
+-- Alex support stuff
+
+type StartCode = Int
+type Action = String -> StartCode -> (StartCode -> [Token]) -> [Token]
+
+type AlexInput = (Char,String)
+
+alexGetChar (_, [])   = Nothing
+alexGetChar (_, c:cs) = Just (c, (c,cs))
+
+alexInputPrevChar (c,_) = c
+
+tokenise :: String -> [Token]
+tokenise str = let toks = go ('\n', eofHack str) para in {-trace (show toks)-} toks
+  where go inp@(_,str) sc =
+	  case alexScan inp sc of
+		AlexEOF -> []
+		AlexError _ -> error "lexical error"
+		AlexSkip  inp' _       -> go inp' sc
+		AlexToken inp' len act -> act (take len str) sc (\sc -> go inp' sc)
+
+-- NB. we add a final \n to the string, (see comment in the beginning of line
+-- production above).
+eofHack str = str++"\n"
+
+andBegin  :: Action -> StartCode -> Action
+andBegin act new_sc = \str _ cont -> act str new_sc cont
+
+token :: Token -> Action
+token t = \_ sc cont -> t : cont sc
+
+strtoken, strtokenNL :: (String -> Token) -> Action
+strtoken t = \str sc cont -> t str : cont sc
+strtokenNL t = \str sc cont -> t (filter (/= '\r') str) : cont sc
+-- ^ We only want LF line endings in our internal doc string format, so we
+-- filter out all CRs.
+
+begin :: StartCode -> Action
+begin sc = \_ _ cont -> cont sc
+
+-- -----------------------------------------------------------------------------
+-- Lex a string as a Haskell identifier
+
+ident :: Action
+ident str sc cont = 
+  case strToHsQNames id of
+	Just names -> TokIdent names : cont sc
+	Nothing -> TokString str : cont sc
+ where id = init (tail str)
+
+strToHsQNames :: String -> Maybe [RdrName]
+strToHsQNames str0 = 
+  let buffer = unsafePerformIO (stringToStringBuffer str0)
+      pstate = mkPState buffer noSrcLoc defaultDynFlags
+      result = unP parseIdentifier pstate 
+  in case result of 
+       POk _ name -> Just [unLoc name] 
+       _ -> Nothing
+}
author	Isaac Dupree <id@isaac.cedarswampstudios.org>	2009-08-23 03:08:03 +0000
committer	Isaac Dupree <id@isaac.cedarswampstudios.org>	2009-08-23 03:08:03 +0000
commit	70945271f1f4deab363c1acfef3ce51a1b7f617d (patch)
tree	81d518f2ca4dde1a1c72072b8a67290de2952388 /src/Haddock/Interface/Lex.x
parent	9dc98d20af5bbcb8bff7624b3d8c4d840ed6bb4e (diff)