From ff5c7d6ddfabadf0afe24e33a5487d630fbbd406 Mon Sep 17 00:00:00 2001 From: simonmar Date: Fri, 15 Aug 2003 14:42:59 +0000 Subject: [haddock @ 2003-08-15 14:42:59 by simonmar] Convert the lexer to Alex, and fix a bug in the process. --- src/HaddockLex.hs | 155 ----------------------------------------------------- src/HaddockLex.x | 137 ++++++++++++++++++++++++++++++++++++++++++++++ src/HaddockParse.y | 6 +-- 3 files changed, 140 insertions(+), 158 deletions(-) delete mode 100644 src/HaddockLex.hs create mode 100644 src/HaddockLex.x diff --git a/src/HaddockLex.hs b/src/HaddockLex.hs deleted file mode 100644 index b1de971e..00000000 --- a/src/HaddockLex.hs +++ /dev/null @@ -1,155 +0,0 @@ --- --- Haddock - A Haskell Documentation Tool --- --- (c) Simon Marlow 2002 --- - -module HaddockLex ( - Token(..), - tokenise - ) where - -import Char -import HsSyn -import HsLexer hiding (Token) -import HsParseMonad - -data Token - = TokPara - | TokNumber - | TokBullet - | TokSpecial Char - | TokIdent [HsQName] - | TokString String - | TokURL String - | TokBirdTrack - | TokAName String - deriving Show - -isSpecial, isSingleQuote, isIdent :: Char -> Bool -isSpecial c = c `elem` ['\"', '@', '/', '#'] -isSingleQuote c = c `elem` ['\'', '`'] -isIdent c = isAlphaNum c || c `elem` "_.!#$%&*+/<=>?@\\^|-~" - --- simple finite-state machine for tokenising the doc string - --- ---------------------------------------------------------------------------- --- At the beginning of a paragraph, we throw away initial whitespace - -tokenise :: String -> [Token] -tokenise "" = [] -tokenise str = case str of - '<':cs -> tokenise_url cs - '\n':cs -> tokenise_newline cs - c:cs | isSingleQuote c -> tokenise_identifier c cs - | isSpecial c -> TokSpecial c : tokenise1 cs - _other -> tokenise_string "" str - -tokenise_newline :: String -> [Token] -tokenise_newline cs0 = - case dropWhile nonNewlineSpace cs0 of - '\n':cs -> TokPara : tokenise_para cs -- paragraph break - '>':cs -> TokBirdTrack : tokenise_birdtrack cs -- bird track - _ -> tokenise_string "" cs0 - -tokenise_para :: String -> [Token] -tokenise_para cs0 = - case dropWhile nonNewlineSpace cs0 of - -- bullet: '*' - '*':cs -> TokBullet : tokenise cs - -- bullet: '-' - '-':cs -> TokBullet : tokenise cs - -- enumerated item: '1.' - '>':cs -> TokBirdTrack : tokenise_birdtrack cs - -- bird track - str | (ds,'.':cs) <- span isDigit str, not (null ds) - -> TokNumber : tokenise cs - -- enumerated item: '(1)' - '(':cs | (ds,')':cs') <- span isDigit cs, not (null ds) - -> TokNumber : tokenise cs' - _ -> tokenise cs0 - -nonNewlineSpace :: Char -> Bool -nonNewlineSpace c = isSpace c && c /= '\n' - --- ---------------------------------------------------------------------------- --- Within a paragraph, we don't throw away any whitespace (except before a --- birdtrack, and before a paragraph break). - -tokenise1 :: String -> [Token] -tokenise1 str = tokenise_string "" str - --- found a single quote, check whether we have an identifier... -tokenise_identifier :: Char -> String -> [Token] -tokenise_identifier q cs0 = - let (ident,rest) = break (not.isIdent) cs0 in - case (rest, strToHsQNames ident) of - (c:cs, Just names) | isSingleQuote c -> TokIdent names : tokenise1 cs - _ -> tokenise_string [q] cs0 - -tokenise_url :: String -> [Token] -tokenise_url cs = - let (url,rest0) = break (=='>') cs in - TokURL url : case rest0 of - '>':rest -> tokenise1 rest - _ -> tokenise1 rest0 - -tokenise_string :: String -> String -> [Token] -tokenise_string str cs0 = - case cs0 of - [] -> tokString str [] - '\\':c:cs -> tokenise_string (c:str) cs - '\n':cs -> tokenise_string_newline str cs - '<':cs -> tokString str (tokenise_url cs) - c:cs | isSpecial c -> tokString str (TokSpecial c : tokenise1 cs) - | isSingleQuote c -> tokString str (tokenise_identifier c cs) - | otherwise -> tokenise_string (c:str) cs - -tokenise_string_newline :: String -> String -> [Token] -tokenise_string_newline str cs0 = - case dropWhile nonNewlineSpace cs0 of - '\n':cs -> tokString str (TokPara : tokenise_para cs) - -- paragraph break: throw away all whitespace - '>':cs -> tokString ('\n':str) (TokBirdTrack : tokenise_birdtrack cs) - -- keep the \n, but throw away any space before the '>' - _ -> tokenise_string ('\n':str) cs0 - -- don't throw away whitespace at all - -tokString :: String -> [Token] -> [Token] -tokString [] rest = rest -tokString cs rest = TokString (reverse cs) : rest - --- A bird-tracked line is verbatim, no markup characters are interpreted -tokenise_birdtrack :: String -> [Token] -tokenise_birdtrack cs = - let (line, rest) = break (=='\n') cs in - TokString line : tokenise1 rest - --- ----------------------------------------------------------------------------- --- Lex a string as a Haskell identifier - -strToHsQNames :: String -> Maybe [HsQName] -strToHsQNames str0 - = case lexer (\t -> returnP t) str0 (SrcLoc 1 1) 1 1 [] of - Ok _ (VarId str) - -> Just [ UnQual (HsVarName (HsIdent str)) ] - Ok _ (QVarId (mod0,str)) - -> Just [ Qual (Module mod0) (HsVarName (HsIdent str)) ] - Ok _ (ConId str) - -> Just [ UnQual (HsTyClsName (HsIdent str)), - UnQual (HsVarName (HsIdent str)) ] - Ok _ (QConId (mod0,str)) - -> Just [ Qual (Module mod0) (HsTyClsName (HsIdent str)), - Qual (Module mod0) (HsVarName (HsIdent str)) ] - Ok _ (VarSym str) - -> Just [ UnQual (HsVarName (HsSymbol str)) ] - Ok _ (ConSym str) - -> Just [ UnQual (HsTyClsName (HsSymbol str)), - UnQual (HsVarName (HsSymbol str)) ] - Ok _ (QVarSym (mod0,str)) - -> Just [ Qual (Module mod0) (HsVarName (HsSymbol str)) ] - Ok _ (QConSym (mod0,str)) - -> Just [ Qual (Module mod0) (HsTyClsName (HsSymbol str)), - Qual (Module mod0) (HsVarName (HsSymbol str)) ] - _other - -> Nothing diff --git a/src/HaddockLex.x b/src/HaddockLex.x new file mode 100644 index 00000000..d888aeff --- /dev/null +++ b/src/HaddockLex.x @@ -0,0 +1,137 @@ +-- +-- Haddock - A Haskell Documentation Tool +-- +-- (c) Simon Marlow 2002 +-- + +{ +module HaddockLex ( + Token(..), + tokenise + ) where + +import Char +import HsSyn +import HsLexer hiding (Token) +import HsParseMonad +import Debug.Trace +} + +$ws = $white # \n +$digit = [0-9] +$special = [\"\@\/\#] +$alphanum = [A-Za-z0-9] +$ident = [$alphanum \_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~] + +:- + +-- beginning of a paragraph +<0,para> { + $ws* \n ; + $ws* \> { begin birdtrack } + $ws* [\*\-] { token TokBullet } + $ws* \( $digit+ \) { token TokNumber } + $ws* { begin string } +} + +-- beginning of a line: don't ignore whitespace, unless this is the start +-- of a new paragraph. + { + $ws* \> { begin birdtrack } + $ws* \n { token TokPara `andBegin` para } + () { begin string } +} + + .* \n? { strtoken TokBirdTrack `andBegin` line } + + { + $special { strtoken $ \s -> TokSpecial (head s) } + \<.*\> { strtoken $ \s -> TokURL (init (tail s)) } + [\'\`] $ident+ [\'\`] { ident } + \\ . { strtoken (TokString . tail) } + [^ $special \< \n \'\` \\]* \n { strtoken TokString `andBegin` line } + [^ $special \< \n \'\` \\]+ { strtoken TokString } +} + +{ +data Token + = TokPara + | TokNumber + | TokBullet + | TokSpecial Char + | TokIdent [HsQName] + | TokString String + | TokURL String + | TokBirdTrack String + deriving Show + +-- ----------------------------------------------------------------------------- +-- Alex support stuff + +type StartCode = Int +type Action = String -> StartCode -> (StartCode -> [Token]) -> [Token] + +type AlexInput = (Char,String) + +alexGetChar (_, []) = Nothing +alexGetChar (_, c:cs) = Just (c, (c,cs)) + +alexInputPrevChar (c,_) = c + +tokenise :: String -> [Token] +tokenise str = let toks = go ('\n',str) para in trace (show toks) toks + where go inp@(_,str) sc = + case alexScan inp sc of + AlexEOF -> [] + AlexError _ -> error "lexical error" + AlexSkip inp' len -> go inp' sc + AlexToken inp' len act -> act (take len str) sc (\sc -> go inp' sc) + +andBegin :: Action -> StartCode -> Action +andBegin act new_sc = \str sc cont -> act str new_sc cont + +token :: Token -> Action +token t = \str sc cont -> t : cont sc + +strtoken :: (String -> Token) -> Action +strtoken t = \str sc cont -> t str : cont sc + +begin :: StartCode -> Action +begin sc = \str _ cont -> cont sc + +-- ----------------------------------------------------------------------------- +-- Lex a string as a Haskell identifier + +ident :: Action +ident str sc cont = + case strToHsQNames id of + Just names -> TokIdent names : cont sc + Nothing -> TokString str : cont sc + where id = init (tail str) + +strToHsQNames :: String -> Maybe [HsQName] +strToHsQNames str0 + = case lexer (\t -> returnP t) str0 (SrcLoc 1 1) 1 1 [] of + Ok _ (VarId str) + -> Just [ UnQual (HsVarName (HsIdent str)) ] + Ok _ (QVarId (mod0,str)) + -> Just [ Qual (Module mod0) (HsVarName (HsIdent str)) ] + Ok _ (ConId str) + -> Just [ UnQual (HsTyClsName (HsIdent str)), + UnQual (HsVarName (HsIdent str)) ] + Ok _ (QConId (mod0,str)) + -> Just [ Qual (Module mod0) (HsTyClsName (HsIdent str)), + Qual (Module mod0) (HsVarName (HsIdent str)) ] + Ok _ (VarSym str) + -> Just [ UnQual (HsVarName (HsSymbol str)) ] + Ok _ (ConSym str) + -> Just [ UnQual (HsTyClsName (HsSymbol str)), + UnQual (HsVarName (HsSymbol str)) ] + Ok _ (QVarSym (mod0,str)) + -> Just [ Qual (Module mod0) (HsVarName (HsSymbol str)) ] + Ok _ (QConSym (mod0,str)) + -> Just [ Qual (Module mod0) (HsTyClsName (HsSymbol str)), + Qual (Module mod0) (HsVarName (HsSymbol str)) ] + _other + -> Nothing +} diff --git a/src/HaddockParse.y b/src/HaddockParse.y index 15eda968..db712c42 100644 --- a/src/HaddockParse.y +++ b/src/HaddockParse.y @@ -14,7 +14,7 @@ import HsSyn URL { TokURL $$ } '*' { TokBullet } '(n)' { TokNumber } - '>' { TokBirdTrack } + '>..' { TokBirdTrack $$ } IDENT { TokIdent $$ } PARA { TokPara } STRING { TokString $$ } @@ -48,8 +48,8 @@ para :: { Doc } | codepara { DocCodeBlock $1 } codepara :: { Doc } - : '>' seq codepara { docAppend $2 $3 } - | '>' seq { $2 } + : '>..' codepara { docAppend (DocString $1) $2 } + | '>..' { DocString $1 } seq :: { Doc } : elem seq { docAppend $1 $2 } -- cgit v1.2.3