diff options
Diffstat (limited to 'src/HsLexer.lhs')
-rw-r--r-- | src/HsLexer.lhs | 577 |
1 files changed, 577 insertions, 0 deletions
diff --git a/src/HsLexer.lhs b/src/HsLexer.lhs new file mode 100644 index 00000000..767ffc9c --- /dev/null +++ b/src/HsLexer.lhs @@ -0,0 +1,577 @@ +----------------------------------------------------------------------------- +-- $Id: HsLexer.lhs,v 1.1 2002/04/04 16:23:43 simonmar Exp $ +-- +-- (c) The GHC Team, 1997-2000 +-- +-- Lexer for Haskell. +-- +----------------------------------------------------------------------------- + +ToDo: Parsing floats is a *real* hack... +ToDo: Introduce different tokens for decimal, octal and hexadecimal (?) +ToDo: FloatTok should have three parts (integer part, fraction, exponent) +ToDo: Use a lexical analyser generator (lx?) + +\begin{code} +module HsLexer (Token(..), lexer, parseError,isSymbol) where + +import HsParseMonad +import HsParseUtils +import HsSyn(SrcLoc(..)) + +import Numeric ( readHex, readOct ) +import Char +\end{code} + +\begin{code} +data Token + = VarId String + | QVarId (String,String) + | ConId String + | QConId (String,String) + | VarSym String + | ConSym String + | QVarSym (String,String) + | QConSym (String,String) + +-- Literals + + | IntTok Integer + | FloatTok String + | Character Char + | StringTok String + | PrimChar Char -- GHC extension + | PrimInt Integer -- GHC extension + | PrimString String -- GHC extension + | PrimFloat String -- GHC extension + | PrimDouble String -- GHC extension + +-- Symbols + + | LeftParen + | RightParen + | SemiColon + | LeftCurly + | RightCurly + | VRightCurly -- a virtual close brace + | LeftSquare + | RightSquare + | Comma + | Underscore + | BackQuote + | LeftUT -- GHC Extension: (# + | RightUT -- GHC Extension: #) + +-- Documentation annotations + + | DocCommentNext String -- something beginning '-- |' + | DocCommentPrev String -- something beginning '-- ^' + | DocCommentNamed String -- something beginning '-- @' + | DocSection Int String -- a section heading + +-- Reserved operators + + | Dot -- GHC extension + | DotDot + | DoubleColon + | Equals + | Backslash + | Bar + | LeftArrow + | RightArrow + | At + | Tilde + | DoubleArrow + | Minus + | Exclamation + +-- Reserved Ids + + | KW_As + | KW_Case + | KW_CCall + | KW_Class + | KW_Data + | KW_Default + | KW_Deriving + | KW_Do + | KW_DotNet + | KW_Else + | KW_Export + | KW_Forall + | KW_Foreign + | KW_Hiding + | KW_If + | KW_Import + | KW_In + | KW_Infix + | KW_InfixL + | KW_InfixR + | KW_Instance + | KW_Let + | KW_Module + | KW_NewType + | KW_Of + | KW_Safe + | KW_StdCall + | KW_Then + | KW_ThreadSafe + | KW_Type + | KW_Unsafe + | KW_Where + | KW_Qualified + + | EOF + deriving (Eq,Show) + +reserved_ops :: [(String,Token)] +reserved_ops = [ + ( ".", Dot ), -- GHC extension + ( "..", DotDot ), + ( "::", DoubleColon ), + ( "=", Equals ), + ( "\\", Backslash ), + ( "|", Bar ), + ( "<-", LeftArrow ), + ( "->", RightArrow ), + ( "@", At ), + ( "~", Tilde ), + ( "=>", DoubleArrow ), + ( "-", Minus ), --ToDo: shouldn't be here + ( "!", Exclamation ) --ditto + ] + +reserved_ids :: [(String,Token)] +reserved_ids = [ + ( "_", Underscore ), + ( "case", KW_Case ), + ( "ccall", KW_CCall ), + ( "class", KW_Class ), + ( "data", KW_Data ), + ( "default", KW_Default ), + ( "deriving", KW_Deriving ), + ( "do", KW_Do ), + ( "dotnet", KW_DotNet ), + ( "else", KW_Else ), + ( "export", KW_Export ), + ( "forall", KW_Forall ), + ( "foreign", KW_Foreign ), + ( "if", KW_If ), + ( "import", KW_Import ), + ( "in", KW_In ), + ( "infix", KW_Infix ), + ( "infixl", KW_InfixL ), + ( "infixr", KW_InfixR ), + ( "instance", KW_Instance ), + ( "let", KW_Let ), + ( "module", KW_Module ), + ( "newtype", KW_NewType ), + ( "of", KW_Of ), + ( "safe", KW_Safe ), + ( "then", KW_Then ), + ( "threadsafe",KW_ThreadSafe ), + ( "type", KW_Type ), + ( "unsafe", KW_Unsafe ), + ( "where", KW_Where ), + ( "as", KW_As ), + ( "qualified", KW_Qualified ), + ( "hiding", KW_Hiding ) + ] + +isIdent c = isAlpha c || isDigit c || c == '\'' || c == '_' +isSymbol c = elem c ":!#$%&*+./<=>?@\\^|-~" +isWhite c = elem c " \n\r\t\v\f" + +tAB_LENGTH = 8 :: Int + +-- The source location, (y,x), is the coordinates of the previous token. +-- col is the current column in the source file. If col is 0, we are +-- somewhere at the beginning of the line before the first token. + +-- Setting col to 0 is used in two places: just after emitting a virtual +-- close brace due to layout, so that next time through we check whether +-- we also need to emit a semi-colon, and at the beginning of the file, +-- to kick off the lexer. + + +lexer :: (Token -> P a) -> P a +lexer cont input (SrcLoc _ x) y col = + if col == 0 + then tab y x True input + else tab y col False input -- throw away old x + where + -- move past whitespace and comments + tab y x bol [] = + cont EOF [] (SrcLoc y x) col y + tab y x bol ('\t':s) = + tab y (nextTab x) bol s + tab y x bol ('\n':s) = + newLine cont s y + tab y x bol ('-':'-':s) | not (doc s) = + newLine cont (drop 1 (dropWhile (/= '\n') s)) y + tab y x bol ('{':'-':s) = nestedComment tab y x bol s + tab y x bol (c:s) + | isWhite c = tab y (x+1) bol s + | otherwise = + if bol then lexBOL cont (c:s) (SrcLoc y x) y x + else lexToken cont (c:s) (SrcLoc y x) y x + + newLine cont s y = tab (y+1) 1 True s + + doc (' ':'|':_) = True + doc (' ':'^':_) = True + doc (' ':'*':_) = True + doc _ = False + +nextTab x = x + (tAB_LENGTH - (x-1) `mod` tAB_LENGTH) + +-- When we are lexing the first token of a line, check whether we need to +-- insert virtual semicolons or close braces due to layout. + +lexBOL :: (Token -> P a) -> P a +lexBOL cont s loc y x context = + if need_close_curly then + -- trace "layout: inserting '}'\n" $ + -- Set col to 0, indicating that we're still at the + -- beginning of the line, in case we need a semi-colon too. + -- Also pop the context here, so that we don't insert + -- another close brace before the parser can pop it. + cont VRightCurly s loc y 0 (tail context) + else if need_semi_colon then + --trace "layout: inserting ';'\n" $ + cont SemiColon s loc y x context + else + lexToken cont s loc y x context + where + need_close_curly = + case context of + [] -> False + (i:_) -> case i of + NoLayout -> False + Layout n -> x < n + need_semi_colon = + case context of + [] -> False + (i:_) -> case i of + NoLayout -> False + Layout n -> x == n + +lexToken :: (Token -> P a) -> P a +lexToken cont s loc y x = + -- trace ("lexer: y="++show y++" x="++show x++"\n") $ + case s of + -- First the special symbols + '(':'#':s -> forward 2 LeftUT s + '(':s -> forward 1 LeftParen s + '#':')':s -> forward 2 RightUT s + ')':s -> forward 1 RightParen s + ',':s -> forward 1 Comma s + ';':s -> forward 1 SemiColon s + '[':s -> forward 1 LeftSquare s + ']':s -> forward 1 RightSquare s + '`':s -> forward 1 BackQuote s + '{':s -> \ctxt -> forward 1 LeftCurly s (NoLayout : ctxt) + '}':s -> \ctxt -> case ctxt of + (_:ctxt) -> forward 1 RightCurly s ctxt + -- pop context on '}' + [] -> error "Internal error: empty context in lexToken" + + '-':'-':' ':'|':s -> docComment DocCommentNext cont s loc y x + '-':'-':' ':'^':s -> docComment DocCommentPrev cont s loc y x + '-':'-':' ':'*':s -> docSection cont ('*':s) loc y x + + '\'':s -> lexChar cont s loc y (x+1) + '\"':s{-"-} -> lexString cont s loc y (x+1) + + '0':'x':c:s | isHexDigit c -> + let (num, rest) = span isHexDigit (c:s) + [(i,_)] = readHex num + in + afterNum cont i rest loc y (x+length num) + '0':'o':c:s | isOctDigit c -> + let (num, rest) = span isOctDigit (c:s) + [(i,_)] = readOct num + in + afterNum cont i rest loc y (x+length num) + + c:s | isLower c || c == '_' -> + let + (idtail, rest) = slurpIdent s + id = c:idtail + l_id = 1 + length idtail + in + case lookup id reserved_ids of + Just keyword -> forward l_id keyword rest + Nothing -> forward l_id (VarId id) rest + + | isUpper c -> lexCon "" cont (c:s) loc y x + | isSymbol c -> + let + (symtail, rest) = span isSymbol s + sym = c : symtail + l_sym = 1 + length symtail + in + case lookup sym reserved_ops of + Just t -> forward l_sym t rest + Nothing -> case c of + ':' -> forward l_sym (ConSym sym) rest + _ -> forward l_sym (VarSym sym) rest + + | isDigit c -> lexNum cont c s loc y x + + | otherwise -> + parseError ("illegal character \'" ++ show c ++ "\'\n") + s loc y x + + where forward n t s = cont t s loc y (x+n) + +lexToken _ _ _ _ _ = error "Internal error: empty input in lexToken" + +afterNum cont i ('#':s) loc y x = cont (PrimInt i) s loc y (x+1) +afterNum cont i s loc y x = cont (IntTok i) s loc y x + +lexNum cont c s loc y x = + let (num, after_num) = span isDigit (c:s) + in + case after_num of + '.':c:s | isDigit c -> + let (frac,after_frac) = span isDigit s + in + let float = num ++ '.':frac + (f, after_exp) + = case after_frac of + 'E':s -> do_exponent s + 'e':s -> do_exponent s + _ -> (float, after_frac) + + do_exponent s = + case s of + '-':c:s | isDigit c -> + let (exp,rest) = span isDigit (c:s) in + (float ++ 'e':'-':exp, rest) + '+':c:s | isDigit c -> + let (exp,rest) = span isDigit (c:s) in + (float ++ 'e':'+':exp, rest) + c:s | isDigit c -> + let (exp,rest) = span isDigit (c:s) in + (float ++ 'e':exp, rest) + _ -> (float, after_frac) + + x' = x + length f + + in case after_exp of -- glasgow exts only + '#':'#':s -> cont (PrimDouble f) s loc y x' + '#':s -> cont (PrimFloat f) s loc y x' + s -> cont (FloatTok f) s loc y x' + + _ -> afterNum cont (parseInteger 10 num) after_num loc y (x + length num) + + +-- GHC extension: allow trailing '#'s in an identifier. +slurpIdent s = slurp' s [] + where + slurp' [] i = (reverse i, []) + slurp' (c:cs) i + | isIdent c = slurp' cs (c:i) + | c == '#' = slurphashes cs (c:i) + slurp' cs i = (reverse i, cs) + +slurphashes [] i = (reverse i, []) +slurphashes ('#':cs) i = slurphashes cs ('#':i) +slurphashes s i = (reverse i, s) + + +lexCon qual cont s loc y x = + let + forward n t s = cont t s loc y (x+n) + + (con, rest) = slurpIdent s + l_con = length con + + just_a_conid + | null qual = forward l_con (ConId con) rest + | otherwise = forward l_con (QConId (qual,con)) rest + in + case rest of + '.':c1:s1 + | isLower c1 -> -- qualified varid? + let + (idtail, rest1) = slurpIdent s1 + id = c1:idtail + l_id = 1 + length idtail + in + case lookup id reserved_ids of + -- cannot qualify a reserved word + Just keyword -> just_a_conid + Nothing -> forward (l_con+1+l_id) (QVarId (con, id)) rest1 + + | isUpper c1 -> -- qualified conid? + let qual' | null qual = con + | otherwise = qual ++ '.':con + in + lexCon qual' cont (c1:s1) loc y (x+l_con+1) + + | isSymbol c1 -> -- qualified symbol? + let + (symtail, rest1) = span isSymbol s1 + sym = c1 : symtail + l_sym = 1 + length symtail + in + case lookup sym reserved_ops of + -- cannot qualify a reserved operator + Just _ -> just_a_conid + Nothing -> case c1 of + ':' -> forward (l_con+1+l_sym) + (QConSym (con, sym)) rest1 + _ -> forward (l_con+1+l_sym) + (QVarSym (con, sym)) rest1 + + _ -> just_a_conid -- not a qualified thing + + +lexChar :: (Token -> P a) -> P a +lexChar cont s loc y x = case s of + '\\':s -> (escapeChar s `thenP` \(e,s,i) _ _ _ _ -> + charEnd e s loc y (x+i)) s loc y x + c:s -> charEnd c s loc y (x+1) + [] -> error "Internal error: lexChar" + + where charEnd c ('\'':'#':s) = \loc y x -> cont (PrimChar c) s loc y (x+2) + charEnd c ('\'':s) = \loc y x -> cont (Character c) s loc y (x+1) + charEnd c s = parseError "Improperly terminated character constant" s + +lexString :: (Token -> P a) -> P a +lexString cont s loc y x = loop "" s x y + where + loop e s x y = case s of + '\\':'&':s -> loop e s (x+2) y + '\\':c:s | isSpace c -> stringGap e s (x+2) y + | otherwise -> (escapeChar (c:s) `thenP` \(e',s,i) _ _ _ _ -> + loop (e':e) s (x+i) y) s loc y x + '\"':s{-"-} -> cont (StringTok (reverse e)) s loc y (x+1) + c:s -> loop (c:e) s (x+1) y + [] -> parseError "Improperly terminated string" s loc y x + + stringGap e s x y = case s of + '\n':s -> stringGap e s 1 (y+1) + '\\':s -> loop e s (x+1) y + c:s' | isSpace c -> stringGap e s' (x+1) y + | otherwise -> + parseError "Illegal character in string gap" s loc y x + [] -> error "Internal error: stringGap" + +-- ToDo: \o, \x, \<octal> things. + +escapeChar :: String -> P (Char,String,Int) +escapeChar s = case s of + + 'x':c:s | isHexDigit c -> + let (num,rest) = span isHexDigit (c:s) in + returnP (chr (fromIntegral (parseInteger 16 num)), rest, length num) + + 'o':c:s | isOctDigit c -> + let (num,rest) = span isOctDigit (c:s) in + returnP (chr (fromIntegral (parseInteger 8 num)), rest, length num) + + c:s | isDigit c -> let (num,rest) = span isDigit (c:s) in + returnP (chr (read num), rest, length num) + +-- Production charesc from section B.2 (Note: \& is handled by caller) + + 'a':s -> returnP ('\a',s,2) + 'b':s -> returnP ('\b',s,2) + 'f':s -> returnP ('\f',s,2) + 'n':s -> returnP ('\n',s,2) + 'r':s -> returnP ('\r',s,2) + 't':s -> returnP ('\t',s,2) + 'v':s -> returnP ('\v',s,2) + '\\':s -> returnP ('\\',s,2) + '"':s -> returnP ('\"',s,2) + '\'':s -> returnP ('\'',s,2) + +-- Production ascii from section B.2 + + '^':x@(c:s) -> cntrl x + 'N':'U':'L':s -> returnP ('\NUL',s,4) + 'S':'O':'H':s -> returnP ('\SOH',s,4) + 'S':'T':'X':s -> returnP ('\STX',s,4) + 'E':'T':'X':s -> returnP ('\ETX',s,4) + 'E':'O':'T':s -> returnP ('\EOT',s,4) + 'E':'N':'Q':s -> returnP ('\ENQ',s,4) + 'A':'C':'K':s -> returnP ('\ACK',s,4) + 'B':'E':'L':s -> returnP ('\BEL',s,4) + 'B':'S':s -> returnP ('\BS', s,3) + 'H':'T':s -> returnP ('\HT', s,3) + 'L':'F':s -> returnP ('\LF', s,3) + 'V':'T':s -> returnP ('\VT', s,3) + 'F':'F':s -> returnP ('\FF', s,3) + 'C':'R':s -> returnP ('\CR', s,3) + 'S':'O':s -> returnP ('\SO', s,3) + 'S':'I':s -> returnP ('\SI', s,3) + 'D':'L':'E':s -> returnP ('\DLE',s,4) + 'D':'C':'1':s -> returnP ('\DC1',s,4) + 'D':'C':'2':s -> returnP ('\DC2',s,4) + 'D':'C':'3':s -> returnP ('\DC3',s,4) + 'D':'C':'4':s -> returnP ('\DC4',s,4) + 'N':'A':'K':s -> returnP ('\NAK',s,4) + 'S':'Y':'N':s -> returnP ('\SYN',s,4) + 'E':'T':'B':s -> returnP ('\ETB',s,4) + 'C':'A':'N':s -> returnP ('\CAN',s,4) + 'E':'M':s -> returnP ('\EM', s,3) + 'S':'U':'B':s -> returnP ('\SUB',s,4) + 'E':'S':'C':s -> returnP ('\ESC',s,4) + 'F':'S':s -> returnP ('\FS', s,3) + 'G':'S':s -> returnP ('\GS', s,3) + 'R':'S':s -> returnP ('\RS', s,3) + 'U':'S':s -> returnP ('\US', s,3) + 'S':'P':s -> returnP ('\SP', s,3) + 'D':'E':'L':s -> returnP ('\DEL',s,4) + + _ -> parseError "Illegal escape sequence" + + +-- Stolen from Hugs's Prelude +parseInteger :: Integer -> String -> Integer +parseInteger radix ds = + foldl1 (\n d -> n * radix + d) (map (toInteger . digitToInt) ds) + +-- Production cntrl from section B.2 + +cntrl :: String -> P (Char,String,Int) +cntrl (c:s) | c >= '@' && c <= '_' = returnP (chr (ord c - ord '@'), s,2) +cntrl _ = parseError "Illegal control character" + +nestedComment cont y x bol s = + case s of + '-':'}':s -> cont y (x+2) bol s + '{':'-':s -> nestedComment (nestedComment cont) y (x+2) bol s + '\t':s -> nestedComment cont y (nextTab x) bol s + '\n':s -> nestedComment cont (y+1) 1 True s + c:s -> nestedComment cont y (x+1) bol s + [] -> error "Internal error: nestedComment" + + +docComment f cont s loc y x + = let (s', comment, y') = slurpExtraCommentLines s [] y in + cont (f comment) s' loc y' x -- continue with the newline char + +slurpExtraCommentLines s lines y + = case rest of + '\n':nextline -> + case dropWhile nonNewlineSpace nextline of + '-':'-':s -> slurpExtraCommentLines s + ((line++"\n"):lines) (y+1) + _ -> (rest, finished, y) + other -> (rest, finished, y) + where + (line, rest) = break (== '\n') s + finished = concat (reverse (line:lines)) + +nonNewlineSpace c = isSpace c && c /= '\n' + +docSection cont s loc y x + = let (stars, rest') = break (/= '*') s + (line, rest) = break (== '\n') rest' + in + cont (DocSection (length stars) line) rest loc y x +\end{code} |