From 6fb8d5abbcc92f5155fdc9596ca1c87fe87f6187 Mon Sep 17 00:00:00 2001 From: Ɓukasz Hanuszczak Date: Thu, 4 Jun 2015 23:21:17 +0200 Subject: Create basic token classification method. --- .../src/Haddock/Backends/Hyperlinker/Parser.hs | 104 +++++++++++++++++++-- 1 file changed, 98 insertions(+), 6 deletions(-) (limited to 'haddock-api') diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs index 4e0d7382..be6b7ce5 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs @@ -20,11 +20,18 @@ data Span = Span } data TokenType - = Identifier - | Comment - | Whitespace - | Operator - | Symbol + = TkIdentifier + | TkKeyword + | TkString + | TkChar + | TkNumber + | TkOperator + | TkGlyph + | TkSpecial + | TkSpace + | TkComment + | TkCpp + | TkUnknown parse :: String -> [Token] parse = tokenize . tag . chunk @@ -66,4 +73,89 @@ tag = in (pos', (Span pos pos', c):cs) tokenize :: [(Span, String)] -> [Token] -tokenize = undefined +tokenize = + map aux + where + aux (sp, str) = Token + { tkType = classify str + , tkValue = str + , tkSpan = sp + } + +classify :: String -> TokenType +classify (c:_) + | isSpace c = TkSpace + | isDigit c = TkNumber + | c `elem` special = TkSpecial + | c == '#' = TkCpp + | c == '"' = TkString + | c == '\'' = TkChar +classify str + | str `elem` keywords = TkKeyword + | str `elem` glyphs = TkGlyph + | all (`elem` symbols) str = TkOperator + | "--" `isPrefixOf` str = TkComment + | "{-" `isPrefixOf` str = TkComment + | isIdentifier str = TkIdentifier + | otherwise = TkUnknown + +keywords :: [String] +keywords = + [ "as" + , "case" + , "class" + , "data" + , "default" + , "deriving" + , "do" + , "else" + , "hiding" + , "if" + , "import" + , "in" + , "infix" + , "infixl" + , "infixr" + , "instance" + , "let" + , "module" + , "newtype" + , "of" + , "qualified" + , "then" + , "type" + , "where" + , "forall" + , "mdo" + ] + +glyphs :: [String] +glyphs = + [ ".." + , ":" + , "::" + , "=" + , "\\" + , "|" + , "<-" + , "->" + , "@" + , "~" + , "~#" + , "=>" + , "-" + , "!" + ] + +special :: [Char] +special = "()[]{},;`" + +-- TODO: Add support for any Unicode symbol or punctuation. +-- source: http://stackoverflow.com/questions/10548170/what-characters-are-permitted-for-haskell-operators +symbols :: [Char] +symbols = "!#$%&*+./<=>?@\\^|-~:" + +isIdentifier :: String -> Bool +isIdentifier (c:str) + | isLetter c = all (\c' -> isAlphaNum c' || c == '\'') str +isIdentifier _ = False -- cgit v1.2.3