module Haddock.Backends.Hyperlinker.Parser
    ( parse
    , Token(..), TokenType(..)
    , Position(..), Span(..)
    ) where

import Data.Char
import Data.List
import Data.Maybe

data Token = Token
    { tkType :: TokenType
    , tkValue :: String
    , tkSpan :: Span
    }

data Position = Position
    { posRow :: !Int
    , posCol :: !Int
    }

data Span = Span
    { spStart :: Position
    , spEnd :: Position
    }

data TokenType
    = TkIdentifier
    | TkKeyword
    | TkString
    | TkChar
    | TkNumber
    | TkOperator
    | TkGlyph
    | TkSpecial
    | TkSpace
    | TkComment
    | TkCpp
    | TkPragma
    | TkUnknown
    deriving (Show, Eq)

-- | Turn a source code string into a stream of more descriptive tokens.
--
-- The result should retain the original file layout (including comments,
-- whitespace, etc.), i.e. the following "law" should hold:
--
-- @concat . map 'tkValue' . 'parse' = id@
parse :: String -> [Token]
parse = tokenize . tag . chunk
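
-- For example (a hand-traced illustration, not a checked doctest):
--
-- > map tkValue (parse "x = 1 -- note")
-- >     == ["x", " ", "=", " ", "1", " ", "-- note"]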

-- | Split a raw source string into more meaningful chunks.
--
-- This is the initial stage of the tokenization process. Each chunk is either
-- a comment (including its delimiters), a whitespace string, a preprocessor
-- macro (with all of its content up to the end of the line) or a valid
-- Haskell lexeme.
chunk :: String -> [String]
chunk [] = []
chunk str@(c:_)
    | isSpace c =
        let (space, mcpp, rest) = spanSpaceOrCpp str
        in [space] ++ maybeToList mcpp ++ chunk rest
chunk str
    | "--" `isPrefixOf` str = chunk' $ spanToNewline str
    | "{-" `isPrefixOf` str = chunk' $ chunkComment 0 str
    | otherwise = case lex str of
        (tok:_) -> chunk' tok
        [] -> [str]
  where
    chunk' (c, rest) = c : chunk rest
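
-- For instance (hand-traced, relying on the standard behaviour of 'lex'):
--
-- > chunk "succ 1 {- inc -}" == ["succ", " ", "1", " ", "{- inc -}"]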

-- | Split input to "first line" string and the rest of it.
--
-- Ideally, this should be done simply with @'break' (== '\n')@. However,
-- Haskell also allows line-unbreaking (or whatever it is called) so things
-- are not as simple and this function deals with that.
spanToNewline :: String -> (String, String)
spanToNewline [] = ([], [])
spanToNewline ('\\':'\n':str) =
    let (str', rest) = spanToNewline str
    in ('\\':'\n':str', rest)
spanToNewline str@('\n':_) = ("", str)
spanToNewline (c:str) =
    let (str', rest) = spanToNewline str
    in (c:str', rest)
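
-- For example, a backslash-newline pair stays within the "first line"
-- (hand-traced):
--
-- > spanToNewline "#define N \\\n  1\nx" == ("#define N \\\n  1", "\nx")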

-- | Split the input into a whitespace string, an optional preprocessor
-- directive and the rest.
--
-- Again, something like @'span' 'isSpace'@ would be nice for chunking the
-- whitespace. The problem is the /#/ symbol: if it is placed at the very
-- beginning of a line, it should be recognized as a preprocessor macro. In
-- any other position it is an ordinary Haskell symbol that may appear in
-- operator names. Hence, while consuming whitespace we also check whether
-- a /#/ symbol occurs just after a newline character; if that is the case,
-- we treat the whole line as a preprocessor macro.
spanSpaceOrCpp :: String -> (String, Maybe String, String)
spanSpaceOrCpp ('\n':'#':str) =
    let (str', rest) = spanToNewline str
    in ("\n", Just $ '#':str', rest)
spanSpaceOrCpp (c:str')
    | isSpace c =
        let (space, mcpp, rest) = spanSpaceOrCpp str'
        in (c:space, mcpp, rest)
spanSpaceOrCpp str = ("", Nothing, str)

-- | Split the input into comment content (including delimiters) and the rest.
--
-- Again, more logic than a simple 'span' is required because Haskell allows
-- comments to nest.
chunkComment :: Int -> String -> (String, String)
chunkComment _ [] = ("", "")
chunkComment depth ('{':'-':str) =
    let (c, rest) = chunkComment (depth + 1) str
    in ("{-" ++ c, rest)
chunkComment depth ('-':'}':str)
    | depth == 1 = ("-}", str)
    | otherwise =
        let (c, rest) = chunkComment (depth - 1) str
        in ("-}" ++ c, rest)
chunkComment depth (e:str) =
    let (c, rest) = chunkComment depth str
    in (e:c, rest)
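
-- For example, a nested comment is consumed as a single chunk (hand-traced):
--
-- > chunkComment 0 "{- a {- b -} c -} rest" == ("{- a {- b -} c -}", " rest")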

-- | Assign a source span to each chunk in the given stream.
tag :: [String] -> [(Span, String)]
tag =
    reverse . snd . foldl aux (Position 1 1, [])
  where
    aux (pos, cs) str =
        let pos' = foldl move pos str
        in (pos', (Span pos pos', str):cs)
    move pos '\n' = pos { posRow = posRow pos + 1, posCol = 1 }
    move pos _ = pos { posCol = posCol pos + 1 }
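
-- For instance, the start positions advance through rows and columns as
-- expected (hand-traced; note that 'spEnd' points just past the chunk):
--
-- > [ (posRow p, posCol p, str)
-- > | (Span p _, str) <- tag ["x", "\n", "y"] ]
-- >     == [(1, 1, "x"), (1, 2, "\n"), (2, 1, "y")]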

-- | Turn the stream of tagged, still unclassified chunks into a more
-- descriptive token stream.
tokenize :: [(Span, String)] -> [Token]
tokenize =
    map aux
  where
    aux (sp, str) = Token
        { tkType = classify str
        , tkValue = str
        , tkSpan = sp
        }

-- | Classify the given string as an appropriate Haskell token.
--
-- This function is based on the lexical structure description in the
-- Haskell 98 Report:
-- https://www.haskell.org/onlinereport/lexemes.html
--
-- However, it is probably far from perfect and most likely does not handle
-- all corner cases correctly.
classify :: String -> TokenType
classify str
    | "--" `isPrefixOf` str = TkComment
    | "{-#" `isPrefixOf` str = TkPragma
    | "{-" `isPrefixOf` str = TkComment
classify (c:_)
    | isSpace c = TkSpace
    | isDigit c = TkNumber
    | c `elem` special = TkSpecial
    | c == '#' = TkCpp
    | c == '"' = TkString
    | c == '\'' = TkChar
classify str
    | str `elem` keywords = TkKeyword
    | str `elem` glyphs = TkGlyph
    | all (`elem` symbols) str = TkOperator
    | isIdentifier str = TkIdentifier
    | otherwise = TkUnknown
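
-- A few hand-traced examples:
--
-- > classify "where" == TkKeyword
-- > classify "->"    == TkGlyph
-- > classify ">>="   == TkOperator
-- > classify "foo'"  == TkIdentifier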

keywords :: [String]
keywords =
    [ "as"
    , "case"
    , "class"
    , "data"
    , "default"
    , "deriving"
    , "do"
    , "else"
    , "hiding"
    , "if"
    , "import"
    , "in"
    , "infix"
    , "infixl"
    , "infixr"
    , "instance"
    , "let"
    , "module"
    , "newtype"
    , "of"
    , "qualified"
    , "then"
    , "type"
    , "where"
    , "forall"
    , "mdo"
    ]

glyphs :: [String]
glyphs =
    [ ".."
    , ":"
    , "::"
    , "="
    , "\\"
    , "|"
    , "<-"
    , "->"
    , "@"
    , "~"
    , "~#"
    , "=>"
    , "-"
    , "!"
    ]

special :: [Char]
special = "()[]{},;`"

-- TODO: Add support for any Unicode symbol or punctuation.
-- source: http://stackoverflow.com/questions/10548170/what-characters-are-permitted-for-haskell-operators
symbols :: [Char]
symbols = "!#$%&*+./<=>?@\\^|-~:"

isIdentifier :: String -> Bool
isIdentifier (s:str)
    | (isLower' s || isUpper s) && all isAlphaNum' str = True
  where
    isLower' c = isLower c || c == '_'
    isAlphaNum' c = isAlphaNum c || c == '_' || c == '\''
isIdentifier _ = False
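
-- For example (hand-traced; note that underscores and primes are accepted):
--
-- > isIdentifier "_foo'" == True
-- > isIdentifier "2x"    == False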