aboutsummaryrefslogtreecommitdiff
path: root/src/Haddock/Lex.x
blob: 9e59fa4cc3b4bb4d2afc5ce6215851020477127e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
--
-- Haddock - A Haskell Documentation Tool
--
-- (c) Simon Marlow 2002
--
-- This file was modified and integrated into GHC by David Waern 2006.
-- Then moved back into Haddock by Isaac Dupree in 2009 :-)
--

{
{-# LANGUAGE BangPatterns #-}   -- Generated by Alex
{-# OPTIONS -Wwarn -w #-}
-- The above warning supression flag is a temporary kludge.
-- While working on this module you are encouraged to remove it and fix
-- any warnings in the module. See
--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
-- for details

module Haddock.Lex (
	Token(..),
	LToken,
	tokenise
 ) where

import Lexer hiding (Token)
import Parser ( parseIdentifier )
import StringBuffer
import RdrName
import SrcLoc
import DynFlags
import FastString

import qualified Data.Bits
import Data.Char
import Data.Word (Word8)
import Numeric
import System.IO.Unsafe
import Debug.Trace
}

$ws    = $white # \n
$digit = [0-9]
$hexdigit = [0-9a-fA-F]
$special =  [\"\@]
$alphanum = [A-Za-z0-9]
$ident    = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:]

:-

-- beginning of a paragraph
<0,para> {
 $ws* \n		;
 $ws* \>		{ begin birdtrack }
 $ws* prop \> .* \n	{ strtoken TokProperty `andBegin` property}
 $ws* \>\>\>            { strtoken TokExamplePrompt `andBegin` exampleexpr }
 $ws* [\*\-]		{ token TokBullet `andBegin` string }
 $ws* \[		{ token TokDefStart `andBegin` def }
 $ws* \( $digit+ \) 	{ token TokNumber `andBegin` string }
 $ws* $digit+ \. 	{ token TokNumber `andBegin` string }
 $ws*			{ begin string }		
}

-- beginning of a line
<line> {
  $ws* \>		{ begin birdtrack }
  $ws* \>\>\>		{ strtoken TokExamplePrompt `andBegin` exampleexpr }
  $ws* \n		{ token TokPara `andBegin` para }
  -- Here, we really want to be able to say
  -- $ws* (\n | <eof>) 	{ token TokPara `andBegin` para}
  -- because otherwise a trailing line of whitespace will result in 
  -- a spurious TokString at the end of a docstring.  We don't have <eof>,
  -- though (NOW I realise what it was for :-).  To get around this, we always
  -- append \n to the end of a docstring.
  () 			{ begin string }
}

<birdtrack> .*	\n?	{ strtokenNL TokBirdTrack `andBegin` line }

<property> ()           { token TokPara `andBegin` para }

<example> {
  $ws*	\n		{ token TokPara `andBegin` para }
  $ws* \>\>\>	        { strtoken TokExamplePrompt `andBegin` exampleexpr }
  ()			{ begin exampleresult }
}

<exampleexpr> .* \n	{ strtokenNL TokExampleExpression `andBegin` example }

<exampleresult> .* \n	{ strtokenNL TokExampleResult `andBegin` example }

<string,def> {
  $special			{ strtoken $ \s -> TokSpecial (head s) }
  \<\< [^\>]* \>\>              { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
  \< [^\>]* \>			{ strtoken $ \s -> TokURL (init (tail s)) }
  \# [^\#]* \#			{ strtoken $ \s -> TokAName (init (tail s)) }
  \/ [^\/]* \/                  { strtoken $ \s -> TokEmphasis (init (tail s)) }
  [\'\`] $ident+ [\'\`]		{ ident }
  \\ .				{ strtoken (TokString . tail) }
  "&#" $digit+ \;		{ strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
  "&#" [xX] $hexdigit+ \;	{ strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
  -- allow special characters through if they don't fit one of the previous
  -- patterns.
  [\/\'\`\<\#\&\\]			{ strtoken TokString }
  [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
  [^ $special \/ \< \# \n \'\` \& \\ \]]+    { strtoken TokString }
}

<def> {
  \]				{ token TokDefEnd `andBegin` string }
}

-- ']' doesn't have any special meaning outside of the [...] at the beginning
-- of a definition paragraph.
<string> {
  \]				{ strtoken TokString }
}

{
-- | A located token
type LToken = (Token, AlexPosn)

data Token
  = TokPara
  | TokNumber
  | TokBullet
  | TokDefStart
  | TokDefEnd
  | TokSpecial Char
  | TokIdent RdrName
  | TokString String
  | TokURL String
  | TokPic String
  | TokEmphasis String
  | TokAName String
  | TokBirdTrack String
  | TokProperty String
  | TokExamplePrompt String
  | TokExampleExpression String
  | TokExampleResult String
--  deriving Show

tokenPos :: LToken -> (Int, Int)
tokenPos t = let AlexPn _ line col = snd t in (line, col)

-- -----------------------------------------------------------------------------
-- Alex support stuff

-- XXX: copied the posn wrapper code from Alex to make this lexer work
-- with both Alex 2.x and Alex 3.x.  However, we are not using the
-- Unicode/UTF-8 support in Alex 3.x, and Unicode documentation will
-- probably get mangled.

-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
utf8Encode :: Char -> [Word8]
utf8Encode = map fromIntegral . go . ord
 where
  go oc
   | oc <= 0x7f       = [oc]

   | oc <= 0x7ff      = [ 0xc0 + (oc `Data.Bits.shiftR` 6)
                        , 0x80 + oc Data.Bits..&. 0x3f
                        ]

   | oc <= 0xffff     = [ 0xe0 + (oc `Data.Bits.shiftR` 12)
                        , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
                        , 0x80 + oc Data.Bits..&. 0x3f
                        ]
   | otherwise        = [ 0xf0 + (oc `Data.Bits.shiftR` 18)
                        , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f)
                        , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
                        , 0x80 + oc Data.Bits..&. 0x3f
                        ]

type Byte = Word8

type AlexInput = (AlexPosn,     -- current position,
                  Char,         -- previous char
                  [Byte],       -- pending bytes on current char
                  String)       -- current input string

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (p,c,bs,s) = c

alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)
alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s))
alexGetByte (p,c,[],[]) = Nothing
alexGetByte (p,_,[],(c:s))  = let p' = alexMove p c
                                  (b:bs) = utf8Encode c
                              in p' `seq`  Just (b, (p', c, bs, s))

data AlexPosn = AlexPn !Int !Int !Int
        deriving (Eq,Show)

alexMove :: AlexPosn -> Char -> AlexPosn
alexMove (AlexPn a l c) '\t' = AlexPn (a+1)  l     (((c+7) `div` 8)*8+1)
alexMove (AlexPn a l c) '\n' = AlexPn (a+1) (l+1)   1
alexMove (AlexPn a l c) _    = AlexPn (a+1)  l     (c+1)

type StartCode = Int
type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> DynFlags -> [LToken]

tokenise :: DynFlags -> String -> (Int, Int) -> [LToken]
tokenise dflags str (line, col) = go (posn,'\n',[],eofHack str) para
  where posn = AlexPn 0 line col
        go inp@(pos,_,_,str) sc =
          case alexScan inp sc of
                AlexEOF -> []
                AlexError _ -> []
                AlexSkip  inp' len     -> go inp' sc
                AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) dflags

-- NB. we add a final \n to the string, (see comment in the beginning of line
-- production above).
eofHack str = str++"\n"

andBegin  :: Action -> StartCode -> Action
andBegin act new_sc = \pos str _ cont dflags -> act pos str new_sc cont dflags

token :: Token -> Action
token t = \pos _ sc cont _ -> (t, pos) : cont sc

strtoken, strtokenNL :: (String -> Token) -> Action
strtoken t = \pos str sc cont _ -> (t str, pos) : cont sc
strtokenNL t = \pos str sc cont _ -> (t (filter (/= '\r') str), pos) : cont sc
-- ^ We only want LF line endings in our internal doc string format, so we
-- filter out all CRs.

begin :: StartCode -> Action
begin sc = \_ _ _ cont _ -> cont sc

-- -----------------------------------------------------------------------------
-- Lex a string as a Haskell identifier

ident :: Action
ident pos str sc cont dflags = 
  case parseIdent dflags loc id of
	Just names -> (TokIdent names, pos) : cont sc
	Nothing -> (TokString str, pos) : cont sc
 where id = init (tail str)
       -- TODO: Get the real filename here. Maybe we should just be
       --       using GHC SrcLoc's ourself?
       filename = mkFastString "<unknown file>"
       loc = case pos of
             AlexPn _ line col ->
                 mkRealSrcLoc filename line col

parseIdent :: DynFlags -> RealSrcLoc -> String -> Maybe RdrName
parseIdent dflags loc str0 = 
  let buffer = stringToStringBuffer str0
      pstate = mkPState dflags buffer loc
      result = unP parseIdentifier pstate 
  in case result of 
       POk _ name -> Just (unLoc name)
       _ -> Nothing
}