aboutsummaryrefslogtreecommitdiff
path: root/src/Haddock/Lex.x
diff options
context:
space:
mode:
authorMax Bolingbroke <batterseapower@hotmail.com>2013-02-03 18:56:18 +0000
committerDavid Waern <waern@ubuntu.(none)>2013-03-10 09:38:28 -0700
commit478b606bb0d61c4fea2d72171e2e5bb881a366d2 (patch)
tree18ae4050764787dfa16ba2de116349592c52d8cc /src/Haddock/Lex.x
parent1956cf715f00382c822141561951590cb0ed8c3e (diff)
Use Alex 3's Unicode support to properly lex source files as UTF-8
Signed-off-by: David Waern <david.waern@gmail.com>
Diffstat (limited to 'src/Haddock/Lex.x')
-rw-r--r--src/Haddock/Lex.x65
1 files changed, 42 insertions, 23 deletions
diff --git a/src/Haddock/Lex.x b/src/Haddock/Lex.x
index 0d8dd954..9e59fa4c 100644
--- a/src/Haddock/Lex.x
+++ b/src/Haddock/Lex.x
@@ -30,6 +30,7 @@ import SrcLoc
import DynFlags
import FastString
+import qualified Data.Bits
import Data.Char
import Data.Word (Word8)
import Numeric
@@ -149,46 +150,64 @@ tokenPos t = let AlexPn _ line col = snd t in (line, col)
-- Unicode/UTF-8 support in Alex 3.x, and Unicode documentation will
-- probably get mangled.
+-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
+utf8Encode :: Char -> [Word8]
+utf8Encode = map fromIntegral . go . ord
+ where
+ go oc
+ | oc <= 0x7f = [oc]
+
+ | oc <= 0x7ff = [ 0xc0 + (oc `Data.Bits.shiftR` 6)
+ , 0x80 + oc Data.Bits..&. 0x3f
+ ]
+
+ | oc <= 0xffff = [ 0xe0 + (oc `Data.Bits.shiftR` 12)
+ , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
+ , 0x80 + oc Data.Bits..&. 0x3f
+ ]
+ | otherwise = [ 0xf0 + (oc `Data.Bits.shiftR` 18)
+ , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f)
+ , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
+ , 0x80 + oc Data.Bits..&. 0x3f
+ ]
+
+type Byte = Word8
+
type AlexInput = (AlexPosn, -- current position,
Char, -- previous char
+ [Byte], -- pending bytes on current char
String) -- current input string
alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (p,c,s) = c
+alexInputPrevChar (p,c,bs,s) = c
-alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
-alexGetByte (p,c,[]) = Nothing
-alexGetByte (p,_,(c:s)) = let p' = alexMove p c
- in p' `seq` Just (fromIntegral (ord c), (p', c, s))
+alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)
+alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s))
+alexGetByte (p,c,[],[]) = Nothing
+alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c
+ (b:bs) = utf8Encode c
+ in p' `seq` Just (b, (p', c, bs, s))
--- for compat with Alex 2.x:
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar i = case alexGetByte i of
- Nothing -> Nothing
- Just (b,i') -> Just (chr (fromIntegral b), i')
+data AlexPosn = AlexPn !Int !Int !Int
+ deriving (Eq,Show)
alexMove :: AlexPosn -> Char -> AlexPosn
alexMove (AlexPn a l c) '\t' = AlexPn (a+1) l (((c+7) `div` 8)*8+1)
alexMove (AlexPn a l c) '\n' = AlexPn (a+1) (l+1) 1
alexMove (AlexPn a l c) _ = AlexPn (a+1) l (c+1)
-data AlexPosn = AlexPn !Int !Int !Int
- deriving (Eq,Show)
-
type StartCode = Int
type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> DynFlags -> [LToken]
tokenise :: DynFlags -> String -> (Int, Int) -> [LToken]
-tokenise dflags str (line, col) = let toks = go (posn, '\n', eofHack str) para in {-trace (show toks)-} toks
- where
- posn = AlexPn 0 line col
-
- go inp@(pos, _, str) sc =
- case alexScan inp sc of
- AlexEOF -> []
- AlexError _ -> []
- AlexSkip inp' _ -> go inp' sc
- AlexToken inp'@(pos',_,_) len act -> act pos (take len str) sc (\sc -> go inp' sc) dflags
+tokenise dflags str (line, col) = go (posn,'\n',[],eofHack str) para
+ where posn = AlexPn 0 line col
+ go inp@(pos,_,_,str) sc =
+ case alexScan inp sc of
+ AlexEOF -> []
+ AlexError _ -> []
+ AlexSkip inp' len -> go inp' sc
+ AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) dflags
-- NB. we add a final \n to the string, (see comment in the beginning of line
-- production above).