aboutsummaryrefslogtreecommitdiff
path: root/src/Haddock/Utf8.hs
diff options
context:
space:
mode:
authorMateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk>2013-07-09 14:24:10 +0100
committerAustin Seipp <austin@well-typed.com>2014-01-12 14:48:35 -0600
commitbb6cef20b82ef7a7f2d49f3ef6dc1a7ce880b5f0 (patch)
treeea07b1d4ab43169bc8d7074ff05bf1792c93feb0 /src/Haddock/Utf8.hs
parentc1228df0339d041b455bb993786a9ed6322c5e01 (diff)
One pass parser and tests.
We remove the HTML test as it is no longer necessary. We cover the test case in spec tests and other HTML tests but keeping this around fails: this is because the new parser has different semantics there. In fact, I suspect the original behaviour was a bug that wasn't caught/fixed but simply included as-is during the testing.
Diffstat (limited to 'src/Haddock/Utf8.hs')
-rw-r--r--src/Haddock/Utf8.hs74
1 files changed, 74 insertions, 0 deletions
diff --git a/src/Haddock/Utf8.hs b/src/Haddock/Utf8.hs
new file mode 100644
index 00000000..1fb0e818
--- /dev/null
+++ b/src/Haddock/Utf8.hs
@@ -0,0 +1,74 @@
+module Haddock.Utf8 (encodeUtf8, decodeUtf8) where
+import Data.Bits ((.|.), (.&.), shiftL, shiftR)
+import qualified Data.ByteString as BS
+import Data.Char (chr, ord)
+import Data.Word (Word8)
+
+-- | Helper that encodes and packs a 'String' into a 'BS.ByteString'
+encodeUtf8 :: String -> BS.ByteString
+encodeUtf8 = BS.pack . encode
+
+-- | Helper that unpacks and decodes a 'BS.ByteString' into a 'String'
+decodeUtf8 :: BS.ByteString -> String
+decodeUtf8 = decode . BS.unpack
+
+-- Copy/pasted functions from Codec.Binary.UTF8.String for encoding/decoding
+-- | Character to use when 'encode' or 'decode' fail for a byte.
+replacementCharacter :: Char
+replacementCharacter = '\xfffd'
+
+-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
+encode :: String -> [Word8]
+encode = concatMap (map fromIntegral . go . ord)
+ where
+ go oc
+ | oc <= 0x7f = [oc]
+
+ | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6)
+ , 0x80 + oc .&. 0x3f
+ ]
+
+ | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12)
+ , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+ , 0x80 + oc .&. 0x3f
+ ]
+ | otherwise = [ 0xf0 + (oc `shiftR` 18)
+ , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
+ , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+ , 0x80 + oc .&. 0x3f
+ ]
+
+-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
+decode :: [Word8] -> String
+decode [ ] = ""
+decode (c:cs)
+ | c < 0x80 = chr (fromEnum c) : decode cs
+ | c < 0xc0 = replacementCharacter : decode cs
+ | c < 0xe0 = multi1
+ | c < 0xf0 = multi_byte 2 0xf 0x800
+ | c < 0xf8 = multi_byte 3 0x7 0x10000
+ | c < 0xfc = multi_byte 4 0x3 0x200000
+ | c < 0xfe = multi_byte 5 0x1 0x4000000
+ | otherwise = replacementCharacter : decode cs
+ where
+ multi1 = case cs of
+ c1 : ds | c1 .&. 0xc0 == 0x80 ->
+ let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f)
+ in if d >= 0x000080 then toEnum d : decode ds
+ else replacementCharacter : decode ds
+ _ -> replacementCharacter : decode cs
+
+ multi_byte :: Int -> Word8 -> Int -> String
+ multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
+ where
+ aux 0 rs acc
+ | overlong <= acc && acc <= 0x10ffff &&
+ (acc < 0xd800 || 0xdfff < acc) &&
+ (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs
+ | otherwise = replacementCharacter : decode rs
+
+ aux n (r:rs) acc
+ | r .&. 0xc0 == 0x80 = aux (n-1) rs
+ $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
+
+ aux _ rs _ = replacementCharacter : decode rs