diff options
author | Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk> | 2013-07-09 14:24:10 +0100 |
---|---|---|
committer | Austin Seipp <austin@well-typed.com> | 2014-01-12 14:48:35 -0600 |
commit | bb6cef20b82ef7a7f2d49f3ef6dc1a7ce880b5f0 (patch) | |
tree | ea07b1d4ab43169bc8d7074ff05bf1792c93feb0 /src/Haddock/Utf8.hs | |
parent | c1228df0339d041b455bb993786a9ed6322c5e01 (diff) |
One pass parser and tests.
We remove the HTML test as it is no longer necessary. We cover the
test case in spec tests and other HTML tests but keeping this around
fails: this is because the new parser has different semantics there.
In fact, I suspect the original behaviour was a bug that wasn't
caught/fixed but simply included as-is during the testing.
Diffstat (limited to 'src/Haddock/Utf8.hs')
-rw-r--r-- | src/Haddock/Utf8.hs | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/src/Haddock/Utf8.hs b/src/Haddock/Utf8.hs new file mode 100644 index 00000000..1fb0e818 --- /dev/null +++ b/src/Haddock/Utf8.hs @@ -0,0 +1,74 @@ +module Haddock.Utf8 (encodeUtf8, decodeUtf8) where +import Data.Bits ((.|.), (.&.), shiftL, shiftR) +import qualified Data.ByteString as BS +import Data.Char (chr, ord) +import Data.Word (Word8) + +-- | Helper that encodes and packs a 'String' into a 'BS.ByteString' +encodeUtf8 :: String -> BS.ByteString +encodeUtf8 = BS.pack . encode + +-- | Helper that unpacks and decodes a 'BS.ByteString' into a 'String' +decodeUtf8 :: BS.ByteString -> String +decodeUtf8 = decode . BS.unpack + +-- Copy/pasted functions from Codec.Binary.UTF8.String for encoding/decoding +-- | Character to use when 'encode' or 'decode' fail for a byte. +replacementCharacter :: Char +replacementCharacter = '\xfffd' + +-- | Encode a Haskell String to a list of Word8 values, in UTF8 format. +encode :: String -> [Word8] +encode = concatMap (map fromIntegral . go . ord) + where + go oc + | oc <= 0x7f = [oc] + + | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) + , 0x80 + oc .&. 0x3f + ] + + | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) + , 0x80 + ((oc `shiftR` 6) .&. 0x3f) + , 0x80 + oc .&. 0x3f + ] + | otherwise = [ 0xf0 + (oc `shiftR` 18) + , 0x80 + ((oc `shiftR` 12) .&. 0x3f) + , 0x80 + ((oc `shiftR` 6) .&. 0x3f) + , 0x80 + oc .&. 0x3f + ] + +-- | Decode a UTF8 string packed into a list of Word8 values, directly to String +decode :: [Word8] -> String +decode [ ] = "" +decode (c:cs) + | c < 0x80 = chr (fromEnum c) : decode cs + | c < 0xc0 = replacementCharacter : decode cs + | c < 0xe0 = multi1 + | c < 0xf0 = multi_byte 2 0xf 0x800 + | c < 0xf8 = multi_byte 3 0x7 0x10000 + | c < 0xfc = multi_byte 4 0x3 0x200000 + | c < 0xfe = multi_byte 5 0x1 0x4000000 + | otherwise = replacementCharacter : decode cs + where + multi1 = case cs of + c1 : ds | c1 .&. 0xc0 == 0x80 -> + let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) + in if d >= 0x000080 then toEnum d : decode ds + else replacementCharacter : decode ds + _ -> replacementCharacter : decode cs + + multi_byte :: Int -> Word8 -> Int -> String + multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) + where + aux 0 rs acc + | overlong <= acc && acc <= 0x10ffff && + (acc < 0xd800 || 0xdfff < acc) && + (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs + | otherwise = replacementCharacter : decode rs + + aux n (r:rs) acc + | r .&. 0xc0 == 0x80 = aux (n-1) rs + $ shiftL acc 6 .|. fromEnum (r .&. 0x3f) + + aux _ rs _ = replacementCharacter : decode rs |