One pass parser and tests.

We remove the HTML test as it is no longer necessary. The case it exercised is covered by the spec tests and other HTML tests, and keeping it around fails because the new parser has different semantics there. In fact, I suspect the original behaviour was a bug that wasn't caught/fixed but was simply included as-is during testing.
author: Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk> 2013-07-09 14:24:10 +0100
committer: Austin Seipp <austin@well-typed.com> 2014-01-12 14:48:35 -0600
commit: bb6cef20b82ef7a7f2d49f3ef6dc1a7ce880b5f0 (patch)
tree: ea07b1d4ab43169bc8d7074ff05bf1792c93feb0 /src/Haddock/Utf8.hs
parent: c1228df0339d041b455bb993786a9ed6322c5e01 (diff)
1 files changed, 74 insertions, 0 deletions
diff --git a/src/Haddock/Utf8.hs b/src/Haddock/Utf8.hs
new file mode 100644
index 00000000..1fb0e818
--- /dev/null
+++ b/src/Haddock/Utf8.hs
@@ -0,0 +1,74 @@
+module Haddock.Utf8 (encodeUtf8, decodeUtf8) where
+import           Data.Bits ((.|.), (.&.), shiftL, shiftR)
+import qualified Data.ByteString as BS
+import           Data.Char (chr, ord)
+import           Data.Word (Word8)
+
+-- | Convert a 'String' to its UTF-8 bytes, packed into a 'BS.ByteString'.
+encodeUtf8 :: String -> BS.ByteString
+encodeUtf8 str = BS.pack (encode str)
+
+-- | Unpack a 'BS.ByteString' and decode its bytes as UTF-8 into a 'String'.
+decodeUtf8 :: BS.ByteString -> String
+decodeUtf8 bytes = decode (BS.unpack bytes)
+
+-- Copy/pasted functions from Codec.Binary.UTF8.String for encoding/decoding
+-- | U+FFFD, the character 'decode' substitutes for bytes it cannot decode.
+replacementCharacter :: Char
+replacementCharacter = chr 0xfffd
+
+-- | Encode a Haskell 'String' as a list of UTF-8 bytes.
+--
+-- Every 'Char' is turned into one to four bytes, depending on how large its
+-- code point is.  No validation is done: whatever 'ord' returns is encoded.
+encode :: String -> [Word8]
+encode = concatMap (map fromIntegral . bytesFor . ord)
+ where
+  -- A continuation byte: the @10@ marker followed by the six bits of @cp@
+  -- that start at bit position @n@.
+  cont cp n = 0x80 + ((cp `shiftR` n) .&. 0x3f)
+
+  -- Lead byte (length marker plus the top bits) and then the continuations.
+  bytesFor cp
+   | cp <= 0x7f   = [cp]
+   | cp <= 0x7ff  = [0xc0 + (cp `shiftR` 6), cont cp 0]
+   | cp <= 0xffff = [0xe0 + (cp `shiftR` 12), cont cp 6, cont cp 0]
+   | otherwise    = [ 0xf0 + (cp `shiftR` 18)
+                    , cont cp 12
+                    , cont cp 6
+                    , cont cp 0 ]
+
+-- | Decode a list of UTF-8 bytes into a 'String'.
+--
+-- Input that cannot be decoded is replaced by 'replacementCharacter', and
+-- decoding then carries on with the bytes that follow.
+decode :: [Word8] -> String
+decode [] = []
+decode (lead : rest)
+  | lead < 0x80 = chr (fromEnum lead) : decode rest   -- plain ASCII
+  | lead < 0xc0 = invalid rest                        -- stray continuation
+  | lead < 0xe0 = continued 1 0x1f 0x80
+  | lead < 0xf0 = continued 2 0x0f 0x800
+  | lead < 0xf8 = continued 3 0x07 0x10000
+  | lead < 0xfc = continued 4 0x03 0x200000
+  | lead < 0xfe = continued 5 0x01 0x4000000
+  | otherwise   = invalid rest
+  where
+    invalid remaining = replacementCharacter : decode remaining
+
+    -- @continued n mask smallest@: read @n@ continuation bytes after @lead@,
+    -- whose payload bits are picked out by @mask@; a result below @smallest@
+    -- is an overlong encoding and is rejected.
+    continued :: Int -> Word8 -> Int -> String
+    continued n mask smallest = go n rest (fromEnum (lead .&. mask))
+      where
+        go 0 remaining cp
+          | accepted cp = chr cp : decode remaining
+          | otherwise   = invalid remaining
+        go k (b : bs) cp
+          | b .&. 0xc0 == 0x80 = go (k - 1) bs (shiftL cp 6 .|. fromEnum (b .&. 0x3f))
+        -- Input ran out, or the next byte is not a continuation: it is kept.
+        go _ remaining _ = invalid remaining
+
+        accepted cp = smallest <= cp && cp <= 0x10ffff
+          && not (0xd800 <= cp && cp <= 0xdfff) && not (0xfffe <= cp && cp <= 0xffff)
author	Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk>	2013-07-09 14:24:10 +0100
committer	Austin Seipp <austin@well-typed.com>	2014-01-12 14:48:35 -0600
commit	bb6cef20b82ef7a7f2d49f3ef6dc1a7ce880b5f0 (patch)
tree	ea07b1d4ab43169bc8d7074ff05bf1792c93feb0 /src/Haddock/Utf8.hs
parent	c1228df0339d041b455bb993786a9ed6322c5e01 (diff)