From cc269e6b0b615e9e237c35a985e4ace7b9ab0dd9 Mon Sep 17 00:00:00 2001
From: Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk>
Date: Mon, 5 May 2014 09:01:03 +0200
Subject: Move parser + parser tests out to own package.

We move some types out that are necessary as well and then
re-export and specialise them in the core Haddock.

Reason for moving out spec tests is that if we're working on the parser,
we can simply work on that and we can ignore the rest of Haddock. The
downside is that it's a little inconvenient if at the end of the day we
want to see that everything passes.
---
 haddock-library/src/Documentation/Haddock/Utf8.hs | 74 +++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 haddock-library/src/Documentation/Haddock/Utf8.hs

(limited to 'haddock-library/src/Documentation/Haddock/Utf8.hs')

diff --git a/haddock-library/src/Documentation/Haddock/Utf8.hs b/haddock-library/src/Documentation/Haddock/Utf8.hs
new file mode 100644
index 00000000..3f75e53b
--- /dev/null
+++ b/haddock-library/src/Documentation/Haddock/Utf8.hs
@@ -0,0 +1,74 @@
+module Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) where
+import           Data.Bits ((.|.), (.&.), shiftL, shiftR)
+import qualified Data.ByteString as BS
+import           Data.Char (chr, ord)
+import           Data.Word (Word8)
+
+-- | Helper that encodes and packs a 'String' into a 'BS.ByteString'
+encodeUtf8 :: String -> BS.ByteString
+encodeUtf8 = BS.pack . encode
+
+-- | Helper that unpacks and decodes a 'BS.ByteString' into a 'String'
+decodeUtf8 :: BS.ByteString -> String
+decodeUtf8 = decode . BS.unpack
+
+-- Copy/pasted functions from Codec.Binary.UTF8.String for encoding/decoding
+-- | Character to use when 'encode' or 'decode' fail for a byte.
+replacementCharacter :: Char
+replacementCharacter = '\xfffd'
+
+-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
+encode :: String -> [Word8]
+encode = concatMap (map fromIntegral . go . ord)
+ where
+  go oc
+   | oc <= 0x7f       = [oc]
+
+   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+
+   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
+                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+
+-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
+decode :: [Word8] -> String
+decode [    ] = ""
+decode (c:cs)
+  | c < 0x80  = chr (fromEnum c) : decode cs
+  | c < 0xc0  = replacementCharacter : decode cs
+  | c < 0xe0  = multi1
+  | c < 0xf0  = multi_byte 2 0xf  0x800
+  | c < 0xf8  = multi_byte 3 0x7  0x10000
+  | c < 0xfc  = multi_byte 4 0x3  0x200000
+  | c < 0xfe  = multi_byte 5 0x1  0x4000000
+  | otherwise = replacementCharacter : decode cs
+  where
+    multi1 = case cs of
+      c1 : ds | c1 .&. 0xc0 == 0x80 ->
+        let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
+        in if d >= 0x000080 then toEnum d : decode ds
+                            else replacementCharacter : decode ds
+      _ -> replacementCharacter : decode cs
+
+    multi_byte :: Int -> Word8 -> Int -> String
+    multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
+      where
+        aux 0 rs acc
+          | overlong <= acc && acc <= 0x10ffff &&
+            (acc < 0xd800 || 0xdfff < acc)     &&
+            (acc < 0xfffe || 0xffff < acc)      = chr acc : decode rs
+          | otherwise = replacementCharacter : decode rs
+
+        aux n (r:rs) acc
+          | r .&. 0xc0 == 0x80 = aux (n-1) rs
+                               $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
+
+        aux _ rs     _ = replacementCharacter : decode rs
-- 
cgit v1.2.3