aboutsummaryrefslogtreecommitdiff
path: root/haddock-library/src/Documentation/Haddock/Parser
diff options
context:
space:
mode:
authorAlec Theriault <alec.theriault@gmail.com>2018-02-05 18:14:32 -0800
committeralexbiehl <alex.biehl@gmail.com>2018-03-02 15:48:36 +0100
commit44b1d87503a62181b4079962632cd61f1e158d79 (patch)
treefd7b0d45ae56891a9b876f05471a17c8655663d2 /haddock-library/src/Documentation/Haddock/Parser
parent69b98a99ce4de93ea0e6082bd11edb3baaf2fa6e (diff)
Support unicode operators, proper modules
Unicode operators are a pretty big thing in Haskell, so supporting linking them seems like it outweighs the cost of the extra machinery to force Attoparsec to look for unicode. Fixes #458.
Diffstat (limited to 'haddock-library/src/Documentation/Haddock/Parser')
-rw-r--r--haddock-library/src/Documentation/Haddock/Parser/Monad.hs25
1 files changed, 23 insertions, 2 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
index 3f7d60f8..3430ef8a 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, BangPatterns #-}
module Documentation.Haddock.Parser.Monad (
module Documentation.Haddock.Parser.Monad
, Attoparsec.isDigit
@@ -31,9 +31,10 @@ module Documentation.Haddock.Parser.Monad (
import Control.Applicative
import Control.Monad
import Data.String
-import Data.ByteString (ByteString)
+import Data.ByteString (ByteString, length)
import qualified Data.ByteString.Lazy as LB
import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec
+import qualified Data.Attoparsec.Combinator as Attoparsec
import Control.Monad.Trans.State
import qualified Control.Monad.Trans.Class as Trans
import Data.Word
@@ -41,6 +42,7 @@ import Data.Bits
import Data.Tuple
import Documentation.Haddock.Types (Version)
+import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8)
newtype ParserState = ParserState {
parserStateSince :: Maybe Version
@@ -73,6 +75,25 @@ char = lift . Attoparsec.char
char8 :: Char -> Parser Word8
char8 = lift . Attoparsec.char8
+-- | Peek a unicode character and return the number of bytes that it took up
+peekUnicode :: Parser (Char, Int)
+peekUnicode = lift $ Attoparsec.lookAhead $ do
+
+ -- attoparsec's take fails on shorter inputs rather than truncate
+ bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1])
+
+ let !c = head . decodeUtf8 $ bs
+ !n = Data.ByteString.length . encodeUtf8 $ [c]
+ pure (c, fromIntegral n)
+
+-- | Like 'satisfy', but consuming a unicode character
+satisfyUnicode :: (Char -> Bool) -> Parser Char
+satisfyUnicode predicate = do
+ (c,n) <- peekUnicode
+ if predicate c
+ then Documentation.Haddock.Parser.Monad.take n *> pure c
+ else fail "satsifyUnicode"
+
anyChar :: Parser Char
anyChar = lift Attoparsec.anyChar