From 44b1d87503a62181b4079962632cd61f1e158d79 Mon Sep 17 00:00:00 2001 From: Alec Theriault Date: Mon, 5 Feb 2018 18:14:32 -0800 Subject: Support unicode operators, proper modules Unicode operators are a pretty big thing in Haskell, so supporting linking them seems like it outweighs the cost of the extra machinery to force Attoparsec to look for unicode. Fixes #458. --- .../src/Documentation/Haddock/Parser/Monad.hs | 25 ++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'haddock-library/src/Documentation/Haddock/Parser') diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index 3f7d60f8..3430ef8a 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, BangPatterns #-} module Documentation.Haddock.Parser.Monad ( module Documentation.Haddock.Parser.Monad , Attoparsec.isDigit @@ -31,9 +31,10 @@ module Documentation.Haddock.Parser.Monad ( import Control.Applicative import Control.Monad import Data.String -import Data.ByteString (ByteString) +import Data.ByteString (ByteString, length) import qualified Data.ByteString.Lazy as LB import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec +import qualified Data.Attoparsec.Combinator as Attoparsec import Control.Monad.Trans.State import qualified Control.Monad.Trans.Class as Trans import Data.Word @@ -41,6 +42,7 @@ import Data.Bits import Data.Tuple import Documentation.Haddock.Types (Version) +import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) newtype ParserState = ParserState { parserStateSince :: Maybe Version @@ -73,6 +75,25 @@ char = lift . Attoparsec.char char8 :: Char -> Parser Word8 char8 = lift . Attoparsec.char8 +-- | Peek a unicode character and return the number of bytes that it took up +peekUnicode :: Parser (Char, Int) +peekUnicode = lift $ Attoparsec.lookAhead $ do + + -- attoparsec's take fails on shorter inputs rather than truncate + bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1]) + + let !c = head . decodeUtf8 $ bs + !n = Data.ByteString.length . encodeUtf8 $ [c] + pure (c, fromIntegral n) + +-- | Like 'satisfy', but consuming a unicode character +satisfyUnicode :: (Char -> Bool) -> Parser Char +satisfyUnicode predicate = do + (c,n) <- peekUnicode + if predicate c + then Documentation.Haddock.Parser.Monad.take n *> pure c + else fail "satsifyUnicode" + anyChar :: Parser Char anyChar = lift Attoparsec.anyChar -- cgit v1.2.3