diff options
author | Alec Theriault <alec.theriault@gmail.com> | 2018-02-05 18:14:32 -0800 |
---|---|---|
committer | alexbiehl <alex.biehl@gmail.com> | 2018-03-02 15:48:36 +0100 |
commit | 44b1d87503a62181b4079962632cd61f1e158d79 (patch) | |
tree | fd7b0d45ae56891a9b876f05471a17c8655663d2 /haddock-library/src/Documentation/Haddock/Parser | |
parent | 69b98a99ce4de93ea0e6082bd11edb3baaf2fa6e (diff) |
Support unicode operators, proper modules
Unicode operators are a pretty big thing in Haskell, so supporting linking them
seems like it outweighs the cost of the extra machinery to force Attoparsec to
look for unicode.
Fixes #458.
Diffstat (limited to 'haddock-library/src/Documentation/Haddock/Parser')
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Parser/Monad.hs | 25 |
1 files changed, 23 insertions, 2 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index 3f7d60f8..3430ef8a 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, BangPatterns #-} module Documentation.Haddock.Parser.Monad ( module Documentation.Haddock.Parser.Monad , Attoparsec.isDigit @@ -31,9 +31,10 @@ module Documentation.Haddock.Parser.Monad ( import Control.Applicative import Control.Monad import Data.String -import Data.ByteString (ByteString) +import Data.ByteString (ByteString, length) import qualified Data.ByteString.Lazy as LB import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec +import qualified Data.Attoparsec.Combinator as Attoparsec import Control.Monad.Trans.State import qualified Control.Monad.Trans.Class as Trans import Data.Word @@ -41,6 +42,7 @@ import Data.Bits import Data.Tuple import Documentation.Haddock.Types (Version) +import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) newtype ParserState = ParserState { parserStateSince :: Maybe Version @@ -73,6 +75,25 @@ char = lift . Attoparsec.char char8 :: Char -> Parser Word8 char8 = lift . Attoparsec.char8 +-- | Peek a unicode character and return the number of bytes that it took up +peekUnicode :: Parser (Char, Int) +peekUnicode = lift $ Attoparsec.lookAhead $ do + + -- attoparsec's take fails on shorter inputs rather than truncate + bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1]) + + let !c = head . decodeUtf8 $ bs + !n = Data.ByteString.length . encodeUtf8 $ [c] + pure (c, fromIntegral n) + +-- | Like 'satisfy', but consuming a unicode character +satisfyUnicode :: (Char -> Bool) -> Parser Char +satisfyUnicode predicate = do + (c,n) <- peekUnicode + if predicate c + then Documentation.Haddock.Parser.Monad.take n *> pure c + else fail "satsifyUnicode" + anyChar :: Parser Char anyChar = lift Attoparsec.anyChar |