From 44b1d87503a62181b4079962632cd61f1e158d79 Mon Sep 17 00:00:00 2001 From: Alec Theriault Date: Mon, 5 Feb 2018 18:14:32 -0800 Subject: Support unicode operators, proper modules Unicode operators are a pretty big thing in Haskell, so supporting linking them seems like it outweighs the cost of the extra machinery to force Attoparsec to look for unicode. Fixes #458. --- .../src/Documentation/Haddock/Parser.hs | 34 ++++++++-------------- .../src/Documentation/Haddock/Parser/Monad.hs | 25 ++++++++++++++-- 2 files changed, 35 insertions(+), 24 deletions(-) (limited to 'haddock-library/src/Documentation') diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs index a1349c95..82515ab4 100644 --- a/haddock-library/src/Documentation/Haddock/Parser.hs +++ b/haddock-library/src/Documentation/Haddock/Parser.hs @@ -23,7 +23,7 @@ import Control.Applicative import Control.Arrow (first) import Control.Monad import qualified Data.ByteString.Char8 as BS -import Data.Char (chr, isAsciiUpper) +import Data.Char (chr, isUpper, isAlpha, isAlphaNum) import Data.List (stripPrefix, intercalate, unfoldr, elemIndex) import Data.Maybe (fromMaybe, mapMaybe) import Data.Monoid @@ -35,6 +35,7 @@ import Documentation.Haddock.Types import Documentation.Haddock.Utf8 import Prelude hiding (takeWhile) import qualified Prelude as P +import Text.Read.Lex (isSymbolChar) -- $setup -- >>> :set -XOverloadedStrings @@ -205,20 +206,19 @@ monospace :: Parser (DocH mod Identifier) monospace = DocMonospaced . parseStringBS <$> ("@" *> takeWhile1_ (/= '@') <* "@") --- | Module names: we try our reasonable best to only allow valid --- Haskell module names, with caveat about not matching on technically --- valid unicode symbols. +-- | Module names. +-- +-- Note that we allow '#' and '\' to support anchors (old style anchors are of +-- the form "SomeModule\#anchor"). moduleName :: Parser (DocH mod a) moduleName = DocModule <$> (char '"' *> modid <* char '"') where modid = intercalate "." <$> conid `sepBy1` "." conid = (:) - <$> satisfy isAsciiUpper - -- NOTE: According to Haskell 2010 we should actually only - -- accept {small | large | digit | ' } here. But as we can't - -- match on unicode characters, this is currently not possible. - -- Note that we allow ‘#’ to suport anchors. - <*> (decodeUtf8 <$> takeWhile (notInClass " .&[{}(=*)+]!|@/;,^?\"\n")) + <$> satisfyUnicode (\c -> isAlpha c && isUpper c) + <*> many (satisfyUnicode conChar <|> char '\\' <|> char '#') + + conChar c = isAlphaNum c || c == '_' -- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify -- a title for the picture. @@ -760,26 +760,16 @@ autoUrl = mkLink <$> url parseValid :: Parser String parseValid = p some where - idChar = - satisfy (\c -> isAlpha_ascii c - || isDigit c - -- N.B. '-' is placed first otherwise attoparsec thinks - -- it belongs to a character class - || inClass "-_.!#$%&*+/<=>?@\\|~:^" c) + idChar = satisfyUnicode (\c -> isAlphaNum c || isSymbolChar c || c == '_') p p' = do - vs' <- p' $ utf8String "⋆" <|> return <$> idChar - let vs = concat vs' + vs <- p' idChar c <- peekChar' case c of '`' -> return vs '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> p many') <|> return vs _ -> fail "outofvalid" --- | Parses UTF8 strings from ByteString streams. -utf8String :: String -> Parser String -utf8String x = decodeUtf8 <$> string (encodeUtf8 x) - -- | Parses identifiers with help of 'parseValid'. Asks GHC for -- 'String' from the string it deems valid. identifier :: Parser (DocH mod Identifier) diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index 3f7d60f8..3430ef8a 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, BangPatterns #-} module Documentation.Haddock.Parser.Monad ( module Documentation.Haddock.Parser.Monad , Attoparsec.isDigit @@ -31,9 +31,10 @@ module Documentation.Haddock.Parser.Monad ( import Control.Applicative import Control.Monad import Data.String -import Data.ByteString (ByteString) +import Data.ByteString (ByteString, length) import qualified Data.ByteString.Lazy as LB import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec +import qualified Data.Attoparsec.Combinator as Attoparsec import Control.Monad.Trans.State import qualified Control.Monad.Trans.Class as Trans import Data.Word @@ -41,6 +42,7 @@ import Data.Bits import Data.Tuple import Documentation.Haddock.Types (Version) +import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) newtype ParserState = ParserState { parserStateSince :: Maybe Version @@ -73,6 +75,25 @@ char = lift . Attoparsec.char char8 :: Char -> Parser Word8 char8 = lift . Attoparsec.char8 +-- | Peek a unicode character and return the number of bytes that it took up +peekUnicode :: Parser (Char, Int) +peekUnicode = lift $ Attoparsec.lookAhead $ do + + -- attoparsec's take fails on shorter inputs rather than truncate + bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1]) + + let !c = head . decodeUtf8 $ bs + !n = Data.ByteString.length . encodeUtf8 $ [c] + pure (c, fromIntegral n) + +-- | Like 'satisfy', but consuming a unicode character +satisfyUnicode :: (Char -> Bool) -> Parser Char +satisfyUnicode predicate = do + (c,n) <- peekUnicode + if predicate c + then Documentation.Haddock.Parser.Monad.take n *> pure c + else fail "satsifyUnicode" + anyChar :: Parser Char anyChar = lift Attoparsec.anyChar -- cgit v1.2.3