aboutsummaryrefslogtreecommitdiff
path: root/haddock-library/src
diff options
context:
space:
mode:
authorAlec Theriault <alec.theriault@gmail.com>2018-02-05 18:14:32 -0800
committeralexbiehl <alex.biehl@gmail.com>2018-03-02 15:48:36 +0100
commit44b1d87503a62181b4079962632cd61f1e158d79 (patch)
treefd7b0d45ae56891a9b876f05471a17c8655663d2 /haddock-library/src
parent69b98a99ce4de93ea0e6082bd11edb3baaf2fa6e (diff)
Support unicode operators, proper modules
Unicode operators are a pretty big thing in Haskell, so supporting linking them seems like it outweighs the cost of the extra machinery to force Attoparsec to look for unicode. Fixes #458.
Diffstat (limited to 'haddock-library/src')
-rw-r--r--haddock-library/src/Documentation/Haddock/Parser.hs34
-rw-r--r--haddock-library/src/Documentation/Haddock/Parser/Monad.hs25
2 files changed, 35 insertions, 24 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs
index a1349c95..82515ab4 100644
--- a/haddock-library/src/Documentation/Haddock/Parser.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser.hs
@@ -23,7 +23,7 @@ import Control.Applicative
import Control.Arrow (first)
import Control.Monad
import qualified Data.ByteString.Char8 as BS
-import Data.Char (chr, isAsciiUpper)
+import Data.Char (chr, isUpper, isAlpha, isAlphaNum)
import Data.List (stripPrefix, intercalate, unfoldr, elemIndex)
import Data.Maybe (fromMaybe, mapMaybe)
import Data.Monoid
@@ -35,6 +35,7 @@ import Documentation.Haddock.Types
import Documentation.Haddock.Utf8
import Prelude hiding (takeWhile)
import qualified Prelude as P
+import Text.Read.Lex (isSymbolChar)
-- $setup
-- >>> :set -XOverloadedStrings
@@ -205,20 +206,19 @@ monospace :: Parser (DocH mod Identifier)
monospace = DocMonospaced . parseStringBS
<$> ("@" *> takeWhile1_ (/= '@') <* "@")
--- | Module names: we try our reasonable best to only allow valid
--- Haskell module names, with caveat about not matching on technically
--- valid unicode symbols.
+-- | Module names.
+--
+-- Note that we allow '#' and '\' to support anchors (old style anchors are of
+-- the form "SomeModule\#anchor").
moduleName :: Parser (DocH mod a)
moduleName = DocModule <$> (char '"' *> modid <* char '"')
where
modid = intercalate "." <$> conid `sepBy1` "."
conid = (:)
- <$> satisfy isAsciiUpper
- -- NOTE: According to Haskell 2010 we should actually only
- -- accept {small | large | digit | ' } here. But as we can't
- -- match on unicode characters, this is currently not possible.
- -- Note that we allow ‘#’ to suport anchors.
- <*> (decodeUtf8 <$> takeWhile (notInClass " .&[{}(=*)+]!|@/;,^?\"\n"))
+ <$> satisfyUnicode (\c -> isAlpha c && isUpper c)
+ <*> many (satisfyUnicode conChar <|> char '\\' <|> char '#')
+
+ conChar c = isAlphaNum c || c == '_'
-- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify
-- a title for the picture.
@@ -760,26 +760,16 @@ autoUrl = mkLink <$> url
parseValid :: Parser String
parseValid = p some
where
- idChar =
- satisfy (\c -> isAlpha_ascii c
- || isDigit c
- -- N.B. '-' is placed first otherwise attoparsec thinks
- -- it belongs to a character class
- || inClass "-_.!#$%&*+/<=>?@\\|~:^" c)
+ idChar = satisfyUnicode (\c -> isAlphaNum c || isSymbolChar c || c == '_')
p p' = do
- vs' <- p' $ utf8String "⋆" <|> return <$> idChar
- let vs = concat vs'
+ vs <- p' idChar
c <- peekChar'
case c of
'`' -> return vs
'\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> p many') <|> return vs
_ -> fail "outofvalid"
--- | Parses UTF8 strings from ByteString streams.
-utf8String :: String -> Parser String
-utf8String x = decodeUtf8 <$> string (encodeUtf8 x)
-
-- | Parses identifiers with help of 'parseValid'. Asks GHC for
-- 'String' from the string it deems valid.
identifier :: Parser (DocH mod Identifier)
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
index 3f7d60f8..3430ef8a 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, BangPatterns #-}
module Documentation.Haddock.Parser.Monad (
module Documentation.Haddock.Parser.Monad
, Attoparsec.isDigit
@@ -31,9 +31,10 @@ module Documentation.Haddock.Parser.Monad (
import Control.Applicative
import Control.Monad
import Data.String
-import Data.ByteString (ByteString)
+import Data.ByteString (ByteString, length)
import qualified Data.ByteString.Lazy as LB
import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec
+import qualified Data.Attoparsec.Combinator as Attoparsec
import Control.Monad.Trans.State
import qualified Control.Monad.Trans.Class as Trans
import Data.Word
@@ -41,6 +42,7 @@ import Data.Bits
import Data.Tuple
import Documentation.Haddock.Types (Version)
+import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8)
newtype ParserState = ParserState {
parserStateSince :: Maybe Version
@@ -73,6 +75,25 @@ char = lift . Attoparsec.char
char8 :: Char -> Parser Word8
char8 = lift . Attoparsec.char8
+-- | Peek a unicode character and return the number of bytes that it took up
+peekUnicode :: Parser (Char, Int)
+peekUnicode = lift $ Attoparsec.lookAhead $ do
+
+ -- attoparsec's take fails on shorter inputs rather than truncate
+ bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1])
+
+ let !c = head . decodeUtf8 $ bs
+ !n = Data.ByteString.length . encodeUtf8 $ [c]
+ pure (c, fromIntegral n)
+
+-- | Like 'satisfy', but consuming a unicode character
+satisfyUnicode :: (Char -> Bool) -> Parser Char
+satisfyUnicode predicate = do
+ (c,n) <- peekUnicode
+ if predicate c
+ then Documentation.Haddock.Parser.Monad.take n *> pure c
+ else fail "satsifyUnicode"
+
anyChar :: Parser Char
anyChar = lift Attoparsec.anyChar