aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlec Theriault <alec.theriault@gmail.com>2018-02-05 18:14:32 -0800
committeralexbiehl <alex.biehl@gmail.com>2018-03-02 15:48:36 +0100
commit44b1d87503a62181b4079962632cd61f1e158d79 (patch)
treefd7b0d45ae56891a9b876f05471a17c8655663d2
parent69b98a99ce4de93ea0e6082bd11edb3baaf2fa6e (diff)
Support unicode operators, proper modules
Unicode operators are a pretty big thing in Haskell, so supporting linking them seems like it outweighs the cost of the extra machinery to force Attoparsec to look for unicode. Fixes #458.
-rw-r--r--haddock-library/haddock-library.cabal2
-rw-r--r--haddock-library/src/Documentation/Haddock/Parser.hs34
-rw-r--r--haddock-library/src/Documentation/Haddock/Parser/Monad.hs25
-rw-r--r--html-test/ref/Bug458.html80
-rw-r--r--html-test/src/Bug458.hs6
5 files changed, 122 insertions, 25 deletions
diff --git a/haddock-library/haddock-library.cabal b/haddock-library/haddock-library.cabal
index 3d069f07..d7935747 100644
--- a/haddock-library/haddock-library.cabal
+++ b/haddock-library/haddock-library.cabal
@@ -64,13 +64,13 @@ library attoparsec
exposed-modules:
Data.Attoparsec.ByteString
Data.Attoparsec.ByteString.Char8
+ Data.Attoparsec.Combinator
other-modules:
Data.Attoparsec
Data.Attoparsec.ByteString.Buffer
Data.Attoparsec.ByteString.FastSet
Data.Attoparsec.ByteString.Internal
- Data.Attoparsec.Combinator
Data.Attoparsec.Internal
Data.Attoparsec.Internal.Fhthagn
Data.Attoparsec.Internal.Types
diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs
index a1349c95..82515ab4 100644
--- a/haddock-library/src/Documentation/Haddock/Parser.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser.hs
@@ -23,7 +23,7 @@ import Control.Applicative
import Control.Arrow (first)
import Control.Monad
import qualified Data.ByteString.Char8 as BS
-import Data.Char (chr, isAsciiUpper)
+import Data.Char (chr, isUpper, isAlpha, isAlphaNum)
import Data.List (stripPrefix, intercalate, unfoldr, elemIndex)
import Data.Maybe (fromMaybe, mapMaybe)
import Data.Monoid
@@ -35,6 +35,7 @@ import Documentation.Haddock.Types
import Documentation.Haddock.Utf8
import Prelude hiding (takeWhile)
import qualified Prelude as P
+import Text.Read.Lex (isSymbolChar)
-- $setup
-- >>> :set -XOverloadedStrings
@@ -205,20 +206,19 @@ monospace :: Parser (DocH mod Identifier)
monospace = DocMonospaced . parseStringBS
<$> ("@" *> takeWhile1_ (/= '@') <* "@")
--- | Module names: we try our reasonable best to only allow valid
--- Haskell module names, with caveat about not matching on technically
--- valid unicode symbols.
+-- | Module names.
+--
+-- Note that we allow '#' and '\' to support anchors (old style anchors are of
+-- the form "SomeModule\#anchor").
moduleName :: Parser (DocH mod a)
moduleName = DocModule <$> (char '"' *> modid <* char '"')
where
modid = intercalate "." <$> conid `sepBy1` "."
conid = (:)
- <$> satisfy isAsciiUpper
- -- NOTE: According to Haskell 2010 we should actually only
- -- accept {small | large | digit | ' } here. But as we can't
- -- match on unicode characters, this is currently not possible.
- -- Note that we allow ‘#’ to suport anchors.
- <*> (decodeUtf8 <$> takeWhile (notInClass " .&[{}(=*)+]!|@/;,^?\"\n"))
+ <$> satisfyUnicode (\c -> isAlpha c && isUpper c)
+ <*> many (satisfyUnicode conChar <|> char '\\' <|> char '#')
+
+ conChar c = isAlphaNum c || c == '_'
-- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify
-- a title for the picture.
@@ -760,26 +760,16 @@ autoUrl = mkLink <$> url
parseValid :: Parser String
parseValid = p some
where
- idChar =
- satisfy (\c -> isAlpha_ascii c
- || isDigit c
- -- N.B. '-' is placed first otherwise attoparsec thinks
- -- it belongs to a character class
- || inClass "-_.!#$%&*+/<=>?@\\|~:^" c)
+ idChar = satisfyUnicode (\c -> isAlphaNum c || isSymbolChar c || c == '_')
p p' = do
- vs' <- p' $ utf8String "⋆" <|> return <$> idChar
- let vs = concat vs'
+ vs <- p' idChar
c <- peekChar'
case c of
'`' -> return vs
'\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> p many') <|> return vs
_ -> fail "outofvalid"
--- | Parses UTF8 strings from ByteString streams.
-utf8String :: String -> Parser String
-utf8String x = decodeUtf8 <$> string (encodeUtf8 x)
-
-- | Parses identifiers with help of 'parseValid'. Asks GHC for
-- 'String' from the string it deems valid.
identifier :: Parser (DocH mod Identifier)
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
index 3f7d60f8..3430ef8a 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, BangPatterns #-}
module Documentation.Haddock.Parser.Monad (
module Documentation.Haddock.Parser.Monad
, Attoparsec.isDigit
@@ -31,9 +31,10 @@ module Documentation.Haddock.Parser.Monad (
import Control.Applicative
import Control.Monad
import Data.String
-import Data.ByteString (ByteString)
+import Data.ByteString (ByteString, length)
import qualified Data.ByteString.Lazy as LB
import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec
+import qualified Data.Attoparsec.Combinator as Attoparsec
import Control.Monad.Trans.State
import qualified Control.Monad.Trans.Class as Trans
import Data.Word
@@ -41,6 +42,7 @@ import Data.Bits
import Data.Tuple
import Documentation.Haddock.Types (Version)
+import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8)
newtype ParserState = ParserState {
parserStateSince :: Maybe Version
@@ -73,6 +75,25 @@ char = lift . Attoparsec.char
char8 :: Char -> Parser Word8
char8 = lift . Attoparsec.char8
+-- | Peek a unicode character and return the number of bytes that it took up
+peekUnicode :: Parser (Char, Int)
+peekUnicode = lift $ Attoparsec.lookAhead $ do
+
+ -- attoparsec's take fails on shorter inputs rather than truncate
+ bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1])
+
+ let !c = head . decodeUtf8 $ bs
+ !n = Data.ByteString.length . encodeUtf8 $ [c]
+ pure (c, fromIntegral n)
+
+-- | Like 'satisfy', but consuming a unicode character
+satisfyUnicode :: (Char -> Bool) -> Parser Char
+satisfyUnicode predicate = do
+ (c,n) <- peekUnicode
+ if predicate c
+ then Documentation.Haddock.Parser.Monad.take n *> pure c
+ else fail "satsifyUnicode"
+
anyChar :: Parser Char
anyChar = lift Attoparsec.anyChar
diff --git a/html-test/ref/Bug458.html b/html-test/ref/Bug458.html
new file mode 100644
index 00000000..aa99e719
--- /dev/null
+++ b/html-test/ref/Bug458.html
@@ -0,0 +1,80 @@
+<html xmlns="http://www.w3.org/1999/xhtml"
+><head
+ ><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"
+ /><title
+ >Bug458</title
+ ><link href="#" rel="stylesheet" type="text/css" title="Ocean"
+ /><link rel="stylesheet" type="text/css" href="#"
+ /><script src="haddock-bundle.min.js" async="async" type="text/javascript"
+ ></script
+ ><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"
+ ></script
+ ></head
+ ><body
+ ><div id="package-header"
+ ><ul class="links" id="page-menu"
+ ><li
+ ><a href="#"
+ >Contents</a
+ ></li
+ ><li
+ ><a href="#"
+ >Index</a
+ ></li
+ ></ul
+ ><p class="caption empty"
+ ></p
+ ></div
+ ><div id="content"
+ ><div id="module-header"
+ ><table class="info"
+ ><tr
+ ><th
+ >Safe Haskell</th
+ ><td
+ >Safe</td
+ ></tr
+ ></table
+ ><p class="caption"
+ >Bug458</p
+ ></div
+ ><div id="synopsis"
+ ><details id="syn"
+ ><summary
+ >Synopsis</summary
+ ><ul class="details-toggle" data-details-id="syn"
+ ><li class="src short"
+ ><a href="#"
+ >(&#8838;)</a
+ > :: () -&gt; () -&gt; ()</li
+ ></ul
+ ></details
+ ></div
+ ><div id="interface"
+ ><h1
+ >Documentation</h1
+ ><div class="top"
+ ><p class="src"
+ ><a id="v:-8838-" class="def"
+ >(&#8838;)</a
+ > :: () -&gt; () -&gt; () <a href="#" class="selflink"
+ >#</a
+ ></p
+ ><div class="doc"
+ ><p
+ >See the defn of <code
+ ><code
+ ><a href="#"
+ >&#8838;</a
+ ></code
+ ></code
+ >.</p
+ ></div
+ ></div
+ ></div
+ ></div
+ ><div id="footer"
+ ></div
+ ></body
+ ></html
+> \ No newline at end of file
diff --git a/html-test/src/Bug458.hs b/html-test/src/Bug458.hs
new file mode 100644
index 00000000..6a3ac9a4
--- /dev/null
+++ b/html-test/src/Bug458.hs
@@ -0,0 +1,6 @@
+module Bug458 where
+
+-- | See the defn of @'⊆'@.
+(⊆) :: () -> () -> ()
+_ ⊆ _ = ()
+