diff options
author | Alec Theriault <alec.theriault@gmail.com> | 2018-02-05 18:14:32 -0800 |
---|---|---|
committer | alexbiehl <alex.biehl@gmail.com> | 2018-03-02 15:48:36 +0100 |
commit | 44b1d87503a62181b4079962632cd61f1e158d79 (patch) | |
tree | fd7b0d45ae56891a9b876f05471a17c8655663d2 | |
parent | 69b98a99ce4de93ea0e6082bd11edb3baaf2fa6e (diff) |
Support unicode operators, proper modules
Unicode operators are a pretty big thing in Haskell, so supporting linking them
seems like it outweighs the cost of the extra machinery to force Attoparsec to
look for unicode.
Fixes #458.
-rw-r--r-- | haddock-library/haddock-library.cabal | 2 | ||||
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Parser.hs | 34 | ||||
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Parser/Monad.hs | 25 | ||||
-rw-r--r-- | html-test/ref/Bug458.html | 80 | ||||
-rw-r--r-- | html-test/src/Bug458.hs | 6 |
5 files changed, 122 insertions, 25 deletions
diff --git a/haddock-library/haddock-library.cabal b/haddock-library/haddock-library.cabal index 3d069f07..d7935747 100644 --- a/haddock-library/haddock-library.cabal +++ b/haddock-library/haddock-library.cabal @@ -64,13 +64,13 @@ library attoparsec exposed-modules: Data.Attoparsec.ByteString Data.Attoparsec.ByteString.Char8 + Data.Attoparsec.Combinator other-modules: Data.Attoparsec Data.Attoparsec.ByteString.Buffer Data.Attoparsec.ByteString.FastSet Data.Attoparsec.ByteString.Internal - Data.Attoparsec.Combinator Data.Attoparsec.Internal Data.Attoparsec.Internal.Fhthagn Data.Attoparsec.Internal.Types diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs index a1349c95..82515ab4 100644 --- a/haddock-library/src/Documentation/Haddock/Parser.hs +++ b/haddock-library/src/Documentation/Haddock/Parser.hs @@ -23,7 +23,7 @@ import Control.Applicative import Control.Arrow (first) import Control.Monad import qualified Data.ByteString.Char8 as BS -import Data.Char (chr, isAsciiUpper) +import Data.Char (chr, isUpper, isAlpha, isAlphaNum) import Data.List (stripPrefix, intercalate, unfoldr, elemIndex) import Data.Maybe (fromMaybe, mapMaybe) import Data.Monoid @@ -35,6 +35,7 @@ import Documentation.Haddock.Types import Documentation.Haddock.Utf8 import Prelude hiding (takeWhile) import qualified Prelude as P +import Text.Read.Lex (isSymbolChar) -- $setup -- >>> :set -XOverloadedStrings @@ -205,20 +206,19 @@ monospace :: Parser (DocH mod Identifier) monospace = DocMonospaced . parseStringBS <$> ("@" *> takeWhile1_ (/= '@') <* "@") --- | Module names: we try our reasonable best to only allow valid --- Haskell module names, with caveat about not matching on technically --- valid unicode symbols. +-- | Module names. +-- +-- Note that we allow '#' and '\' to support anchors (old style anchors are of +-- the form "SomeModule\#anchor"). moduleName :: Parser (DocH mod a) moduleName = DocModule <$> (char '"' *> modid <* char '"') where modid = intercalate "." <$> conid `sepBy1` "." conid = (:) - <$> satisfy isAsciiUpper - -- NOTE: According to Haskell 2010 we should actually only - -- accept {small | large | digit | ' } here. But as we can't - -- match on unicode characters, this is currently not possible. - -- Note that we allow ‘#’ to suport anchors. - <*> (decodeUtf8 <$> takeWhile (notInClass " .&[{}(=*)+]!|@/;,^?\"\n")) + <$> satisfyUnicode (\c -> isAlpha c && isUpper c) + <*> many (satisfyUnicode conChar <|> char '\\' <|> char '#') + + conChar c = isAlphaNum c || c == '_' -- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify -- a title for the picture. @@ -760,26 +760,16 @@ autoUrl = mkLink <$> url parseValid :: Parser String parseValid = p some where - idChar = - satisfy (\c -> isAlpha_ascii c - || isDigit c - -- N.B. '-' is placed first otherwise attoparsec thinks - -- it belongs to a character class - || inClass "-_.!#$%&*+/<=>?@\\|~:^" c) + idChar = satisfyUnicode (\c -> isAlphaNum c || isSymbolChar c || c == '_') p p' = do - vs' <- p' $ utf8String "⋆" <|> return <$> idChar - let vs = concat vs' + vs <- p' idChar c <- peekChar' case c of '`' -> return vs '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> p many') <|> return vs _ -> fail "outofvalid" --- | Parses UTF8 strings from ByteString streams. -utf8String :: String -> Parser String -utf8String x = decodeUtf8 <$> string (encodeUtf8 x) - -- | Parses identifiers with help of 'parseValid'. Asks GHC for -- 'String' from the string it deems valid. identifier :: Parser (DocH mod Identifier) diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index 3f7d60f8..3430ef8a 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, BangPatterns #-} module Documentation.Haddock.Parser.Monad ( module Documentation.Haddock.Parser.Monad , Attoparsec.isDigit @@ -31,9 +31,10 @@ module Documentation.Haddock.Parser.Monad ( import Control.Applicative import Control.Monad import Data.String -import Data.ByteString (ByteString) +import Data.ByteString (ByteString, length) import qualified Data.ByteString.Lazy as LB import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec +import qualified Data.Attoparsec.Combinator as Attoparsec import Control.Monad.Trans.State import qualified Control.Monad.Trans.Class as Trans import Data.Word @@ -41,6 +42,7 @@ import Data.Bits import Data.Tuple import Documentation.Haddock.Types (Version) +import Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) newtype ParserState = ParserState { parserStateSince :: Maybe Version @@ -73,6 +75,25 @@ char = lift . Attoparsec.char char8 :: Char -> Parser Word8 char8 = lift . Attoparsec.char8 +-- | Peek a unicode character and return the number of bytes that it took up +peekUnicode :: Parser (Char, Int) +peekUnicode = lift $ Attoparsec.lookAhead $ do + + -- attoparsec's take fails on shorter inputs rather than truncate + bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1]) + + let !c = head . decodeUtf8 $ bs + !n = Data.ByteString.length . encodeUtf8 $ [c] + pure (c, fromIntegral n) + +-- | Like 'satisfy', but consuming a unicode character +satisfyUnicode :: (Char -> Bool) -> Parser Char +satisfyUnicode predicate = do + (c,n) <- peekUnicode + if predicate c + then Documentation.Haddock.Parser.Monad.take n *> pure c + else fail "satsifyUnicode" + anyChar :: Parser Char anyChar = lift Attoparsec.anyChar diff --git a/html-test/ref/Bug458.html b/html-test/ref/Bug458.html new file mode 100644 index 00000000..aa99e719 --- /dev/null +++ b/html-test/ref/Bug458.html @@ -0,0 +1,80 @@ +<html xmlns="http://www.w3.org/1999/xhtml" +><head + ><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" + /><title + >Bug458</title + ><link href="#" rel="stylesheet" type="text/css" title="Ocean" + /><link rel="stylesheet" type="text/css" href="#" + /><script src="haddock-bundle.min.js" async="async" type="text/javascript" + ></script + ><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript" + ></script + ></head + ><body + ><div id="package-header" + ><ul class="links" id="page-menu" + ><li + ><a href="#" + >Contents</a + ></li + ><li + ><a href="#" + >Index</a + ></li + ></ul + ><p class="caption empty" + ></p + ></div + ><div id="content" + ><div id="module-header" + ><table class="info" + ><tr + ><th + >Safe Haskell</th + ><td + >Safe</td + ></tr + ></table + ><p class="caption" + >Bug458</p + ></div + ><div id="synopsis" + ><details id="syn" + ><summary + >Synopsis</summary + ><ul class="details-toggle" data-details-id="syn" + ><li class="src short" + ><a href="#" + >(⊆)</a + > :: () -> () -> ()</li + ></ul + ></details + ></div + ><div id="interface" + ><h1 + >Documentation</h1 + ><div class="top" + ><p class="src" + ><a id="v:-8838-" class="def" + >(⊆)</a + > :: () -> () -> () <a href="#" class="selflink" + >#</a + ></p + ><div class="doc" + ><p + >See the defn of <code + ><code + ><a href="#" + >⊆</a + ></code + ></code + >.</p + ></div + ></div + ></div + ></div + ><div id="footer" + ></div + ></body + ></html +>
\ No newline at end of file diff --git a/html-test/src/Bug458.hs b/html-test/src/Bug458.hs new file mode 100644 index 00000000..6a3ac9a4 --- /dev/null +++ b/html-test/src/Bug458.hs @@ -0,0 +1,6 @@ +module Bug458 where + +-- | See the defn of @'⊆'@. +(⊆) :: () -> () -> () +_ ⊆ _ = () + |