Support unicode operators, proper modules

Unicode operators are a pretty big thing in Haskell, so supporting linking them seems like it outweighs the cost of the extra machinery to force Attoparsec to look for unicode. Fixes #458.
author: Alec Theriault <alec.theriault@gmail.com> 2018-02-05 18:14:32 -0800
committer: alexbiehl <alex.biehl@gmail.com> 2018-03-02 15:48:36 +0100
commit: 44b1d87503a62181b4079962632cd61f1e158d79 (patch)
tree: fd7b0d45ae56891a9b876f05471a17c8655663d2 /haddock-library/src/Documentation/Haddock
parent: 69b98a99ce4de93ea0e6082bd11edb3baaf2fa6e (diff)
2 files changed, 35 insertions, 24 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs
index a1349c95..82515ab4 100644
--- a/haddock-library/src/Documentation/Haddock/Parser.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser.hs
@@ -23,7 +23,7 @@ import           Control.Applicative
 import           Control.Arrow (first)
 import           Control.Monad
 import qualified Data.ByteString.Char8 as BS
-import           Data.Char (chr, isAsciiUpper)
+import           Data.Char (chr, isUpper, isAlpha, isAlphaNum)
 import           Data.List (stripPrefix, intercalate, unfoldr, elemIndex)
 import           Data.Maybe (fromMaybe, mapMaybe)
 import           Data.Monoid
@@ -35,6 +35,7 @@ import           Documentation.Haddock.Types
 import           Documentation.Haddock.Utf8
 import           Prelude hiding (takeWhile)
 import qualified Prelude as P
+import           Text.Read.Lex (isSymbolChar)
 
 -- $setup
 -- >>> :set -XOverloadedStrings
@@ -205,20 +206,19 @@ monospace :: Parser (DocH mod Identifier)
 monospace = DocMonospaced . parseStringBS
             <$> ("@" *> takeWhile1_ (/= '@') <* "@")
 
--- | Module names: we try our reasonable best to only allow valid
--- Haskell module names, with caveat about not matching on technically
--- valid unicode symbols.
+-- | Module names.
+--
+-- Note that we allow '#' and '\' to support anchors (old style anchors are of
+-- the form "SomeModule\#anchor").
 moduleName :: Parser (DocH mod a)
 moduleName = DocModule <$> (char '"' *> modid <* char '"')
   where
     modid = intercalate "." <$> conid `sepBy1` "."
     conid = (:)
-      <$> satisfy isAsciiUpper
-      -- NOTE: According to Haskell 2010 we should actually only
-      -- accept {small | large | digit | ' } here.  But as we can't
-      -- match on unicode characters, this is currently not possible.
-      -- Note that we allow ‘#’ to suport anchors.
-      <*> (decodeUtf8 <$> takeWhile (notInClass " .&[{}(=*)+]!|@/;,^?\"\n"))
+      <$> satisfyUnicode (\c -> isAlpha c && isUpper c)
+      <*> many (satisfyUnicode conChar <|> char '\\' <|> char '#')
+
+    conChar c = isAlphaNum c || c == '_'
 
 -- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify
 -- a title for the picture.
@@ -760,26 +760,16 @@ autoUrl = mkLink <$> url
 parseValid :: Parser String
 parseValid = p some
   where
-    idChar =
-      satisfy (\c -> isAlpha_ascii c
-                     || isDigit c
-                     -- N.B. '-' is placed first otherwise attoparsec thinks
-                     -- it belongs to a character class
-                     || inClass "-_.!#$%&*+/<=>?@\\|~:^" c)
+    idChar = satisfyUnicode (\c -> isAlphaNum c || isSymbolChar c || c == '_')
 
     p p' = do
-      vs' <- p' $ utf8String "⋆" <|> return <$> idChar
-      let vs = concat vs'
+      vs <- p' idChar
       c <- peekChar'
       case c of
         '`' -> return vs
         '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> p many') <|> return vs
         _ -> fail "outofvalid"
 
--- | Parses UTF8 strings from ByteString streams.
-utf8String :: String -> Parser String
-utf8String x = decodeUtf8 <$> string (encodeUtf8 x)
-
 -- | Parses identifiers with help of 'parseValid'. Asks GHC for
 -- 'String' from the string it deems valid.
 identifier :: Parser (DocH mod Identifier)
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
index 3f7d60f8..3430ef8a 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, BangPatterns #-}
 module Documentation.Haddock.Parser.Monad (
   module Documentation.Haddock.Parser.Monad
 , Attoparsec.isDigit
@@ -31,9 +31,10 @@ module Documentation.Haddock.Parser.Monad (
 import           Control.Applicative
 import           Control.Monad
 import           Data.String
-import           Data.ByteString (ByteString)
+import           Data.ByteString (ByteString, length)
 import qualified Data.ByteString.Lazy as LB
 import qualified Data.Attoparsec.ByteString.Char8 as Attoparsec
+import qualified Data.Attoparsec.Combinator as Attoparsec
 import           Control.Monad.Trans.State
 import qualified Control.Monad.Trans.Class as Trans
 import           Data.Word
@@ -41,6 +42,7 @@ import           Data.Bits
 import           Data.Tuple
 
 import           Documentation.Haddock.Types (Version)
+import           Documentation.Haddock.Utf8  (encodeUtf8, decodeUtf8)
 
 newtype ParserState = ParserState {
   parserStateSince :: Maybe Version
@@ -73,6 +75,25 @@ char = lift . Attoparsec.char
 char8 :: Char -> Parser Word8
 char8 = lift . Attoparsec.char8
 
+-- | Peek a unicode character and return the number of bytes that it took up
+peekUnicode :: Parser (Char, Int)
+peekUnicode = lift $ Attoparsec.lookAhead $ do
+  
+  -- attoparsec's take fails on shorter inputs rather than truncate
+  bs <- Attoparsec.choice (map Attoparsec.take [4,3,2,1])
+  
+  let !c = head . decodeUtf8 $ bs
+      !n = Data.ByteString.length . encodeUtf8 $ [c]
+  pure (c, fromIntegral n)
+
+-- | Like 'satisfy', but consuming a unicode character
+satisfyUnicode :: (Char -> Bool) -> Parser Char
+satisfyUnicode predicate = do
+  (c,n) <- peekUnicode
+  if predicate c
+    then Documentation.Haddock.Parser.Monad.take n *> pure c
+    else fail "satsifyUnicode"
+
 anyChar :: Parser Char
 anyChar = lift Attoparsec.anyChar
author	Alec Theriault <alec.theriault@gmail.com>	2018-02-05 18:14:32 -0800
committer	alexbiehl <alex.biehl@gmail.com>	2018-03-02 15:48:36 +0100
commit	44b1d87503a62181b4079962632cd61f1e158d79 (patch)
tree	fd7b0d45ae56891a9b876f05471a17c8655663d2 /haddock-library/src/Documentation/Haddock
parent	69b98a99ce4de93ea0e6082bd11edb3baaf2fa6e (diff)