Better identifier parsing

* '(<|>)' and '`elem`' now get parsed and rendered properly as links * 'DbModule'/'DbUnitId' now properly get split apart into two links * tuple names now get parsed properly * some more small niceties... The identifier parsing code is more precise and more efficient (although to be fair: it is also longer and in its own module). On the rendering side, we need to pipe through information about backticks/parens/neither all the way through from renaming to the backends. In terms of impact: a total of 35 modules in the entirety of the bootlib + ghc lib docs change. The only "regression" is things like '\0'. These should be changed to @\\0@ (the path by which this previously worked seems accidental).
author: Alec Theriault <alec.theriault@gmail.com> 2019-02-06 01:01:41 -0800
committer: Alec Theriault <alec.theriault@gmail.com> 2019-02-25 00:42:46 -0800
commit: a5199600c39d25d7b71dcb2328000c1c49ad95a2 (patch)
tree: 787057c0315d1adf98cab3769ad47b63cb3c0a94 /haddock-library/src/Documentation
parent: dd47029cb29c80b1ab4db520c9c2ce4dca37f833 (diff)
3 files changed, 202 insertions, 60 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs
index e9b1c496..36c8bb5b 100644
--- a/haddock-library/src/Documentation/Haddock/Parser.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser.hs
@@ -27,8 +27,7 @@ module Documentation.Haddock.Parser (
 import           Control.Applicative
 import           Control.Arrow (first)
 import           Control.Monad
-import           Data.Char (chr, isUpper, isAlpha, isAlphaNum, isSpace)
-import           Data.Foldable (asum)
+import           Data.Char (chr, isUpper, isAlpha, isSpace)
 import           Data.List (intercalate, unfoldr, elemIndex)
 import           Data.Maybe (fromMaybe, mapMaybe)
 import           Data.Monoid
@@ -37,6 +36,7 @@ import           Documentation.Haddock.Doc
 import           Documentation.Haddock.Markup ( markup, plainMarkup )
 import           Documentation.Haddock.Parser.Monad
 import           Documentation.Haddock.Parser.Util
+import           Documentation.Haddock.Parser.Identifier
 import           Documentation.Haddock.Types
 import           Prelude hiding (takeWhile)
 import qualified Prelude as P
@@ -47,37 +47,10 @@ import           Text.Parsec (try)
 import qualified Data.Text as T
 import           Data.Text (Text)
 
-#if MIN_VERSION_base(4,9,0)
-import           Text.Read.Lex                      (isSymbolChar)
-#else
-import           Data.Char                          (GeneralCategory (..),
-                                                     generalCategory)
-#endif
 
 -- $setup
 -- >>> :set -XOverloadedStrings
 
-#if !MIN_VERSION_base(4,9,0)
--- inlined from base-4.10.0.0
-isSymbolChar :: Char -> Bool
-isSymbolChar c = not (isPuncChar c) && case generalCategory c of
-    MathSymbol           -> True
-    CurrencySymbol       -> True
-    ModifierSymbol       -> True
-    OtherSymbol          -> True
-    DashPunctuation      -> True
-    OtherPunctuation     -> c `notElem` ("'\"" :: String)
-    ConnectorPunctuation -> c /= '_'
-    _                    -> False
-  where
-    -- | The @special@ character class as defined in the Haskell Report.
-    isPuncChar :: Char -> Bool
-    isPuncChar = (`elem` (",;()[]{}`" :: String))
-#endif
-
--- | Identifier string surrounded with opening and closing quotes/backticks.
-data Identifier = Identifier !Namespace !Char String !Char
-
 -- | Drops the quotes/backticks around all identifiers, as if they
 -- were valid but still 'String's.
 toRegular :: DocH mod Identifier -> DocH mod String
@@ -838,34 +811,6 @@ autoUrl = mkLink <$> url
     mkHyperlink lnk = Hyperlink (T.unpack lnk) Nothing
 
 
-
--- | Parses strings between identifier delimiters. Consumes all input that it
--- deems to be valid in an identifier. Note that it simply blindly consumes
--- characters and does no actual validation itself.
-parseValid :: Parser String
-parseValid = p some
-  where
-    idChar = Parsec.satisfy (\c -> isAlphaNum c || isSymbolChar c || c == '_')
-
-    p p' = do
-      vs <- p' idChar
-      c <- peekChar'
-      case c of
-        '`' -> return vs
-        '\'' -> choice' [ (\x -> vs ++ "'" ++ x) <$> ("'" *> p many), return vs ]
-        _ -> fail "outofvalid"
-
--- | Parses identifiers with help of 'parseValid'. Asks GHC for
--- 'String' from the string it deems valid.
+-- | Parses identifiers with help of 'parseValid'.
 identifier :: Parser (DocH mod Identifier)
-identifier = do
-  ns <- asum [ Value <$ Parsec.char 'v'
-             , Type <$ Parsec.char 't'
-             , pure None
-             ]
-  o <- idDelim
-  vid <- parseValid
-  e <- idDelim
-  return $ DocIdentifier (Identifier ns o vid e)
-  where
-    idDelim = Parsec.oneOf "'`"
+identifier = DocIdentifier <$> parseValid
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs b/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs
new file mode 100644
index 00000000..7bc98b62
--- /dev/null
+++ b/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs
@@ -0,0 +1,186 @@
+{-# LANGUAGE CPP          #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE ViewPatterns #-}
+-- |
+-- Module      :  Documentation.Haddock.Parser.Identifier
+-- Copyright   :  (c) Alec Theriault 2019,
+-- License     :  BSD-like
+--
+-- Maintainer  :  haddock@projects.haskell.org
+-- Stability   :  experimental
+-- Portability :  portable
+--
+-- Functionality for parsing identifiers and operators
+
+module Documentation.Haddock.Parser.Identifier (
+  Identifier(..),
+  parseValid,
+) where
+
+import Documentation.Haddock.Types           ( Namespace(..) )
+import Documentation.Haddock.Parser.Monad
+import qualified Text.Parsec as Parsec
+import           Text.Parsec.Pos             ( updatePosChar )
+import           Text.Parsec                 ( State(..)
+                                             , getParserState, setParserState )
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+import           Data.Char (isAlpha, isAlphaNum)
+import Control.Monad (guard)
+import Data.Functor (($>))
+#if MIN_VERSION_base(4,9,0)
+import           Text.Read.Lex                      (isSymbolChar)
+#else
+import           Data.Char                          (GeneralCategory (..),
+                                                     generalCategory)
+#endif
+
+import Data.Maybe
+
+-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks.
+data Identifier = Identifier !Namespace !Char String !Char
+  deriving (Show, Eq)
+
+parseValid :: Parser Identifier
+parseValid = do
+  s@State{ stateInput = inp, statePos = pos } <- getParserState
+
+  case takeIdentifier inp of
+    Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier"
+    Just (ns, op, ident, cl, inp') ->
+      let posOp = updatePosChar pos op
+          posIdent = T.foldl updatePosChar posOp ident
+          posCl = updatePosChar posIdent cl
+          s' = s{ stateInput = inp', statePos = posCl }
+      in setParserState s' $> Identifier ns op (T.unpack ident) cl
+
+
+#if !MIN_VERSION_base(4,9,0)
+-- inlined from base-4.10.0.0
+isSymbolChar :: Char -> Bool
+isSymbolChar c = not (isPuncChar c) && case generalCategory c of
+    MathSymbol           -> True
+    CurrencySymbol       -> True
+    ModifierSymbol       -> True
+    OtherSymbol          -> True
+    DashPunctuation      -> True
+    OtherPunctuation     -> c `notElem` "'\""
+    ConnectorPunctuation -> c /= '_'
+    _                    -> False
+  where
+    -- | The @special@ character class as defined in the Haskell Report.
+    isPuncChar :: Char -> Bool
+    isPuncChar = (`elem` (",;()[]{}`" :: String))
+#endif
+
+-- | Try to parse a delimited identifier off the front of the given input.
+--
+-- This tries to match as many valid Haskell identifiers/operators as possible,
+-- to the point of sometimes accepting invalid things (ex: keywords). Some
+-- considerations:
+--
+--   - operators and identifiers can have module qualifications
+--   - operators can be wrapped in parens (for prefix)
+--   - identifiers can be wrapped in backticks (for infix)
+--   - delimiters are backticks or regular ticks
+--   - since regular ticks are also valid in identifiers, we opt for the
+--     longest successful parse
+--
+-- This function should make /O(1)/ allocations
+takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text)
+takeIdentifier input = listToMaybe $ do
+
+    -- Optional namespace
+    let (ns, input') = case T.uncons input of
+                         Just ('v', i) -> (Value, i)
+                         Just ('t', i) -> (Type, i)
+                         _             -> (None, input)
+
+    -- Opening tick
+    (op, input'') <- maybeToList (T.uncons input')
+    guard (op == '\'' || op == '`')
+
+    -- Identifier/operator
+    (ident, input''') <- wrapped input''
+
+    -- Closing tick
+    (cl, input'''') <- maybeToList (T.uncons input''')
+    guard (cl == '\'' || cl == '`')
+
+    pure (ns, op, ident, cl, input'''')
+
+  where
+
+    -- | Parse out a wrapped, possibly qualified, operator or identifier
+    wrapped t = do
+      (c, t'  ) <- maybeToList (T.uncons t)
+      -- Tuples
+      case c of
+        '(' | Just (c', _) <- T.uncons t'
+            , c' == ',' || c' == ')'
+            -> do let (commas, t'') = T.span (== ',') t'
+                  (')', t''') <- maybeToList (T.uncons t'')
+                  pure (T.take (T.length commas + 2) t, t''')
+
+        -- Parenthesized
+        '(' -> do (n,   t'' ) <- general False 0 [] t'
+                  (')', t''') <- maybeToList (T.uncons t'')
+                  pure (T.take (n + 2) t, t''')
+
+        -- Backticked
+        '`' -> do (n,   t'' ) <- general False 0 [] t'
+                  ('`', t''') <- maybeToList (T.uncons t'')
+                  pure (T.take (n + 2) t, t''')
+
+        -- Unadorned
+        _   -> do (n,   t'' ) <- general False 0 [] t
+                  pure (T.take n t, t'')
+
+    -- | Parse out a possibly qualified operator or identifier
+    general :: Bool           -- ^ refuse inputs starting with operators
+            -> Int            -- ^ total characters \"consumed\" so far
+            -> [(Int, Text)]  -- ^ accumulated results
+            -> Text           -- ^ current input
+            -> [(Int, Text)]  -- ^ total characters parsed & what remains
+    general !identOnly !i acc t
+      -- Starts with an identifier (either just an identifier, or a module qual)
+      | Just (n, rest) <- identLike t
+      = if T.null rest
+          then acc
+          else case T.head rest of
+                 '`' -> (n + i, rest) : acc
+                 ')' -> (n + i, rest) : acc
+                 '.' -> general False (n + i + 1) acc (T.tail rest)
+                 '\'' -> let (m, rest') = quotes rest
+                         in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (T.tail rest')
+                 _ -> acc
+
+      -- An operator
+      | Just (n, rest) <- optr t
+      , not identOnly
+      = (n + i, rest) : acc
+
+      -- Anything else
+      | otherwise
+      = acc
+
+    -- | Parse an identifier off the front of the input
+    identLike t
+      | T.null t = Nothing
+      | isAlpha (T.head t) || '_' == T.head t
+      = let !(idt, rest) = T.span (\c -> isAlphaNum c || c == '_') t
+            !(octos, rest') = T.span (== '#') rest
+      in Just (T.length idt + T.length octos, rest')
+      | otherwise = Nothing
+
+    -- | Parse all but the last quote off the front of the input
+    -- PRECONDITION: T.head t == '\''
+    quotes :: Text -> (Int, Text)
+    quotes t = let !n = T.length (T.takeWhile (== '\'') t) - 1
+               in (n, T.drop n t)
+
+    -- | Parse an operator off the front of the input
+    optr t = let !(op, rest) = T.span isSymbolChar t
+             in if T.null op then Nothing else Just (T.length op, rest)
diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
index 8f5bd217..fa46f536 100644
--- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
+++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs
@@ -4,6 +4,18 @@
 {-# LANGUAGE TypeFamilies #-}
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE TypeSynonymInstances #-}
+-- |
+-- Module      :  Documentation.Haddock.Parser.Monad
+-- Copyright   :  (c) Alec Theriault 2018-2019,
+-- License     :  BSD-like
+--
+-- Maintainer  :  haddock@projects.haskell.org
+-- Stability   :  experimental
+-- Portability :  portable
+--
+-- Defines the Parsec monad over which all parsing is done and also provides
+-- more efficient versions of the usual parsec combinator functions (but
+-- specialized to 'Text').
 
 module Documentation.Haddock.Parser.Monad where
 
@@ -96,7 +108,6 @@ takeWhile f = do
       s' = s{ stateInput = inp', statePos = pos' }
   setParserState s' $> t
 
-
 -- | Like 'takeWhile', but fails if no characters matched.
 --
 -- Equivalent to @fmap T.pack . Parsec.many1@, but more efficient.
author	Alec Theriault <alec.theriault@gmail.com>	2019-02-06 01:01:41 -0800
committer	Alec Theriault <alec.theriault@gmail.com>	2019-02-25 00:42:46 -0800
commit	a5199600c39d25d7b71dcb2328000c1c49ad95a2 (patch)
tree	787057c0315d1adf98cab3769ad47b63cb3c0a94 /haddock-library/src/Documentation
parent	dd47029cb29c80b1ab4db520c9c2ce4dca37f833 (diff)