diff options
| author | Alec Theriault <alec.theriault@gmail.com> | 2019-02-06 01:01:41 -0800 | 
|---|---|---|
| committer | Alec Theriault <alec.theriault@gmail.com> | 2019-02-25 00:42:46 -0800 | 
| commit | a5199600c39d25d7b71dcb2328000c1c49ad95a2 (patch) | |
| tree | 787057c0315d1adf98cab3769ad47b63cb3c0a94 /haddock-library | |
| parent | dd47029cb29c80b1ab4db520c9c2ce4dca37f833 (diff) | |
Better identifier parsing
  * '(<|>)' and '`elem`' now get parsed and rendered properly as links
  * 'DbModule'/'DbUnitId' now properly get split apart into two links
  * tuple names now get parsed properly
  * some more small niceties...
The identifier parsing code is more precise and more efficient (although to be
fair: it is also longer and in its own module). On the rendering side, we need
to pipe through information about backticks/parens/neither all the way through
from renaming to the backends.
In terms of impact: a total of 35 modules in the entirety of the bootlib + ghc
lib docs change. The only "regression" is things like '\0'. These should be
changed to @\\0@ (the path by which this previously worked seems accidental).
Diffstat (limited to 'haddock-library')
5 files changed, 212 insertions, 61 deletions
| diff --git a/haddock-library/haddock-library.cabal b/haddock-library/haddock-library.cabal index b24db5d4..5475d61b 100644 --- a/haddock-library/haddock-library.cabal +++ b/haddock-library/haddock-library.cabal @@ -49,6 +49,7 @@ library    other-modules:      Documentation.Haddock.Parser.Util      Documentation.Haddock.Parser.Monad +    Documentation.Haddock.Parser.Identifier  test-suite spec    import: lib-defaults @@ -70,6 +71,7 @@ test-suite spec        Documentation.Haddock.Parser.UtilSpec        Documentation.Haddock.ParserSpec        Documentation.Haddock.Types +      Documentation.Haddock.Parser.Identifier    build-depends:      , base-compat  ^>= 0.9.3 || ^>= 0.10.0 diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs index e9b1c496..36c8bb5b 100644 --- a/haddock-library/src/Documentation/Haddock/Parser.hs +++ b/haddock-library/src/Documentation/Haddock/Parser.hs @@ -27,8 +27,7 @@ module Documentation.Haddock.Parser (  import           Control.Applicative  import           Control.Arrow (first)  import           Control.Monad -import           Data.Char (chr, isUpper, isAlpha, isAlphaNum, isSpace) -import           Data.Foldable (asum) +import           Data.Char (chr, isUpper, isAlpha, isSpace)  import           Data.List (intercalate, unfoldr, elemIndex)  import           Data.Maybe (fromMaybe, mapMaybe)  import           Data.Monoid @@ -37,6 +36,7 @@ import           Documentation.Haddock.Doc  import           Documentation.Haddock.Markup ( markup, plainMarkup )  import           Documentation.Haddock.Parser.Monad  import           Documentation.Haddock.Parser.Util +import           Documentation.Haddock.Parser.Identifier  import           Documentation.Haddock.Types  import           Prelude hiding (takeWhile)  import qualified Prelude as P @@ -47,37 +47,10 @@ import           Text.Parsec (try)  import qualified Data.Text as T  import           Data.Text (Text) -#if MIN_VERSION_base(4,9,0) -import           Text.Read.Lex                      (isSymbolChar) -#else -import           Data.Char                          (GeneralCategory (..), -                                                     generalCategory) -#endif  -- $setup  -- >>> :set -XOverloadedStrings -#if !MIN_VERSION_base(4,9,0) --- inlined from base-4.10.0.0 -isSymbolChar :: Char -> Bool -isSymbolChar c = not (isPuncChar c) && case generalCategory c of -    MathSymbol           -> True -    CurrencySymbol       -> True -    ModifierSymbol       -> True -    OtherSymbol          -> True -    DashPunctuation      -> True -    OtherPunctuation     -> c `notElem` ("'\"" :: String) -    ConnectorPunctuation -> c /= '_' -    _                    -> False -  where -    -- | The @special@ character class as defined in the Haskell Report. -    isPuncChar :: Char -> Bool -    isPuncChar = (`elem` (",;()[]{}`" :: String)) -#endif - --- | Identifier string surrounded with opening and closing quotes/backticks. -data Identifier = Identifier !Namespace !Char String !Char -  -- | Drops the quotes/backticks around all identifiers, as if they  -- were valid but still 'String's.  toRegular :: DocH mod Identifier -> DocH mod String @@ -838,34 +811,6 @@ autoUrl = mkLink <$> url      mkHyperlink lnk = Hyperlink (T.unpack lnk) Nothing - --- | Parses strings between identifier delimiters. Consumes all input that it --- deems to be valid in an identifier. Note that it simply blindly consumes --- characters and does no actual validation itself. -parseValid :: Parser String -parseValid = p some -  where -    idChar = Parsec.satisfy (\c -> isAlphaNum c || isSymbolChar c || c == '_') - -    p p' = do -      vs <- p' idChar -      c <- peekChar' -      case c of -        '`' -> return vs -        '\'' -> choice' [ (\x -> vs ++ "'" ++ x) <$> ("'" *> p many), return vs ] -        _ -> fail "outofvalid" - --- | Parses identifiers with help of 'parseValid'. Asks GHC for --- 'String' from the string it deems valid. +-- | Parses identifiers with help of 'parseValid'.  identifier :: Parser (DocH mod Identifier) -identifier = do -  ns <- asum [ Value <$ Parsec.char 'v' -             , Type <$ Parsec.char 't' -             , pure None -             ] -  o <- idDelim -  vid <- parseValid -  e <- idDelim -  return $ DocIdentifier (Identifier ns o vid e) -  where -    idDelim = Parsec.oneOf "'`" +identifier = DocIdentifier <$> parseValid diff --git a/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs b/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs new file mode 100644 index 00000000..7bc98b62 --- /dev/null +++ b/haddock-library/src/Documentation/Haddock/Parser/Identifier.hs @@ -0,0 +1,186 @@ +{-# LANGUAGE CPP          #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ViewPatterns #-} +-- | +-- Module      :  Documentation.Haddock.Parser.Identifier +-- Copyright   :  (c) Alec Theriault 2019, +-- License     :  BSD-like +-- +-- Maintainer  :  haddock@projects.haskell.org +-- Stability   :  experimental +-- Portability :  portable +-- +-- Functionality for parsing identifiers and operators + +module Documentation.Haddock.Parser.Identifier ( +  Identifier(..), +  parseValid, +) where + +import Documentation.Haddock.Types           ( Namespace(..) ) +import Documentation.Haddock.Parser.Monad +import qualified Text.Parsec as Parsec +import           Text.Parsec.Pos             ( updatePosChar ) +import           Text.Parsec                 ( State(..) +                                             , getParserState, setParserState ) + +import Data.Text (Text) +import qualified Data.Text as T + +import           Data.Char (isAlpha, isAlphaNum) +import Control.Monad (guard) +import Data.Functor (($>)) +#if MIN_VERSION_base(4,9,0) +import           Text.Read.Lex                      (isSymbolChar) +#else +import           Data.Char                          (GeneralCategory (..), +                                                     generalCategory) +#endif + +import Data.Maybe + +-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks. +data Identifier = Identifier !Namespace !Char String !Char +  deriving (Show, Eq) + +parseValid :: Parser Identifier +parseValid = do +  s@State{ stateInput = inp, statePos = pos } <- getParserState + +  case takeIdentifier inp of +    Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier" +    Just (ns, op, ident, cl, inp') -> +      let posOp = updatePosChar pos op +          posIdent = T.foldl updatePosChar posOp ident +          posCl = updatePosChar posIdent cl +          s' = s{ stateInput = inp', statePos = posCl } +      in setParserState s' $> Identifier ns op (T.unpack ident) cl + + +#if !MIN_VERSION_base(4,9,0) +-- inlined from base-4.10.0.0 +isSymbolChar :: Char -> Bool +isSymbolChar c = not (isPuncChar c) && case generalCategory c of +    MathSymbol           -> True +    CurrencySymbol       -> True +    ModifierSymbol       -> True +    OtherSymbol          -> True +    DashPunctuation      -> True +    OtherPunctuation     -> c `notElem` "'\"" +    ConnectorPunctuation -> c /= '_' +    _                    -> False +  where +    -- | The @special@ character class as defined in the Haskell Report. +    isPuncChar :: Char -> Bool +    isPuncChar = (`elem` (",;()[]{}`" :: String)) +#endif + +-- | Try to parse a delimited identifier off the front of the given input. +-- +-- This tries to match as many valid Haskell identifiers/operators as possible, +-- to the point of sometimes accepting invalid things (ex: keywords). Some +-- considerations: +-- +--   - operators and identifiers can have module qualifications +--   - operators can be wrapped in parens (for prefix) +--   - identifiers can be wrapped in backticks (for infix) +--   - delimiters are backticks or regular ticks +--   - since regular ticks are also valid in identifiers, we opt for the +--     longest successful parse +-- +-- This function should make /O(1)/ allocations +takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text) +takeIdentifier input = listToMaybe $ do + +    -- Optional namespace +    let (ns, input') = case T.uncons input of +                         Just ('v', i) -> (Value, i) +                         Just ('t', i) -> (Type, i) +                         _             -> (None, input) + +    -- Opening tick +    (op, input'') <- maybeToList (T.uncons input') +    guard (op == '\'' || op == '`') + +    -- Identifier/operator +    (ident, input''') <- wrapped input'' + +    -- Closing tick +    (cl, input'''') <- maybeToList (T.uncons input''') +    guard (cl == '\'' || cl == '`') + +    pure (ns, op, ident, cl, input'''') + +  where + +    -- | Parse out a wrapped, possibly qualified, operator or identifier +    wrapped t = do +      (c, t'  ) <- maybeToList (T.uncons t) +      -- Tuples +      case c of +        '(' | Just (c', _) <- T.uncons t' +            , c' == ',' || c' == ')' +            -> do let (commas, t'') = T.span (== ',') t' +                  (')', t''') <- maybeToList (T.uncons t'') +                  pure (T.take (T.length commas + 2) t, t''') + +        -- Parenthesized +        '(' -> do (n,   t'' ) <- general False 0 [] t' +                  (')', t''') <- maybeToList (T.uncons t'') +                  pure (T.take (n + 2) t, t''') + +        -- Backticked +        '`' -> do (n,   t'' ) <- general False 0 [] t' +                  ('`', t''') <- maybeToList (T.uncons t'') +                  pure (T.take (n + 2) t, t''') + +        -- Unadorned +        _   -> do (n,   t'' ) <- general False 0 [] t +                  pure (T.take n t, t'') + +    -- | Parse out a possibly qualified operator or identifier +    general :: Bool           -- ^ refuse inputs starting with operators +            -> Int            -- ^ total characters \"consumed\" so far +            -> [(Int, Text)]  -- ^ accumulated results +            -> Text           -- ^ current input +            -> [(Int, Text)]  -- ^ total characters parsed & what remains +    general !identOnly !i acc t +      -- Starts with an identifier (either just an identifier, or a module qual) +      | Just (n, rest) <- identLike t +      = if T.null rest +          then acc +          else case T.head rest of +                 '`' -> (n + i, rest) : acc +                 ')' -> (n + i, rest) : acc +                 '.' -> general False (n + i + 1) acc (T.tail rest) +                 '\'' -> let (m, rest') = quotes rest +                         in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (T.tail rest') +                 _ -> acc + +      -- An operator +      | Just (n, rest) <- optr t +      , not identOnly +      = (n + i, rest) : acc + +      -- Anything else +      | otherwise +      = acc + +    -- | Parse an identifier off the front of the input +    identLike t +      | T.null t = Nothing +      | isAlpha (T.head t) || '_' == T.head t +      = let !(idt, rest) = T.span (\c -> isAlphaNum c || c == '_') t +            !(octos, rest') = T.span (== '#') rest +      in Just (T.length idt + T.length octos, rest') +      | otherwise = Nothing + +    -- | Parse all but the last quote off the front of the input +    -- PRECONDITION: T.head t == '\'' +    quotes :: Text -> (Int, Text) +    quotes t = let !n = T.length (T.takeWhile (== '\'') t) - 1 +               in (n, T.drop n t) + +    -- | Parse an operator off the front of the input +    optr t = let !(op, rest) = T.span isSymbolChar t +             in if T.null op then Nothing else Just (T.length op, rest) diff --git a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs index 8f5bd217..fa46f536 100644 --- a/haddock-library/src/Documentation/Haddock/Parser/Monad.hs +++ b/haddock-library/src/Documentation/Haddock/Parser/Monad.hs @@ -4,6 +4,18 @@  {-# LANGUAGE TypeFamilies #-}  {-# LANGUAGE BangPatterns #-}  {-# LANGUAGE TypeSynonymInstances #-} +-- | +-- Module      :  Documentation.Haddock.Parser.Monad +-- Copyright   :  (c) Alec Theriault 2018-2019, +-- License     :  BSD-like +-- +-- Maintainer  :  haddock@projects.haskell.org +-- Stability   :  experimental +-- Portability :  portable +-- +-- Defines the Parsec monad over which all parsing is done and also provides +-- more efficient versions of the usual parsec combinator functions (but +-- specialized to 'Text').  module Documentation.Haddock.Parser.Monad where @@ -96,7 +108,6 @@ takeWhile f = do        s' = s{ stateInput = inp', statePos = pos' }    setParserState s' $> t -  -- | Like 'takeWhile', but fails if no characters matched.  --  -- Equivalent to @fmap T.pack . Parsec.many1@, but more efficient. diff --git a/haddock-library/test/Documentation/Haddock/ParserSpec.hs b/haddock-library/test/Documentation/Haddock/ParserSpec.hs index e186a5cf..bc40a0a2 100644 --- a/haddock-library/test/Documentation/Haddock/ParserSpec.hs +++ b/haddock-library/test/Documentation/Haddock/ParserSpec.hs @@ -112,7 +112,7 @@ spec = do          "``" `shouldParseTo` "``"        it "can parse an identifier in infix notation enclosed within backticks" $ do -        "``infix``" `shouldParseTo` "`" <> DocIdentifier "infix" <> "`" +        "``infix``" `shouldParseTo` DocIdentifier "`infix`"        it "can parse identifiers containing a single quote" $ do          "'don't'" `shouldParseTo` DocIdentifier "don't" @@ -138,6 +138,13 @@ spec = do        it "can parse type-namespaced identifiers" $ do          "t'foo'" `shouldParseTo` DocIdentifier "foo" +      it "can parse parenthesized operators and backticked identifiers" $ do +        "'(<|>)'" `shouldParseTo` DocIdentifier "(<|>)" +        "'`elem`'" `shouldParseTo` DocIdentifier "`elem`" + +      it "can properly figure out the end of identifiers" $ do +        "'DbModule'/'DbUnitId'" `shouldParseTo` DocIdentifier "DbModule" <> "/" <> DocIdentifier "DbUnitId" +      context "when parsing operators" $ do        it "can parse an operator enclosed within single quotes" $ do          "'.='" `shouldParseTo` DocIdentifier ".=" | 
