Allow escaping in URLs and pictures.

Some tests were moved under parseString as they weren't about paragraph level markup. Conflicts: src/Haddock/Parser.hs test/Haddock/ParserSpec.hs
author: Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk> 2013-09-16 03:01:29 +0100
committer: Austin Seipp <austin@well-typed.com> 2014-01-12 14:48:36 -0600
commit: deb106d8250b7582e01f78e88c88ca7465fc0bc1 (patch)
tree: fef76ea6a8c491359becf8ee5d620b34558ffb85
parent: a03c93524ba2ca4143c10770a2fa0dd134b57a83 (diff)
3 files changed, 190 insertions, 143 deletions
diff --git a/src/Haddock/Parser.hs b/src/Haddock/Parser.hs
index b8aa9cb4..6370eecb 100644
--- a/src/Haddock/Parser.hs
+++ b/src/Haddock/Parser.hs
@@ -167,7 +167,7 @@ moduleName = DocModule <$> (char '"' *> modid <* char '"')
 -- Right (DocPic (Picture "hello.png" (Just "world")))
 picture :: Parser (Doc a)
 picture = DocPic . makeLabeled Picture . decodeUtf8
-          <$> ("<<" *> takeWhile1 (`notElem` ">\n") <* ">>")
+          <$> disallowNewline ("<<" *> takeUntil ">>")
 
 -- | Paragraph parser, called by 'parseParas'.
 paragraph :: DynFlags -> Parser (Doc RdrName)
@@ -271,7 +271,8 @@ moreContent :: Monoid a => Parser a -> DynFlags
             -> Parser ([String], Either (Doc RdrName) a)
 moreContent item d = first . (:) <$> nonEmptyLine <*> more item d
 
--- | Collects and parses the result of 'dropFrontOfPara'
+-- | Runs the 'parseParas' parser on an indented paragraph.
+-- The indentation is 4 spaces.
 indentedParagraphs :: DynFlags -> Parser (Doc RdrName)
 indentedParagraphs d = parseParas d . concat <$> dropFrontOfPara "    "
 
@@ -367,7 +368,9 @@ codeblock d =
           | otherwise = Just $ c == '\n'
 
 hyperlink :: Parser (Doc a)
-hyperlink = DocHyperlink . makeLabeled Hyperlink . decodeUtf8 <$> ("<" *> takeWhile1 (`notElem` ">\n") <* ">")
+hyperlink = DocHyperlink . makeLabeled Hyperlink . decodeUtf8
+              <$> disallowNewline ("<" *> takeUntil ">")
+            <|> autoUrl
 
 autoUrl :: Parser (Doc a)
 autoUrl = mkLink <$> url
@@ -425,6 +428,14 @@ takeHorizontalSpace :: Parser BS.ByteString
 takeHorizontalSpace = takeWhile (`elem` " \t\f\v\r")
 
 makeLabeled :: (String -> Maybe String -> a) -> String -> a
-makeLabeled f input = case break isSpace $ strip input of
+makeLabeled f input = case break isSpace $ removeEscapes $ strip input of
   (uri, "")    -> f uri Nothing
   (uri, label) -> f uri (Just $ dropWhile isSpace label)
+  where
+    -- As we don't parse these any further, we don't do any processing to the
+    -- string so we at least remove escape character here. Perhaps we should
+    -- actually be parsing the label at the very least?
+    removeEscapes "" = ""
+    removeEscapes ('\\':'\\':xs) = '\\' : removeEscapes xs
+    removeEscapes ('\\':xs) = removeEscapes xs
+    removeEscapes (x:xs) = x : removeEscapes xs
diff --git a/src/Haddock/Parser/Util.hs b/src/Haddock/Parser/Util.hs
index ea682601..92fa7448 100644
--- a/src/Haddock/Parser/Util.hs
+++ b/src/Haddock/Parser/Util.hs
@@ -7,7 +7,7 @@ import           Data.ByteString.Char8 (ByteString)
 import qualified Data.ByteString.Char8 as BS
 
 takeUntil :: ByteString -> Parser ByteString
-takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p)
+takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome
   where
     end = BS.unpack end_
 
@@ -20,3 +20,7 @@ takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p)
 
     dropEnd = BS.reverse . BS.drop (length end) . BS.reverse
     requireEnd = mfilter (BS.isSuffixOf end_)
+
+    gotSome xs
+      | BS.null xs = fail "didn't get any content"
+      | otherwise = return xs
diff --git a/test/Haddock/ParserSpec.hs b/test/Haddock/ParserSpec.hs
index 4679661f..9bca745f 100644
--- a/test/Haddock/ParserSpec.hs
+++ b/test/Haddock/ParserSpec.hs
@@ -68,6 +68,16 @@ spec = before initStaticOpts $ do
       it "allows to backslash-escape characters" $ do
         property $ \x -> ['\\', x] `shouldParseTo` DocString [x]
 
+      context "when parsing strings contaning numeric character references" $ do
+        it "will implicitly convert digits to characters" $ do
+          "&#65;&#65;&#65;&#65;" `shouldParseTo` "AAAA"
+
+          "&#28796;&#30524;&#12398;&#12471;&#12515;&#12490;"
+            `shouldParseTo` "灼眼のシャナ"
+
+        it "will implicitly convert hex encoded characters" $ do
+          "&#x65;&#x65;&#x65;&#x65;" `shouldParseTo` "eeee"
+
     context "when parsing identifiers" $ do
       it "parses identifiers enclosed within single ticks" $ do
         "'foo'" `shouldParseTo` DocIdentifier "foo"
@@ -75,8 +85,15 @@ spec = before initStaticOpts $ do
       it "parses identifiers enclosed within backticks" $ do
         "`foo`" `shouldParseTo` DocIdentifier "foo"
 
-      it "parses a word with one of the delimiters in it as ordinary string" $ do
-        "don't use apostrophe's in the wrong place's" `shouldParseTo` "don't use apostrophe's in the wrong place's"
+      it "parses a word with an one of the delimiters in it as DocString" $ do
+          "don't" `shouldParseTo` "don't"
+
+      it "doesn't pass pairs of delimiters with spaces between them" $ do
+        "hel'lo w'orld" `shouldParseTo` "hel'lo w'orld"
+
+      it "don't use apostrophe's in the wrong place's" $ do
+        " don't use apostrophe's in the wrong place's" `shouldParseTo`
+          "don't use apostrophe's in the wrong place's"
 
     context "when parsing URLs" $ do
       let hyperlink :: String -> Maybe String -> Doc RdrName
@@ -91,13 +108,25 @@ spec = before initStaticOpts $ do
       it "does not accept newlines in label" $ do
         "<foo bar\nbaz>" `shouldParseTo` "<foo bar\nbaz>"
 
-      it "does not allow to escap >" $ do
-        "<http://examp\\>le.com" `shouldParseTo` hyperlink "http://examp\\" Nothing <> "le.com"
+      -- new behaviour test, this will be now consistent with other markup
+      it "allows us to escape > inside the URL" $ do
+        "<http://examp\\>le.com>" `shouldParseTo`
+          hyperlink "http://examp>le.com" Nothing
+
+        "<http://exa\\>mp\\>le.com>" `shouldParseTo`
+          hyperlink "http://exa>mp>le.com" Nothing
+
+        -- Likewise in label
+        "<http://example.com f\\>oo>" `shouldParseTo`
+          hyperlink "http://example.com" "f>oo"
 
       it "parses inline URLs" $ do
         "foo <http://example.com/> bar" `shouldParseTo`
           "foo " <> hyperlink "http://example.com/" Nothing <> " bar"
 
+      it "doesn't allow for multi-line link tags" $ do
+        "<ba\nz aar>" `shouldParseTo` "<ba\nz aar>"
+
       context "when autolinking URLs" $ do
         it "autolinks HTTP URLs" $ do
           "http://example.com/" `shouldParseTo` hyperlink "http://example.com/" Nothing
@@ -129,19 +158,19 @@ spec = before initStaticOpts $ do
           picture uri = DocPic . Picture uri
 
       it "parses a simple picture" $ do
-        "<<foo>>" `shouldParseTo` picture "foo" Nothing
-
-      it "accepts an optional title" $ do
-        "<<foo bar baz>>" `shouldParseTo` picture "foo" (Just "bar baz")
+        "<<baz>>" `shouldParseTo` picture "baz" Nothing
 
-      it "does not accept newlines in title" $ do
-        "<<foo bar\nbaz>>" `shouldParseTo` "<<foo bar\nbaz>>"
+      it "parses a picture with a title" $ do
+        "<<b a z>>" `shouldParseTo` picture "b" (Just "a z")
 
       it "parses a picture with unicode" $ do
-        "<<灼眼 のシャナ>>" `shouldParseTo` picture "灼眼" (Just "のシャナ")
+        "<<灼眼のシャナ>>" `shouldParseTo` picture "灼眼のシャナ" Nothing
 
-      it "doesn't allow for escaping of the closing tags" $ do -- bug?
-        "<<ba\\>>z>>" `shouldParseTo` picture "ba\\" Nothing <> "z>>"
+      it "allows for escaping of the closing tags" $ do
+        "<<ba\\>>z>>" `shouldParseTo` picture "ba>>z" Nothing
+
+      it "doesn't allow for multi-line picture tags" $ do
+        "<<ba\nz aar>>" `shouldParseTo` "<<ba\nz aar>>"
 
     context "when parsing anchors" $ do
       it "parses a single word anchor" $ do
@@ -191,6 +220,25 @@ spec = before initStaticOpts $ do
       it "allows to escape the emphasis delimiter inside of emphasis" $ do
         "/empha\\/sis/" `shouldParseTo` DocEmphasis "empha/sis"
 
+    context "when parsing monospaced text" $ do
+      it "parses simple monospaced text" $ do
+        "@foo@" `shouldParseTo` DocMonospaced "foo"
+
+      it "parses inline monospaced text" $ do
+        "foo @bar@ baz" `shouldParseTo` "foo " <> DocMonospaced "bar" <> " baz"
+
+      it "allows to escape @" $ do
+        "@foo \\@ bar@" `shouldParseTo` DocMonospaced "foo @ bar"
+
+      it "accepts unicode" $ do
+        "@foo 灼眼のシャナ bar@" `shouldParseTo` DocMonospaced "foo 灼眼のシャナ bar"
+
+      it "accepts other markup in monospaced text" $ do
+        "@/foo/@" `shouldParseTo` DocMonospaced (DocEmphasis "foo")
+
+      it "requires the closing @" $ do
+        "@foo /bar/ baz" `shouldParseTo` "@foo " <> DocEmphasis "bar" <> " baz"
+
     context "when parsing bold strings" $ do
       it "allows for a bold string on its own" $ do
         "__bold string__" `shouldParseTo`
@@ -224,50 +272,42 @@ spec = before initStaticOpts $ do
         "__bo\\__ld__" `shouldParseTo`
           DocBold "bo__ld"
 
-    context "when parsing monospaced text" $ do
-      it "parses simple monospaced text" $ do
-        "@foo@" `shouldParseTo` DocMonospaced "foo"
+      it "doesn't allow for empty bold" $ do
+        "____" `shouldParseTo` "____"
 
-      it "parses inline monospaced text" $ do
-        "foo @bar@ baz" `shouldParseTo` "foo " <> DocMonospaced "bar" <> " baz"
+    context "when parsing module strings" $ do
+      it "should parse a module on its own" $ do
+        "\"Module\"" `shouldParseTo`
+          DocModule "Module"
 
-      it "allows to escape @" $ do
-        "@foo \\@ bar@" `shouldParseTo` DocMonospaced "foo @ bar"
+      it "should parse a module inline" $ do
+        "This is a \"Module\"." `shouldParseTo`
+          "This is a " <> DocModule "Module" <> "."
 
-      it "accepts unicode" $ do
-        "@foo 灼眼のシャナ bar@" `shouldParseTo` DocMonospaced "foo 灼眼のシャナ bar"
+      it "can accept a simple module name" $ do
+        "\"Hello\"" `shouldParseTo` DocModule "Hello"
 
-      it "accepts other markup in monospaced text" $ do
-        "@/foo/@" `shouldParseTo` DocMonospaced (DocEmphasis "foo")
+      it "can accept a module name with dots" $ do
+        "\"Hello.World\"" `shouldParseTo` DocModule "Hello.World"
 
-      it "requires the closing @" $ do
-        "@foo /bar/ baz" `shouldParseTo` "@foo " <> DocEmphasis "bar" <> " baz"
+      it "can accept a module name with unicode" $ do
+        "\"Hello.Worldλ\"" `shouldParseTo` DocModule "Hello.Worldλ"
 
-    context "when parsing module names" $ do
-      it "accepts a simple module name" $ do
-        "\"Foo\"" `shouldParseTo` DocModule "Foo"
+      it "parses a module name with a trailing dot as regular quoted string" $ do
+        "\"Hello.\"" `shouldParseTo` "\"Hello.\""
 
-      it "accepts a module name with dots" $ do
-        "\"Foo.Bar.Baz\"" `shouldParseTo` DocModule "Foo.Bar.Baz"
+      it "parses a module name with a space as regular quoted string" $ do
+        "\"Hello World\"" `shouldParseTo` "\"Hello World\""
+
+      it "parses a module name with invalid characters as regular quoted string" $ do
+        "\"Hello&[{}(=*)+]!\"" `shouldParseTo` "\"Hello&[{}(=*)+]!\""
 
       it "accepts a module name with unicode" $ do
         "\"Foo.Barλ\"" `shouldParseTo` DocModule "Foo.Barλ"
 
-      it "parses a module inline" $ do
-        "This is a \"Module\"." `shouldParseTo` ("This is a " <> (DocModule "Module" <> "."))
-
-      it "rejects empty module name" $ do
+      it "treats empty module name as regular double quotes" $ do
         "\"\"" `shouldParseTo` "\"\""
 
-      it "rejects a module name with a trailing dot" $ do
-        "\"Foo.\"" `shouldParseTo` "\"Foo.\""
-
-      it "rejects a module name with a space" $ do
-        "\"Foo Bar\"" `shouldParseTo` "\"Foo Bar\""
-
-      it "rejects a module name with invalid characters" $ do
-        "\"Foo&[{}(=*)+]!\"" `shouldParseTo` "\"Foo&[{}(=*)+]!\""
-
   describe "parseParas" $ do
     let infix 1 `shouldParseTo`
         shouldParseTo :: String -> Doc RdrName -> Expectation
@@ -370,6 +410,14 @@ spec = before initStaticOpts $ do
           , "@"
           ] `shouldParseTo` DocCodeBlock "foo\n@\nbar\n"
 
+      it "accepts unicode" $ do
+        "@foo 灼眼のシャナ bar@" `shouldParseTo` DocCodeBlock "foo 灼眼のシャナ bar"
+
+      it "requires the closing @" $ do
+        "@foo /bar/ baz"
+          `shouldParseTo` DocParagraph ("@foo " <> DocEmphasis "bar" <> " baz")
+
+
     context "when parsing examples" $ do
       it "parses a simple example" $ do
         ">>> foo" `shouldParseTo` DocExamples [Example "foo" []]
@@ -392,6 +440,61 @@ spec = before initStaticOpts $ do
           , Example "fib 10" ["55"]
           ]
 
+      it ("requires an example to be separated"
+          ++ " from a previous paragraph by an empty line") $ do
+        "foobar\n\n>>> fib 10\n55" `shouldParseTo`
+          DocParagraph "foobar"
+                <> DocExamples [Example "fib 10" ["55"]]
+
+      it "parses bird-tracks inside of paragraphs as plain strings" $ do
+        let xs = "foo\n>>> bar"
+        xs `shouldParseTo` DocParagraph (DocString xs)
+
+      it "skips empty lines in front of an example" $ do
+        "\n   \n\n>>> foo" `shouldParseTo` DocExamples [Example "foo" []]
+
+      it "terminates example on empty line" $ do
+        unlines [
+            ">>> foo"
+          , "bar"
+          , "    "
+          , "baz"
+          ]
+        `shouldParseTo`
+          DocExamples [Example "foo" ["bar"]] <> DocParagraph "baz"
+
+      it "parses a <BLANKLINE> result as an empty result" $ do
+        unlines [
+            ">>> foo"
+          , "bar"
+          , "<BLANKLINE>"
+          , "baz"
+          ]
+        `shouldParseTo` DocExamples [Example "foo" ["bar", "", "baz"]]
+
+      it "accepts unicode in examples" $ do
+        ">>> 灼眼\nシャナ" `shouldParseTo` DocExamples [Example "灼眼" ["シャナ"]]
+
+      context "when prompt is prefixed by whitespace" $ do
+        it "strips the exact same amount of whitespace from result lines" $ do
+          unlines [
+              "   >>> foo"
+            , "   bar"
+            , "   baz"
+            ] `shouldParseTo` DocExamples [Example "foo" ["bar", "baz"]]
+
+        it "preserves additional whitespace" $ do
+          unlines [
+              "   >>> foo"
+            , "    bar"
+            ] `shouldParseTo` DocExamples [Example "foo" [" bar"]]
+
+        it "keeps original if stripping is not possible" $ do
+          unlines [
+              "   >>> foo"
+            , " bar"
+            ] `shouldParseTo` DocExamples [Example "foo" [" bar"]]
+
 
     context "when parsing paragraphs nested in lists" $ do
       it "can nest the same type of list" $ do
@@ -450,77 +553,6 @@ spec = before initStaticOpts $ do
                      ]
           <> DocOrderedList [ DocParagraph "baz" ]
 
-
-    context "when parsing consecutive paragraphs" $ do
-      it "will not capture irrelevant consecutive lists" $ do
-        "   * bullet\n\n   - different bullet\n\n   (1) ordered\n \n   "
-          ++ "2. different bullet\n   \n   [cat] kitten\n   \n   [pineapple] fruit"
-          `shouldParseTo`
-          DocUnorderedList [ DocParagraph "bullet"
-                           , DocParagraph "different bullet"]
-          <> DocOrderedList [ DocParagraph "ordered"
-                            , DocParagraph "different bullet"
-                            ]
-          <> DocDefList [ ("cat", "kitten")
-                        , ("pineapple", "fruit")
-                        ]
-
-    context "when parsing an example" $ do
-      it ("requires an example to be separated"
-          ++ " from a previous paragraph by an empty line") $ do
-        "foobar\n\n>>> fib 10\n55" `shouldParseTo`
-          DocParagraph "foobar"
-                <> DocExamples [Example "fib 10" ["55"]]
-
-      it "parses bird-tracks inside of paragraphs as plain strings" $ do
-        let xs = "foo\n>>> bar"
-        xs `shouldParseTo` DocParagraph (DocString xs)
-
-      it "skips empty lines in front of an example" $ do
-        "\n   \n\n>>> foo" `shouldParseTo` DocExamples [Example "foo" []]
-
-      it "terminates example on empty line" $ do
-        unlines [
-            ">>> foo"
-          , "bar"
-          , "    "
-          , "baz"
-          ]
-        `shouldParseTo`
-          DocExamples [Example "foo" ["bar"]] <> DocParagraph "baz"
-
-      it "parses a <BLANKLINE> result as an empty result" $ do
-        unlines [
-            ">>> foo"
-          , "bar"
-          , "<BLANKLINE>"
-          , "baz"
-          ]
-        `shouldParseTo` DocExamples [Example "foo" ["bar", "", "baz"]]
-
-      it "accepts unicode in examples" $ do
-        ">>> 灼眼\nシャナ" `shouldParseTo` DocExamples [Example "灼眼" ["シャナ"]]
-
-      context "when prompt is prefixed by whitespace" $ do
-        it "strips the exact same amount of whitespace from result lines" $ do
-          unlines [
-              "   >>> foo"
-            , "   bar"
-            , "   baz"
-            ] `shouldParseTo` DocExamples [Example "foo" ["bar", "baz"]]
-
-        it "preserves additional whitespace" $ do
-          unlines [
-              "   >>> foo"
-            , "    bar"
-            ] `shouldParseTo` DocExamples [Example "foo" [" bar"]]
-
-        it "keeps original if stripping is not possible" $ do
-          unlines [
-              "   >>> foo"
-            , " bar"
-            ] `shouldParseTo` DocExamples [Example "foo" [" bar"]]
-
     context "when parsing properties" $ do
       it "can parse a single property" $ do
         "prop> 23 == 23" `shouldParseTo` DocProperty "23 == 23"
@@ -701,29 +733,29 @@ spec = before initStaticOpts $ do
         `shouldParseTo` DocParagraph "foo" <> DocDefList [("foo", "bar")] <> DocParagraph "baz"
 
     context "when parsing consecutive paragraphs" $ do
-      it "accepts consecutive lists" $ do
-        unlines [
-            "   * foo"
-          , ""
-          , "   - bar"
-          , ""
-          , "   (1) ordered foo"
-          , " "
-          , "   2. ordered bar"
-          , "   "
-          , "   [cat] kitten"
-          , "   "
-          , "   [pineapple] fruit"
-          ] `shouldParseTo` DocUnorderedList [
-            DocParagraph "foo"
-          , DocParagraph "bar"
-          ] <> DocOrderedList [
-            DocParagraph "ordered foo"
-          , DocParagraph "ordered bar"
-          ] <> DocDefList [
-            ("cat", "kitten")
-          , ("pineapple", "fruit")
-          ]
+      it "will not capture irrelevant consecutive lists" $ do
+        unlines [ "   * bullet"
+                , ""
+                , ""
+                , "   - different bullet"
+                , ""
+                , ""
+                , "   (1) ordered"
+                , " "
+                , "   2. different bullet"
+                , "   "
+                , "   [cat] kitten"
+                , "   "
+                , "   [pineapple] fruit"
+                ] `shouldParseTo`
+          DocUnorderedList [ DocParagraph "bullet"
+                           , DocParagraph "different bullet"]
+          <> DocOrderedList [ DocParagraph "ordered"
+                            , DocParagraph "different bullet"
+                            ]
+          <> DocDefList [ ("cat", "kitten")
+                        , ("pineapple", "fruit")
+                        ]
 
     context "when parsing function documentation headers" $ do
       it "can parse a simple header" $ do
author	Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk>	2013-09-16 03:01:29 +0100
committer	Austin Seipp <austin@well-typed.com>	2014-01-12 14:48:36 -0600
commit	deb106d8250b7582e01f78e88c88ca7465fc0bc1 (patch)
tree	fef76ea6a8c491359becf8ee5d620b34558ffb85
parent	a03c93524ba2ca4143c10770a2fa0dd134b57a83 (diff)