diff options
author | Alec Theriault <alec.theriault@gmail.com> | 2018-06-05 10:47:16 -0700 |
---|---|---|
committer | Simon Jakobi <simon.jakobi@gmail.com> | 2018-06-13 23:39:30 +0200 |
commit | 254de3010dddb06bc1dacf4c029a9e8f30ff1600 (patch) | |
tree | 57c99c3873cc6cfc415ff6e44960e327492a4da0 | |
parent | ca619464b74c3f55a34406f0a555f7d82c27caae (diff) |
Improve hyperlinker's 'spanToNewline' (#846)
'spanToNewline' is used to help break apart the source into lines which
can then be partioned into CPP and non-CPP chunks. It is important that
'spanToNewline' not break apart tokens, so it needs to properly handle
things like
* block comments, possibly nested
* string literals, possibly multi-line
* CPP macros, possibly multi-line
String literals in particular were not being properly handled. The fix
is to to fall back in 'Text.Read.lex' to help lex things that are not
comments.
Fixes #837.
-rw-r--r-- | haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs | 32 | ||||
-rw-r--r-- | hypsrc-test/ref/src/CPP.html | 216 | ||||
-rw-r--r-- | hypsrc-test/src/CPP.hs | 26 |
3 files changed, 267 insertions, 7 deletions
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs index 92443bff..e7ecac73 100644 --- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs +++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs @@ -3,6 +3,8 @@ module Haddock.Backends.Hyperlinker.Parser (parse) where import Data.Either ( isRight, isLeft ) import Data.List ( foldl', isPrefixOf, isSuffixOf ) import Data.Maybe ( maybeToList ) +import Data.Char ( isSpace ) +import qualified Text.Read as R import GHC ( DynFlags, addSourceToTokens ) import SrcLoc @@ -109,12 +111,9 @@ isCPPline :: String -> Bool isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5 --- | Split a "line" off the front of a string, supporting newline escapes. --- --- By "line", we understand: the shortest substring ending in a '\n' that is not --- --- 1. immediately preceded by a '\\' --- 2. not inside some (possibly nested) block comment +-- | Split a "line" off the front of a string, hopefully without cutting tokens +-- in half. I say "hopefully" because knowing what a token is requires lexing, +-- yet lexing depends on this function. -- -- All characters in the input are present in the output: -- @@ -122,17 +121,36 @@ isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5 spanToNewline :: Int -- ^ open '{-' -> String -- ^ input -> (String, String) -spanToNewline _ [] = ([], []) + +-- Base case and space characters +spanToNewline _ "" = ("", "") +spanToNewline n ('\n':str) | n <= 0 = ("\n", str) spanToNewline n ('\n':str) | n <= 0 = ("\n", str) spanToNewline n ('\\':'\n':str) = let (str', rest) = spanToNewline n str in ('\\':'\n':str', rest) + +-- Block comments spanToNewline n ('{':'-':str) = let (str', rest) = spanToNewline (n+1) str in ('{':'-':str', rest) spanToNewline n ('-':'}':str) = let (str', rest) = spanToNewline (n-1) str in ('-':'}':str', rest) + +-- When not in a block comment, try to lex a Haskell token +spanToNewline 0 str@(c:_) | ((lexed, str') : _) <- R.lex str, not (isSpace c) = + if all (== '-') lexed && length lexed >= 2 + -- A Haskell line comment + then case span (/= '\n') str' of + (str'', '\n':rest) -> (lexed ++ str'' ++ "\n", rest) + (_, _) -> (str, "") + + -- An actual Haskell token + else let (str'', rest) = spanToNewline 0 str' + in (lexed ++ str'', rest) + +-- In all other cases, advance one character at a time spanToNewline n (c:str) = let (str', rest) = spanToNewline n str in (c:str', rest) diff --git a/hypsrc-test/ref/src/CPP.html b/hypsrc-test/ref/src/CPP.html new file mode 100644 index 00000000..fb85bd2f --- /dev/null +++ b/hypsrc-test/ref/src/CPP.html @@ -0,0 +1,216 @@ +<html xmlns="http://www.w3.org/1999/xhtml" +><head + ><link rel="stylesheet" type="text/css" href="style.css" + /><script type="text/javascript" src="highlight.js" + ></script + ></head + ><body + ><pre + ><span class="hs-pragma" + >{-# LANGUAGE CPP #-}</span + ><span + > +</span + ><a name="line-2" + ></a + ><span class="hs-keyword" + >module</span + ><span + > </span + ><span class="hs-identifier" + >CPP</span + ><span + > </span + ><span class="hs-keyword" + >where</span + ><span + > +</span + ><a name="line-3" + ></a + ><span + > +</span + ><a name="line-4" + ></a + ><span class="hs-cpp" + >#define SOMETHING1 +</span + ><span + > +</span + ><a name="line-6" + ></a + ><span class="hs-identifier" + >foo</span + ><span + > </span + ><span class="hs-glyph" + >::</span + ><span + > </span + ><span class="hs-identifier hs-type" + >String</span + ><span + > +</span + ><a name="line-7" + ></a + ><a name="foo" + ><a href="CPP.html#foo" + ><span class="hs-identifier" + >foo</span + ></a + ></a + ><span + > </span + ><span class="hs-glyph" + >=</span + ><span + > </span + ><span class="hs-comment" + >{- " single quotes are fine in block comments + {- nested block comments are fine -} + -}</span + ><span + > </span + ><span class="hs-string" + >"foo"</span + ><span + > +</span + ><a name="line-10" + ></a + ><span + > +</span + ><a name="line-11" + ></a + ><span class="hs-cpp" + >#define SOMETHING2 +</span + ><span + > +</span + ><a name="line-13" + ></a + ><span class="hs-identifier" + >bar</span + ><span + > </span + ><span class="hs-glyph" + >::</span + ><span + > </span + ><span class="hs-identifier hs-type" + >String</span + ><span + > +</span + ><a name="line-14" + ></a + ><a name="bar" + ><a href="CPP.html#bar" + ><span class="hs-identifier" + >bar</span + ></a + ></a + ><span + > </span + ><span class="hs-glyph" + >=</span + ><span + > </span + ><span class="hs-string" + >"block comment in a string is not a comment {- "</span + ><span + > +</span + ><a name="line-15" + ></a + ><span + > +</span + ><a name="line-16" + ></a + ><span class="hs-cpp" + >#define SOMETHING3 +</span + ><span + > +</span + ><a name="line-18" + ></a + ><span class="hs-comment" + >-- " single quotes are fine in line comments</span + ><span + > +</span + ><a name="line-19" + ></a + ><span class="hs-comment" + >-- {- unclosed block comments are fine in line comments</span + ><span + > +</span + ><a name="line-20" + ></a + ><span + > +</span + ><a name="line-21" + ></a + ><span class="hs-comment" + >-- Multiline CPP is also fine</span + ><span + > +</span + ><a name="line-22" + ></a + ><span class="hs-cpp" + >#define FOO\ + 1 +</span + ><span + > +</span + ><a name="line-25" + ></a + ><span class="hs-identifier" + >baz</span + ><span + > </span + ><span class="hs-glyph" + >::</span + ><span + > </span + ><span class="hs-identifier hs-type" + >String</span + ><span + > +</span + ><a name="line-26" + ></a + ><a name="baz" + ><a href="CPP.html#baz" + ><span class="hs-identifier" + >baz</span + ></a + ></a + ><span + > </span + ><span class="hs-glyph" + >=</span + ><span + > </span + ><span class="hs-string" + >"line comment in a string is not a comment --"</span + ><span + > +</span + ><a name="line-27" + ></a + ></pre + ></body + ></html +>
\ No newline at end of file diff --git a/hypsrc-test/src/CPP.hs b/hypsrc-test/src/CPP.hs new file mode 100644 index 00000000..f00ce031 --- /dev/null +++ b/hypsrc-test/src/CPP.hs @@ -0,0 +1,26 @@ +{-# LANGUAGE CPP #-} +module CPP where + +#define SOMETHING1 + +foo :: String +foo = {- " single quotes are fine in block comments + {- nested block comments are fine -} + -} "foo" + +#define SOMETHING2 + +bar :: String +bar = "block comment in a string is not a comment {- " + +#define SOMETHING3 + +-- " single quotes are fine in line comments +-- {- unclosed block comments are fine in line comments + +-- Multiline CPP is also fine +#define FOO\ + 1 + +baz :: String +baz = "line comment in a string is not a comment --" |