aboutsummaryrefslogtreecommitdiff
path: root/haddock-test/src/Test/Haddock/Xhtml.hs
blob: 74d8c4f77f285f19adaffb058c9775e4183df26e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
module Test.Haddock.Xhtml
    ( Xml
    , parseXml, dumpXml
    , stripLinks, stripLinksWhen, stripAnchorsWhen, stripIdsWhen, stripFooter
    ) where

{-
This module used to actually parse the HTML (using the `xml` parsing library)
which made it was possible to do more proper normalization of things like ids or
names.

However, in the interests of being able to run this from within the GHC
testsuite (where non-bootlib dependencies are a liability), this was swapped
out for some simple string manipulation. Since the test cases aren't very
and since the `xhtml` library already handles the pretty-printing aspect,
this would appear to be a reasonable compromise for now.
-}

import Data.List ( stripPrefix, isPrefixOf )
import Data.Char ( isSpace )

-- | Simple wrapper around the pretty-printed HTML source
newtype Xml = Xml { unXml :: String }

-- | Part of parsing involves dropping the @DOCTYPE@ line
-- and windows newline endings
parseXml :: String -> Maybe Xml
parseXml = Just . Xml . filter (/= '\r') . dropDocTypeLine
  where
  dropDocTypeLine bs
    | "<!DOCTYPE" `isPrefixOf` bs
    = drop 1 (dropWhile (/= '\n') bs)
    | otherwise
    = bs

dumpXml :: Xml -> String
dumpXml = unXml

type Attr = String
type Value = String

-- | Almost all sanitization operations take the form of:
--
--    * match an attribute key
--    * check something about the value
--    * if the check succeeded, replace the value with a dummy value
--
stripAttrValueWhen
  :: Attr             -- ^ attribute key
  -> Value            -- ^ dummy attribute value
  -> (Value -> Bool)  -- ^ determine whether we should modify the attribute
  -> Xml              -- ^ input XML
  -> Xml              -- ^ output XML
stripAttrValueWhen key fallback p (Xml body) = Xml (filterAttrs body)
  where
  keyEq = key ++ "=\""

  filterAttrs "" = ""
  filterAttrs b@(c:cs)
      | Just valRest <- stripPrefix keyEq b
      , Just (val,rest) <- spanToEndOfString valRest
      = if p val
          then keyEq ++ fallback ++ "\"" ++ filterAttrs rest
          else keyEq ++ val      ++ "\"" ++ filterAttrs rest

      | otherwise
      = c : filterAttrs cs

-- | Spans to the next (unescaped) @\"@ character.
--
-- >>> spanToEndOfString "no closing quotation"
-- Nothing
-- >>> spanToEndOfString "foo\" bar \"baz\""
-- Just ("foo", " bar \"baz\"")
-- >>> spanToEndOfString "foo\\\" bar \"baz\""
-- Just ("foo\\\" bar ", "baz\"")
--
spanToEndOfString :: String -> Maybe (String, String)
spanToEndOfString ('"':rest) = Just ("", rest)
spanToEndOfString ('\\':c:rest)
  | Just (str, rest') <- spanToEndOfString rest
  = Just ('\\':c:str, rest')
spanToEndOfString (c:rest)
  | Just (str, rest') <- spanToEndOfString rest
  = Just (c:str, rest')
spanToEndOfString _ = Nothing


-- | Replace hyperlink targets with @\"#\"@ if they match a predicate
stripLinksWhen :: (Value -> Bool) -> Xml -> Xml 
stripLinksWhen = stripAttrValueWhen "href" "#"

-- | Replace all hyperlink targets with @\"#\"@
stripLinks :: Xml -> Xml
stripLinks = stripLinksWhen (const True)

-- | Replace id's with @\"\"@ if they match a predicate
stripIdsWhen :: (Value -> Bool) -> Xml -> Xml 
stripIdsWhen = stripAttrValueWhen "id" ""

-- | Replace names's with @\"\"@ if they match a predicate
stripAnchorsWhen :: (Value -> Bool) -> Xml -> Xml
stripAnchorsWhen = stripAttrValueWhen "name" ""

-- | Remove the @div@ which has @id=\"footer\"@
stripFooter :: Xml -> Xml
stripFooter (Xml body) = Xml (findDiv body)
  where
  findDiv "" = ""
  findDiv b@(c:cs)
      | Just divRest <- stripPrefix "<div id=\"footer\"" b
      , Just rest <- dropToDiv divRest
      = rest 

      | otherwise
      = c : findDiv cs

  dropToDiv "" = Nothing
  dropToDiv b@(_:cs)
      | Just valRest <- stripPrefix "</div" b
      , valRest' <- dropWhile isSpace valRest
      , Just valRest'' <- stripPrefix ">" valRest'
      = Just valRest''

      | otherwise
      = dropToDiv cs