diff options
Diffstat (limited to 'haddock-library/src/Documentation')
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Doc.hs | 60 | ||||
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Parser.hs | 474 | ||||
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Parser/Util.hs | 61 | ||||
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Types.hs | 70 | ||||
-rw-r--r-- | haddock-library/src/Documentation/Haddock/Utf8.hs | 74 |
5 files changed, 739 insertions, 0 deletions
diff --git a/haddock-library/src/Documentation/Haddock/Doc.hs b/haddock-library/src/Documentation/Haddock/Doc.hs new file mode 100644 index 00000000..4d6c10a4 --- /dev/null +++ b/haddock-library/src/Documentation/Haddock/Doc.hs @@ -0,0 +1,60 @@ +{-# OPTIONS_GHC -fno-warn-orphans #-} +module Documentation.Haddock.Doc (docParagraph) where + +import Data.Monoid +import Documentation.Haddock.Types +import Data.Char (isSpace) + +-- We put it here so that we can avoid a circular import +-- anything relevant imports this module anyway +instance Monoid (DocH mod id) where + mempty = DocEmpty + mappend = docAppend + +docAppend :: DocH mod id -> DocH mod id -> DocH mod id +docAppend (DocDefList ds1) (DocDefList ds2) = DocDefList (ds1++ds2) +docAppend (DocDefList ds1) (DocAppend (DocDefList ds2) d) = DocAppend (DocDefList (ds1++ds2)) d +docAppend (DocOrderedList ds1) (DocOrderedList ds2) = DocOrderedList (ds1 ++ ds2) +docAppend (DocUnorderedList ds1) (DocUnorderedList ds2) = DocUnorderedList (ds1 ++ ds2) +docAppend DocEmpty d = d +docAppend d DocEmpty = d +docAppend (DocString s1) (DocString s2) = DocString (s1 ++ s2) +docAppend (DocAppend d (DocString s1)) (DocString s2) = DocAppend d (DocString (s1 ++ s2)) +docAppend (DocString s1) (DocAppend (DocString s2) d) = DocAppend (DocString (s1 ++ s2)) d +docAppend d1 d2 = DocAppend d1 d2 + +-- again to make parsing easier - we spot a paragraph whose only item +-- is a DocMonospaced and make it into a DocCodeBlock +docParagraph :: DocH mod id -> DocH mod id +docParagraph (DocMonospaced p) + = DocCodeBlock (docCodeBlock p) +docParagraph (DocAppend (DocString s1) (DocMonospaced p)) + | all isSpace s1 + = DocCodeBlock (docCodeBlock p) +docParagraph (DocAppend (DocString s1) + (DocAppend (DocMonospaced p) (DocString s2))) + | all isSpace s1 && all isSpace s2 + = DocCodeBlock (docCodeBlock p) +docParagraph (DocAppend (DocMonospaced p) (DocString s2)) + | all isSpace s2 + = DocCodeBlock (docCodeBlock p) +docParagraph p + = DocParagraph p + + +-- Drop trailing whitespace from @..@ code blocks. Otherwise this: +-- +-- -- @ +-- -- foo +-- -- @ +-- +-- turns into (DocCodeBlock "\nfoo\n ") which when rendered in HTML +-- gives an extra vertical space after the code block. The single space +-- on the final line seems to trigger the extra vertical space. +-- +docCodeBlock :: DocH mod id -> DocH mod id +docCodeBlock (DocString s) + = DocString (reverse $ dropWhile (`elem` " \t") $ reverse s) +docCodeBlock (DocAppend l r) + = DocAppend l (docCodeBlock r) +docCodeBlock d = d diff --git a/haddock-library/src/Documentation/Haddock/Parser.hs b/haddock-library/src/Documentation/Haddock/Parser.hs new file mode 100644 index 00000000..3d146d33 --- /dev/null +++ b/haddock-library/src/Documentation/Haddock/Parser.hs @@ -0,0 +1,474 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE UndecidableInstances #-} +{-# LANGUAGE IncoherentInstances #-} +{-# LANGUAGE LambdaCase #-} +-- | +-- Module : Documentation.Haddock.Parser +-- Copyright : (c) Mateusz Kowalczyk 2013-2014, +-- Simon Hengel 2013 +-- License : BSD-like +-- +-- Maintainer : haddock@projects.haskell.org +-- Stability : experimental +-- Portability : portable + +module Documentation.Haddock.Parser ( parseString, parseParas + , overIdentifier, toRegular, Identifier + ) where + +import Control.Applicative +import Control.Arrow (first) +import Control.Monad (void, mfilter) +import Data.Attoparsec.ByteString.Char8 hiding (parse, take, endOfLine) +import qualified Data.ByteString.Char8 as BS +import Data.Char (chr, isAsciiUpper) +import Data.List (stripPrefix, intercalate, unfoldr) +import Data.Maybe (fromMaybe) +import Data.Monoid +import Documentation.Haddock.Doc +import Documentation.Haddock.Parser.Util +import Documentation.Haddock.Types +import Documentation.Haddock.Utf8 +import Prelude hiding (takeWhile) + +-- | Identifier string surrounded with opening and closing quotes/backticks. +type Identifier = (Char, String, Char) + +-- | Drops the quotes/backticks around all identifiers, as if they +-- were valid but still 'String's. +toRegular :: DocH mod Identifier -> DocH mod String +toRegular = fmap (\(_, x, _) -> x) + +-- | Maps over 'DocIdentifier's over 'String' with potentially failing +-- conversion using user-supplied function. If the conversion fails, +-- the identifier is deemed to not be valid and is treated as a +-- regular string. +overIdentifier :: (String -> Maybe a) + -> DocH mod Identifier + -> DocH mod a +overIdentifier f d = g d + where + g (DocIdentifier (o, x, e)) = case f x of + Nothing -> DocString $ o : x ++ [e] + Just x' -> DocIdentifier x' + g DocEmpty = DocEmpty + g (DocAppend x x') = DocAppend (g x) (g x') + g (DocString x) = DocString x + g (DocParagraph x) = DocParagraph $ g x + g (DocIdentifierUnchecked x) = DocIdentifierUnchecked x + g (DocModule x) = DocModule x + g (DocWarning x) = DocWarning $ g x + g (DocEmphasis x) = DocEmphasis $ g x + g (DocMonospaced x) = DocMonospaced $ g x + g (DocBold x) = DocBold $ g x + g (DocUnorderedList x) = DocUnorderedList $ fmap g x + g (DocOrderedList x) = DocOrderedList $ fmap g x + g (DocDefList x) = DocDefList $ fmap (\(y, z) -> (g y, g z)) x + g (DocCodeBlock x) = DocCodeBlock $ g x + g (DocHyperlink x) = DocHyperlink x + g (DocPic x) = DocPic x + g (DocAName x) = DocAName x + g (DocProperty x) = DocProperty x + g (DocExamples x) = DocExamples x + g (DocHeader (Header l x)) = DocHeader . Header l $ g x + +parse :: Parser a -> BS.ByteString -> a +parse p = either err id . parseOnly (p <* endOfInput) + where + err = error . ("Haddock.Parser.parse: " ++) + +-- | Main entry point to the parser. Appends the newline character +-- to the input string. +parseParas :: String -- ^ String to parse + -> DocH mod Identifier +parseParas = parse (p <* skipSpace) . encodeUtf8 . (++ "\n") + where + p :: Parser (DocH mod Identifier) + p = mconcat <$> paragraph `sepBy` many (skipHorizontalSpace *> "\n") + +-- | Parse a text paragraph. Actually just a wrapper over 'parseStringBS' which +-- drops leading whitespace and encodes the string to UTF8 first. +parseString :: String -> DocH mod Identifier +parseString = parseStringBS . encodeUtf8 . dropWhile isSpace + +parseStringBS :: BS.ByteString -> DocH mod Identifier +parseStringBS = parse p + where + p :: Parser (DocH mod Identifier) + p = mconcat <$> many (monospace <|> anchor <|> identifier + <|> moduleName <|> picture <|> hyperlink <|> autoUrl <|> bold + <|> emphasis <|> encodedChar <|> string' <|> skipSpecialChar) + +-- | Parses and processes +-- <https://en.wikipedia.org/wiki/Numeric_character_reference Numeric character references> +-- +-- >>> parseOnly encodedChar "ABC" +-- Right (DocString "ABC") +encodedChar :: Parser (DocH mod a) +encodedChar = "&#" *> c <* ";" + where + c = DocString . return . chr <$> num + num = hex <|> decimal + hex = ("x" <|> "X") *> hexadecimal + +specialChar :: [Char] +specialChar = "_/<@\"&'`" + +-- | Plain, regular parser for text. Called as one of the last parsers +-- to ensure that we have already given a chance to more meaningful parsers +-- before capturing their characers. +string' :: Parser (DocH mod a) +string' = DocString . unescape . decodeUtf8 <$> takeWhile1_ (`notElem` specialChar) + where + unescape "" = "" + unescape ('\\':x:xs) = x : unescape xs + unescape (x:xs) = x : unescape xs + +-- | Skips a single special character and treats it as a plain string. +-- This is done to skip over any special characters belonging to other +-- elements but which were not deemed meaningful at their positions. +skipSpecialChar :: Parser (DocH mod a) +skipSpecialChar = DocString . return <$> satisfy (`elem` specialChar) + +-- | Emphasis parser. +-- +-- >>> parseOnly emphasis "/Hello world/" +-- Right (DocEmphasis (DocString "Hello world")) +emphasis :: Parser (DocH mod Identifier) +emphasis = DocEmphasis . parseStringBS <$> + mfilter ('\n' `BS.notElem`) ("/" *> takeWhile1_ (/= '/') <* "/") + +-- | Bold parser. +-- +-- >>> parseOnly bold "__Hello world__" +-- Right (DocBold (DocString "Hello world")) +bold :: Parser (DocH mod Identifier) +bold = DocBold . parseStringBS <$> disallowNewline ("__" *> takeUntil "__") + +disallowNewline :: Parser BS.ByteString -> Parser BS.ByteString +disallowNewline = mfilter ('\n' `BS.notElem`) + +-- | Like `takeWhile`, but unconditionally take escaped characters. +takeWhile_ :: (Char -> Bool) -> Parser BS.ByteString +takeWhile_ p = scan False p_ + where + p_ escaped c + | escaped = Just False + | not $ p c = Nothing + | otherwise = Just (c == '\\') + +-- | Like `takeWhile1`, but unconditionally take escaped characters. +takeWhile1_ :: (Char -> Bool) -> Parser BS.ByteString +takeWhile1_ = mfilter (not . BS.null) . takeWhile_ + +-- | Text anchors to allow for jumping around the generated documentation. +-- +-- >>> parseOnly anchor "#Hello world#" +-- Right (DocAName "Hello world") +anchor :: Parser (DocH mod a) +anchor = DocAName . decodeUtf8 <$> ("#" *> takeWhile1 (`notElem` "#\n") <* "#") + +-- | Monospaced strings. +-- +-- >>> parseOnly monospace "@cruel@" +-- Right (DocMonospaced (DocString "cruel")) +monospace :: Parser (DocH mod Identifier) +monospace = DocMonospaced . parseStringBS <$> ("@" *> takeWhile1_ (/= '@') <* "@") + +moduleName :: Parser (DocH mod a) +moduleName = DocModule <$> (char '"' *> modid <* char '"') + where + modid = intercalate "." <$> conid `sepBy1` "." + conid = (:) + <$> satisfy isAsciiUpper + -- NOTE: According to Haskell 2010 we shouldd actually only + -- accept {small | large | digit | ' } here. But as we can't + -- match on unicode characters, this is currently not possible. + <*> (decodeUtf8 <$> takeWhile (`notElem` " .&[{}(=*)+]!#|@/;,^?\"\n")) + +-- | Picture parser, surrounded by \<\< and \>\>. It's possible to specify +-- a title for the picture. +-- +-- >>> parseOnly picture "<<hello.png>>" +-- Right (DocPic (Picture "hello.png" Nothing)) +-- >>> parseOnly picture "<<hello.png world>>" +-- Right (DocPic (Picture "hello.png" (Just "world"))) +picture :: Parser (DocH mod a) +picture = DocPic . makeLabeled Picture . decodeUtf8 + <$> disallowNewline ("<<" *> takeUntil ">>") + +-- | Paragraph parser, called by 'parseParas'. +paragraph :: Parser (DocH mod Identifier) +paragraph = examples <|> skipSpace *> (list <|> birdtracks <|> codeblock + <|> property <|> header + <|> textParagraph) + +header :: Parser (DocH mod Identifier) +header = do + let psers = map (string . encodeUtf8 . concat . flip replicate "=") [6, 5 .. 1] + pser = foldl1 (<|>) psers + delim <- decodeUtf8 <$> pser + line <- skipHorizontalSpace *> nonEmptyLine >>= return . parseString + rest <- paragraph <|> return mempty + return $ DocParagraph (DocHeader (Header (length delim) line)) <> rest + +textParagraph :: Parser (DocH mod Identifier) +textParagraph = docParagraph . parseString . intercalate "\n" <$> many1 nonEmptyLine + +-- | List parser, called by 'paragraph'. +list :: Parser (DocH mod Identifier) +list = DocUnorderedList <$> unorderedList + <|> DocOrderedList <$> orderedList + <|> DocDefList <$> definitionList + +-- | Parses unordered (bullet) lists. +unorderedList :: Parser [DocH mod Identifier] +unorderedList = ("*" <|> "-") *> innerList unorderedList + +-- | Parses ordered lists (numbered or dashed). +orderedList :: Parser [DocH mod Identifier] +orderedList = (paren <|> dot) *> innerList orderedList + where + dot = (decimal :: Parser Int) <* "." + paren = "(" *> decimal <* ")" + +-- | Generic function collecting any further lines belonging to the +-- list entry and recursively collecting any further lists in the +-- same paragraph. Usually used as +-- +-- > someListFunction = listBeginning *> innerList someListFunction +innerList :: Parser [DocH mod Identifier] -> Parser [DocH mod Identifier] +innerList item = do + c <- takeLine + (cs, items) <- more item + let contents = docParagraph . parseString . dropNLs . unlines $ c : cs + return $ case items of + Left p -> [contents `joinPara` p] + Right i -> contents : i + +-- | Parses definition lists. +definitionList :: Parser [(DocH mod Identifier, DocH mod Identifier)] +definitionList = do + label <- "[" *> (parseStringBS <$> takeWhile1 (`notElem` "]\n")) <* "]" + c <- takeLine + (cs, items) <- more definitionList + let contents = parseString . dropNLs . unlines $ c : cs + return $ case items of + Left p -> [(label, contents `joinPara` p)] + Right i -> (label, contents) : i + +-- | If possible, appends two 'Doc's under a 'DocParagraph' rather than +-- outside of it. This allows to get structures like +-- +-- @DocParagraph (DocAppend … …)@ +-- +-- rather than +-- +-- @DocAppend (DocParagraph …) …@ +joinPara :: DocH mod id -> DocH mod id -> DocH mod id +joinPara (DocParagraph p) c = docParagraph $ p <> c +joinPara d p = d <> p + +-- | Drops all trailing newlines. +dropNLs :: String -> String +dropNLs = reverse . dropWhile (== '\n') . reverse + +-- | Main worker for 'innerList' and 'definitionList'. +-- We need the 'Either' here to be able to tell in the respective functions +-- whether we're dealing with the next list or a nested paragraph. +more :: Monoid a => Parser a + -> Parser ([String], Either (DocH mod Identifier) a) +more item = innerParagraphs <|> moreListItems item + <|> moreContent item <|> pure ([], Right mempty) + +-- | Use by 'innerList' and 'definitionList' to parse any nested paragraphs. +innerParagraphs :: Parser ([String], Either (DocH mod Identifier) a) +innerParagraphs = (,) [] . Left <$> ("\n" *> indentedParagraphs) + +-- | Attemps to fetch the next list if possibly. Used by 'innerList' and +-- 'definitionList' to recursivly grab lists that aren't separated by a whole +-- paragraph. +moreListItems :: Parser a + -> Parser ([String], Either (DocH mod Identifier) a) +moreListItems item = (,) [] . Right <$> (skipSpace *> item) + +-- | Helper for 'innerList' and 'definitionList' which simply takes +-- a line of text and attempts to parse more list content with 'more'. +moreContent :: Monoid a => Parser a + -> Parser ([String], Either (DocH mod Identifier) a) +moreContent item = first . (:) <$> nonEmptyLine <*> more item + +-- | Runs the 'parseParas' parser on an indented paragraph. +-- The indentation is 4 spaces. +indentedParagraphs :: Parser (DocH mod Identifier) +indentedParagraphs = parseParas . concat <$> dropFrontOfPara " " + +-- | Grab as many fully indented paragraphs as we can. +dropFrontOfPara :: Parser BS.ByteString -> Parser [String] +dropFrontOfPara sp = do + currentParagraph <- some (sp *> takeNonEmptyLine) + followingParagraphs <- + skipHorizontalSpace *> nextPar -- we have more paragraphs to take + <|> skipHorizontalSpace *> nlList -- end of the ride, remember the newline + <|> endOfInput *> return [] -- nothing more to take at all + return (currentParagraph ++ followingParagraphs) + where + nextPar = (++) <$> nlList <*> dropFrontOfPara sp + nlList = "\n" *> return ["\n"] + +nonSpace :: BS.ByteString -> Parser BS.ByteString +nonSpace xs + | not $ any (not . isSpace) $ decodeUtf8 xs = fail "empty line" + | otherwise = return xs + +-- | Takes a non-empty, not fully whitespace line. +-- +-- Doesn't discard the trailing newline. +takeNonEmptyLine :: Parser String +takeNonEmptyLine = do + (++ "\n") . decodeUtf8 <$> (takeWhile1 (/= '\n') >>= nonSpace) <* "\n" + +birdtracks :: Parser (DocH mod a) +birdtracks = DocCodeBlock . DocString . intercalate "\n" . stripSpace <$> many1 line + where + line = skipHorizontalSpace *> ">" *> takeLine + +stripSpace :: [String] -> [String] +stripSpace = fromMaybe <*> mapM strip' + where + strip' (' ':xs') = Just xs' + strip' "" = Just "" + strip' _ = Nothing + +-- | Parses examples. Examples are a paragraph level entitity (separated by an empty line). +-- Consecutive examples are accepted. +examples :: Parser (DocH mod a) +examples = DocExamples <$> (many (skipHorizontalSpace *> "\n") *> go) + where + go :: Parser [Example] + go = do + prefix <- decodeUtf8 <$> takeHorizontalSpace <* ">>>" + expr <- takeLine + (rs, es) <- resultAndMoreExamples + return (makeExample prefix expr rs : es) + where + resultAndMoreExamples :: Parser ([String], [Example]) + resultAndMoreExamples = moreExamples <|> result <|> pure ([], []) + where + moreExamples :: Parser ([String], [Example]) + moreExamples = (,) [] <$> go + + result :: Parser ([String], [Example]) + result = first . (:) <$> nonEmptyLine <*> resultAndMoreExamples + + makeExample :: String -> String -> [String] -> Example + makeExample prefix expression res = + Example (strip expression) result + where + result = map (substituteBlankLine . tryStripPrefix) res + + tryStripPrefix xs = fromMaybe xs (stripPrefix prefix xs) + + substituteBlankLine "<BLANKLINE>" = "" + substituteBlankLine xs = xs + +nonEmptyLine :: Parser String +nonEmptyLine = mfilter (any (not . isSpace)) takeLine + +takeLine :: Parser String +takeLine = decodeUtf8 <$> takeWhile (/= '\n') <* endOfLine + +endOfLine :: Parser () +endOfLine = void "\n" <|> endOfInput + +-- | Property parser. +-- +-- >>> parseOnly property "prop> hello world" +-- Right (DocProperty "hello world") +property :: Parser (DocH mod a) +property = DocProperty . strip . decodeUtf8 <$> ("prop>" *> takeWhile1 (/= '\n')) + +-- | +-- Paragraph level codeblock. Anything between the two delimiting @ is parsed +-- for markup. +codeblock :: Parser (DocH mod Identifier) +codeblock = + DocCodeBlock . parseStringBS . dropSpaces + <$> ("@" *> skipHorizontalSpace *> "\n" *> block' <* "@") + where + dropSpaces xs = + let rs = decodeUtf8 xs + in case splitByNl rs of + [] -> xs + ys -> case last ys of + ' ':_ -> case mapM dropSpace ys of + Nothing -> xs + Just zs -> encodeUtf8 $ intercalate "\n" zs + _ -> xs + + -- This is necessary because ‘lines’ swallows up a trailing newline + -- and we lose information about whether the last line belongs to @ or to + -- text which we need to decide whether we actually want to be dropping + -- anything at all. + splitByNl = unfoldr (\case '\n':s -> Just (span (/= '\n') s) + _ -> Nothing) + . ('\n' :) + + dropSpace "" = Just "" + dropSpace (' ':xs) = Just xs + dropSpace _ = Nothing + + block' = scan False p + where + p isNewline c + | isNewline && c == '@' = Nothing + | isNewline && isSpace c = Just isNewline + | otherwise = Just $ c == '\n' + +hyperlink :: Parser (DocH mod a) +hyperlink = DocHyperlink . makeLabeled Hyperlink . decodeUtf8 + <$> disallowNewline ("<" *> takeUntil ">") + <|> autoUrl + +autoUrl :: Parser (DocH mod a) +autoUrl = mkLink <$> url + where + url = mappend <$> ("http://" <|> "https://" <|> "ftp://") <*> takeWhile1 (not . isSpace) + mkLink :: BS.ByteString -> DocH mod a + mkLink s = case BS.unsnoc s of + Just (xs, x) | x `elem` ",.!?" -> DocHyperlink (Hyperlink (decodeUtf8 xs) Nothing) <> DocString [x] + _ -> DocHyperlink (Hyperlink (decodeUtf8 s) Nothing) + +-- | Parses strings between identifier delimiters. Consumes all input that it +-- deems to be valid in an identifier. Note that it simply blindly consumes +-- characters and does no actual validation itself. +parseValid :: Parser String +parseValid = do + vs' <- many' $ utf8String "⋆" <|> return <$> idChar + let vs = concat vs' + c <- peekChar + case c of + Just '`' -> return vs + Just '\'' -> (\x -> vs ++ "'" ++ x) <$> ("'" *> parseValid) + <|> return vs + _ -> fail "outofvalid" + where + idChar = satisfy (`elem` "_.!#$%&*+/<=>?@\\|-~:^") + <|> digit <|> letter_ascii + +-- | Parses UTF8 strings from ByteString streams. +utf8String :: String -> Parser String +utf8String x = decodeUtf8 <$> string (encodeUtf8 x) + +-- | Parses identifiers with help of 'parseValid'. Asks GHC for 'String' from the +-- string it deems valid. +identifier :: Parser (DocH mod Identifier) +identifier = do + o <- idDelim + vid <- parseValid + e <- idDelim + return $ DocIdentifier (o, vid, e) + where + idDelim = char '\'' <|> char '`' diff --git a/haddock-library/src/Documentation/Haddock/Parser/Util.hs b/haddock-library/src/Documentation/Haddock/Parser/Util.hs new file mode 100644 index 00000000..25dba2d5 --- /dev/null +++ b/haddock-library/src/Documentation/Haddock/Parser/Util.hs @@ -0,0 +1,61 @@ +-- | +-- Module : Documentation.Haddock.Parser.Util +-- Copyright : (c) Mateusz Kowalczyk 2013-2014, +-- Simon Hengel 2013 +-- License : BSD-like +-- +-- Maintainer : haddock@projects.haskell.org +-- Stability : experimental +-- Portability : portable +-- +-- Various utility functions used by the parser. +module Documentation.Haddock.Parser.Util where + +import Control.Applicative +import Control.Monad (mfilter) +import Data.Attoparsec.ByteString.Char8 hiding (parse, take, endOfLine) +import Data.ByteString.Char8 (ByteString) +import qualified Data.ByteString.Char8 as BS +import Prelude hiding (takeWhile) + +-- | Remove all leading and trailing whitespace +strip :: String -> String +strip = (\f -> f . f) $ dropWhile isSpace . reverse + +skipHorizontalSpace :: Parser () +skipHorizontalSpace = skipWhile (`elem` " \t\f\v\r") + +takeHorizontalSpace :: Parser BS.ByteString +takeHorizontalSpace = takeWhile (`elem` " \t\f\v\r") + +makeLabeled :: (String -> Maybe String -> a) -> String -> a +makeLabeled f input = case break isSpace $ removeEscapes $ strip input of + (uri, "") -> f uri Nothing + (uri, label) -> f uri (Just $ dropWhile isSpace label) + where + -- As we don't parse these any further, we don't do any processing to the + -- string so we at least remove escape character here. Perhaps we should + -- actually be parsing the label at the very least? + removeEscapes "" = "" + removeEscapes ('\\':'\\':xs) = '\\' : removeEscapes xs + removeEscapes ('\\':xs) = removeEscapes xs + removeEscapes (x:xs) = x : removeEscapes xs + +takeUntil :: ByteString -> Parser ByteString +takeUntil end_ = dropEnd <$> requireEnd (scan (False, end) p) >>= gotSome + where + end = BS.unpack end_ + + p :: (Bool, String) -> Char -> Maybe (Bool, String) + p acc c = case acc of + (True, _) -> Just (False, end) + (_, []) -> Nothing + (_, x:xs) | x == c -> Just (False, xs) + _ -> Just (c == '\\', end) + + dropEnd = BS.reverse . BS.drop (length end) . BS.reverse + requireEnd = mfilter (BS.isSuffixOf end_) + + gotSome xs + | BS.null xs = fail "didn't get any content" + | otherwise = return xs diff --git a/haddock-library/src/Documentation/Haddock/Types.hs b/haddock-library/src/Documentation/Haddock/Types.hs new file mode 100644 index 00000000..3e25d230 --- /dev/null +++ b/haddock-library/src/Documentation/Haddock/Types.hs @@ -0,0 +1,70 @@ +{-# LANGUAGE DeriveDataTypeable, DeriveFunctor, DeriveFoldable #-} +{-# LANGUAGE DeriveTraversable, StandaloneDeriving #-} + +-- | +-- Module : Documentation.Haddock.Types +-- Copyright : (c) Simon Marlow 2003-2006, +-- David Waern 2006-2009, +-- Mateusz Kowalczyk 2013 +-- License : BSD-like +-- +-- Maintainer : haddock@projects.haskellorg +-- Stability : experimental +-- Portability : portable +-- +-- Exposes documentation data types used for (some) of Haddock. +module Documentation.Haddock.Types where + +import Data.Foldable +import Data.Traversable + +instance Foldable Header where + foldMap f (Header _ a) = f a + +instance Traversable Header where + traverse f (Header l a) = Header l `fmap` f a + +data Hyperlink = Hyperlink + { hyperlinkUrl :: String + , hyperlinkLabel :: Maybe String + } deriving (Eq, Show) + + +data Picture = Picture + { pictureUri :: String + , pictureTitle :: Maybe String + } deriving (Eq, Show) + +data Header id = Header + { headerLevel :: Int + , headerTitle :: id + } deriving Functor + +data Example = Example + { exampleExpression :: String + , exampleResult :: [String] + } deriving (Eq, Show) + +data DocH mod id + = DocEmpty + | DocAppend (DocH mod id) (DocH mod id) + | DocString String + | DocParagraph (DocH mod id) + | DocIdentifier id + | DocIdentifierUnchecked mod + | DocModule String + | DocWarning (DocH mod id) + | DocEmphasis (DocH mod id) + | DocMonospaced (DocH mod id) + | DocBold (DocH mod id) + | DocUnorderedList [DocH mod id] + | DocOrderedList [DocH mod id] + | DocDefList [(DocH mod id, DocH mod id)] + | DocCodeBlock (DocH mod id) + | DocHyperlink Hyperlink + | DocPic Picture + | DocAName String + | DocProperty String + | DocExamples [Example] + | DocHeader (Header (DocH mod id)) + deriving (Functor, Foldable, Traversable) diff --git a/haddock-library/src/Documentation/Haddock/Utf8.hs b/haddock-library/src/Documentation/Haddock/Utf8.hs new file mode 100644 index 00000000..3f75e53b --- /dev/null +++ b/haddock-library/src/Documentation/Haddock/Utf8.hs @@ -0,0 +1,74 @@ +module Documentation.Haddock.Utf8 (encodeUtf8, decodeUtf8) where +import Data.Bits ((.|.), (.&.), shiftL, shiftR) +import qualified Data.ByteString as BS +import Data.Char (chr, ord) +import Data.Word (Word8) + +-- | Helper that encodes and packs a 'String' into a 'BS.ByteString' +encodeUtf8 :: String -> BS.ByteString +encodeUtf8 = BS.pack . encode + +-- | Helper that unpacks and decodes a 'BS.ByteString' into a 'String' +decodeUtf8 :: BS.ByteString -> String +decodeUtf8 = decode . BS.unpack + +-- Copy/pasted functions from Codec.Binary.UTF8.String for encoding/decoding +-- | Character to use when 'encode' or 'decode' fail for a byte. +replacementCharacter :: Char +replacementCharacter = '\xfffd' + +-- | Encode a Haskell String to a list of Word8 values, in UTF8 format. +encode :: String -> [Word8] +encode = concatMap (map fromIntegral . go . ord) + where + go oc + | oc <= 0x7f = [oc] + + | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) + , 0x80 + oc .&. 0x3f + ] + + | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) + , 0x80 + ((oc `shiftR` 6) .&. 0x3f) + , 0x80 + oc .&. 0x3f + ] + | otherwise = [ 0xf0 + (oc `shiftR` 18) + , 0x80 + ((oc `shiftR` 12) .&. 0x3f) + , 0x80 + ((oc `shiftR` 6) .&. 0x3f) + , 0x80 + oc .&. 0x3f + ] + +-- | Decode a UTF8 string packed into a list of Word8 values, directly to String +decode :: [Word8] -> String +decode [ ] = "" +decode (c:cs) + | c < 0x80 = chr (fromEnum c) : decode cs + | c < 0xc0 = replacementCharacter : decode cs + | c < 0xe0 = multi1 + | c < 0xf0 = multi_byte 2 0xf 0x800 + | c < 0xf8 = multi_byte 3 0x7 0x10000 + | c < 0xfc = multi_byte 4 0x3 0x200000 + | c < 0xfe = multi_byte 5 0x1 0x4000000 + | otherwise = replacementCharacter : decode cs + where + multi1 = case cs of + c1 : ds | c1 .&. 0xc0 == 0x80 -> + let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) + in if d >= 0x000080 then toEnum d : decode ds + else replacementCharacter : decode ds + _ -> replacementCharacter : decode cs + + multi_byte :: Int -> Word8 -> Int -> String + multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) + where + aux 0 rs acc + | overlong <= acc && acc <= 0x10ffff && + (acc < 0xd800 || 0xdfff < acc) && + (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs + | otherwise = replacementCharacter : decode rs + + aux n (r:rs) acc + | r .&. 0xc0 == 0x80 = aux (n-1) rs + $ shiftL acc 6 .|. fromEnum (r .&. 0x3f) + + aux _ rs _ = replacementCharacter : decode rs |