{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
module Servall.WikiParser
( infobox
, wikiFilter
) where
import Control.Applicative ( (<|>) )
import Data.Attoparsec.Combinator ( lookAhead )
import Data.Attoparsec.Text ( Parser
, anyChar
, char
, choice
, many'
, many1
, manyTill
, notChar
, parse
, parseOnly
, sepBy
, skipMany
, skipSpace
, skipWhile
, space
, string
, takeTill
, takeWhile1
)
import Data.Char ( isAlpha
, isAlphaNum
)
import Data.Either ( fromRight
, rights
)
import Data.List ( find
, intersperse
)
import Data.Maybe ( catMaybes
, fromMaybe
, isNothing
, mapMaybe
, maybeToList
)
import qualified Data.Text as T
import Data.Text ( Text )
import GHC.Generics ( Generic )
import Servall.Types
import Text.Pandoc.Definition ( Block(..)
, Format(..)
, Inline(..)
, Pandoc(..)
, nullAttr
)
import Text.Pandoc.Generic ( topDown )
wikilinkPrefix :: Text
wikilinkPrefix = "wiki:"
parseWikiTemplates :: Text -> Either String [WikiTemplate]
parseWikiTemplates = parseOnly wikiP
infobox :: Text -> [(Text, Text)]
infobox wiki = filterNameValues
(maybe [] wtArgs (find isInfobox (fromRight [] (parseWikiTemplates wiki))))
filterNameValues :: [(Maybe Text, Text)] -> [(Text, Text)]
filterNameValues = mapMaybe (\(k, v) -> (, v) <$> k)
isInfobox :: WikiTemplate -> Bool
isInfobox (WikiTemplate name _) = T.isPrefixOf "Infobox" name
wikiP :: Parser [WikiTemplate]
wikiP = sepBy templateP (many' (commentP <|> (many1 space >> return ())))
templateP :: Parser WikiTemplate
templateP = do
string "{{"
name <- tempHeaderP
skipSpace
fields <- many' tempFieldP
string "}}"
return $ WikiTemplate name (catMaybes fields)
-- simple rendering of templates in an infobox
renderInfoTemplate :: WikiTemplate -> Text
renderInfoTemplate (WikiTemplate "URL" ((Nothing, url) : _)) =
"<" <> url <> ">"
renderInfoTemplate (WikiTemplate "coord" ((Nothing, lat1) : (Nothing, lat2) : (Nothing, lat3) : (Nothing, latd) : (Nothing, lon1) : (Nothing, lon2) : (Nothing, lon3) : (Nothing, lond) : _))
= lat1
<> "°"
<> lat2
<> "′"
<> lat3
<> "\""
<> latd
<> " "
<> lon1
<> "°"
<> lon2
<> "′"
<> lon3
<> "\""
<> lond
renderInfoTemplate (WikiTemplate header ((Nothing, year) : (Nothing, month) : (Nothing, date) : _))
| (header == "start date and age" || header == "Death date and age")
= "["
<> year
<> "-"
<> (if (T.length month == 1) then "0" else "")
<> month
<> "-"
<> date
<> "]"
renderInfoTemplate _ = ""
tempHeaderP :: Parser Text
tempHeaderP = do
-- Not sure whether name can contain numbers
-- Does not consider multiple words like Short description
name1 <- takeWhile1 isAlpha
name2 <- manyTill
anyChar
(lookAhead (skipSpace >> ((char '|' >> return "") <|> string "}}")))
return $ name1 <> T.pack name2
-- parse a template field
tempFieldP :: Parser (Maybe (Maybe Text, Text))
tempFieldP = do
char '|' >> skipSpace
namedField <|> fmap (Just . (Nothing, )) tempFieldValueP
where
namedField = do
key <- takeWhile1 (\c -> isAlphaNum c || c `elem` ['_', '-'])
skipSpace
char '='
skipSpace
value <- tempFieldValueP
return $ if T.null value then Nothing else Just (Just key, value)
-- parse a field value
tempFieldValueP :: Parser Text
tempFieldValueP = do
T.concat <$> many'
(choice
[ skipRefP
, renderInfoTemplate <$> templateP
, commentP >> return ""
, wikilinkP
, simpleP
]
)
where
simpleP :: Parser Text
simpleP = do
t <- T.pack <$> manyTill
anyChar
(lookAhead
( string "[["
<|> string "{{"
<|> string "") >> return ()
wikilinkP :: Parser Text
wikilinkP = do
beg <- string "[["
content <- T.pack <$> manyTill anyChar (string "]]")
let (first, second) = T.breakOn "|" content
return
$ "[["
<> wikilinkPrefix
<> first
<> "]["
<> (if T.null second then first else T.tail second)
<> "]]"
skipRefP :: Parser Text
skipRefP = (ref1 <|> ref2) >> ""
where
ref1 = string "[> (char '>' <|> char ' ') >> manyTill
anyChar
(string "]")
ref2 = string "[> manyTill (notChar '>') (string "/>")
wikiFilter :: Text -> Pandoc -> Pandoc
wikiFilter title = topDown fixInline . fixHeaderTemplates . insertHeader title
insertHeader :: Text -> Pandoc -> Pandoc
insertHeader title (Pandoc m bs) =
Pandoc m (Header 1 nullAttr [Str title] : bs)
fixHeaderTemplates :: Pandoc -> Pandoc
fixHeaderTemplates (Pandoc m blocks) = Pandoc m (go [] blocks) where
go :: [Block] -> [Block] -> [Block]
go acc [] = acc
go acc (b : bs) =
let (raws, rest) = span isTemplateBlock bs
-- assume all parsing returns Right
in go
(acc ++ processTemplates
[b]
(rights (map (parseOnly templateP . templateFromRawBlock) raws))
)
rest
processTemplates :: [Block] -> [WikiTemplate] -> [Block]
processTemplates bs [] = bs
processTemplates (Header level (x, y, attr) content : bs) (t : ts)
| isInfobox t = processTemplates
(Header level (x, y, attr ++ filterNameValues (wtArgs t)) content : bs)
ts
processTemplates bs (t : ts) = processTemplates (bs ++ templateToBlocks t) ts
templateToBlocks :: WikiTemplate -> [Block]
templateToBlocks (WikiTemplate key pairs)
| key `elem` ["Main", "See also", "Further"]
= [ Para
[ Emph
([Str (showKey key)] ++ intersperse
(Str ", ")
( map
(\(_, title) -> Link nullAttr [Str title] (title, "wikilink"))
$ filter (\(k, v) -> isNothing k && (not (T.null v))) pairs
)
)
]
]
where
showKey "Main" = "Main article(s): "
showKey "See also" = "See also: "
showKey "Further" = "Further information: "
showKey _ = error "The impossible happened."
templateToBlocks _ = []
isTemplateBlock :: Block -> Bool
isTemplateBlock (RawBlock (Format "mediawiki") temp) = True
isTemplateBlock _ = False
templateFromRawBlock :: Block -> Text
templateFromRawBlock (RawBlock (Format "mediawiki") temp) = temp
templateFromRawBlock _ = ""
fixInline :: Inline -> Inline
fixInline (Link attr label (url, "wikilink")) =
Link attr label ("wiki:" <> url, "")
fixInline (Note (RawBlock (Format "mediawiki") temp : rest)) =
let
fields = (fromRight [] . fmap (filterNameValues . wtArgs))
(parseOnly templateP temp)
url = lookup "url" fields
title = lookup "title" fields
link =
(\url' ->
Link nullAttr (maybeToList (Str <$> title)) (url', fromMaybe "" title)
)
<$> url
in
case link of
Just link' -> Note (Plain [link'] : rest)
_ -> Note rest
fixInline x = x
]