{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} module Servall.WikiParser ( infobox , wikiFilter ) where import Control.Applicative ( (<|>) ) import Data.Attoparsec.Combinator ( lookAhead ) import Data.Attoparsec.Text ( Parser , anyChar , char , choice , many' , many1 , manyTill , notChar , parse , parseOnly , sepBy , skipMany , skipSpace , skipWhile , space , string , takeTill , takeWhile1 ) import Data.Char ( isAlpha , isAlphaNum ) import Data.Either ( fromRight , rights ) import Data.List ( find , intersperse ) import Data.Maybe ( catMaybes , fromMaybe , isNothing , mapMaybe , maybeToList ) import qualified Data.Text as T import Data.Text ( Text ) import GHC.Generics ( Generic ) import Servall.Types import Text.Pandoc.Definition ( Block(..) , Format(..) , Inline(..) , Pandoc(..) , nullAttr ) import Text.Pandoc.Generic ( topDown ) wikilinkPrefix :: Text wikilinkPrefix = "wiki:" parseWikiTemplates :: Text -> Either String [WikiTemplate] parseWikiTemplates = parseOnly wikiP infobox :: Text -> [(Text, Text)] infobox wiki = filterNameValues (maybe [] wtArgs (find isInfobox (fromRight [] (parseWikiTemplates wiki)))) filterNameValues :: [(Maybe Text, Text)] -> [(Text, Text)] filterNameValues = mapMaybe (\(k, v) -> (, v) <$> k) isInfobox :: WikiTemplate -> Bool isInfobox (WikiTemplate name _) = T.isPrefixOf "Infobox" name wikiP :: Parser [WikiTemplate] wikiP = sepBy templateP (many' (commentP <|> (many1 space >> return ()))) templateP :: Parser WikiTemplate templateP = do string "{{" name <- tempHeaderP skipSpace fields <- many' tempFieldP string "}}" return $ WikiTemplate name (catMaybes fields) -- simple rendering of templates in an infobox renderInfoTemplate :: WikiTemplate -> Text renderInfoTemplate (WikiTemplate "URL" ((Nothing, url) : _)) = "<" <> url <> ">" renderInfoTemplate (WikiTemplate "coord" ((Nothing, lat1) : (Nothing, lat2) : (Nothing, lat3) : (Nothing, latd) : (Nothing, lon1) : (Nothing, lon2) : (Nothing, lon3) : (Nothing, lond) : _)) = lat1 <> "°" <> lat2 <> "′" <> lat3 <> "\"" <> latd <> " " <> lon1 <> "°" <> lon2 <> "′" <> lon3 <> "\"" <> lond renderInfoTemplate (WikiTemplate header ((Nothing, year) : (Nothing, month) : (Nothing, date) : _)) | (header == "start date and age" || header == "Death date and age") = "[" <> year <> "-" <> (if (T.length month == 1) then "0" else "") <> month <> "-" <> date <> "]" renderInfoTemplate _ = "" tempHeaderP :: Parser Text tempHeaderP = do -- Not sure whether name can contain numbers -- Does not consider multiple words like Short description name1 <- takeWhile1 isAlpha name2 <- manyTill anyChar (lookAhead (skipSpace >> ((char '|' >> return "") <|> string "}}"))) return $ name1 <> T.pack name2 -- parse a template field tempFieldP :: Parser (Maybe (Maybe Text, Text)) tempFieldP = do char '|' >> skipSpace namedField <|> fmap (Just . (Nothing, )) tempFieldValueP where namedField = do key <- takeWhile1 (\c -> isAlphaNum c || c `elem` ['_', '-']) skipSpace char '=' skipSpace value <- tempFieldValueP return $ if T.null value then Nothing else Just (Just key, value) -- parse a field value tempFieldValueP :: Parser Text tempFieldValueP = do T.concat <$> many' (choice [ skipRefP , renderInfoTemplate <$> templateP , commentP >> return "" , wikilinkP , simpleP ] ) where simpleP :: Parser Text simpleP = do t <- T.pack <$> manyTill anyChar (lookAhead ( string "[[" <|> string "{{" <|> string "") >> return () wikilinkP :: Parser Text wikilinkP = do beg <- string "[[" content <- T.pack <$> manyTill anyChar (string "]]") let (first, second) = T.breakOn "|" content return $ "[[" <> wikilinkPrefix <> first <> "][" <> (if T.null second then first else T.tail second) <> "]]" skipRefP :: Parser Text skipRefP = (ref1 <|> ref2) >> "" where ref1 = string "> (char '>' <|> char ' ') >> manyTill anyChar (string "") ref2 = string "> manyTill (notChar '>') (string "/>") wikiFilter :: Text -> Pandoc -> Pandoc wikiFilter title = topDown fixInline . fixHeaderTemplates . insertHeader title insertHeader :: Text -> Pandoc -> Pandoc insertHeader title (Pandoc m bs) = Pandoc m (Header 1 nullAttr [Str title] : bs) fixHeaderTemplates :: Pandoc -> Pandoc fixHeaderTemplates (Pandoc m blocks) = Pandoc m (go [] blocks) where go :: [Block] -> [Block] -> [Block] go acc [] = acc go acc (b : bs) = let (raws, rest) = span isTemplateBlock bs -- assume all parsing returns Right in go (acc ++ processTemplates [b] (rights (map (parseOnly templateP . templateFromRawBlock) raws)) ) rest processTemplates :: [Block] -> [WikiTemplate] -> [Block] processTemplates bs [] = bs processTemplates (Header level (x, y, attr) content : bs) (t : ts) | isInfobox t = processTemplates (Header level (x, y, attr ++ filterNameValues (wtArgs t)) content : bs) ts processTemplates bs (t : ts) = processTemplates (bs ++ templateToBlocks t) ts templateToBlocks :: WikiTemplate -> [Block] templateToBlocks (WikiTemplate key pairs) | key `elem` ["Main", "See also", "Further"] = [ Para [ Emph ([Str (showKey key)] ++ intersperse (Str ", ") ( map (\(_, title) -> Link nullAttr [Str title] (title, "wikilink")) $ filter (\(k, v) -> isNothing k && (not (T.null v))) pairs ) ) ] ] where showKey "Main" = "Main article(s): " showKey "See also" = "See also: " showKey "Further" = "Further information: " showKey _ = error "The impossible happened." templateToBlocks _ = [] isTemplateBlock :: Block -> Bool isTemplateBlock (RawBlock (Format "mediawiki") temp) = True isTemplateBlock _ = False templateFromRawBlock :: Block -> Text templateFromRawBlock (RawBlock (Format "mediawiki") temp) = temp templateFromRawBlock _ = "" fixInline :: Inline -> Inline fixInline (Link attr label (url, "wikilink")) = Link attr label ("wiki:" <> url, "") fixInline (Note (RawBlock (Format "mediawiki") temp : rest)) = let fields = (fromRight [] . fmap (filterNameValues . wtArgs)) (parseOnly templateP temp) url = lookup "url" fields title = lookup "title" fields link = (\url' -> Link nullAttr (maybeToList (Str <$> title)) (url', fromMaybe "" title) ) <$> url in case link of Just link' -> Note (Plain [link'] : rest) _ -> Note rest fixInline x = x