# path: root/rss.py



# Copyright (C) 2021 Yuchen Pei.

# This file is part of site generator for Yuchen's website (abbreviated to sg4y).

# sg4y is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# sg4y is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with sg4y.  If not, see <https://www.gnu.org/licenses/>.

import datetime
from lxml import etree, html
import os
import re

# Public base URL of the site; the feed links below are built from it.
_DOMAIN = 'https://ypei.me'
# Tag name and attributes of the root element shared by both feeds.
_RSS_TAGNAME = 'rss'
_RSS_ATTRIBS = {'version': '2.0'}

# Blog: directory of rendered post pages, directory of their org-mode
# sources, URL prefix for post links, and the feed's output path.
_POSTS_HTML_DIR = 'site/posts'
_POSTS_ORG_DIR = 'posts'
_POSTS_BASE_URL = _DOMAIN + '/posts'
_BLOG_FEED_PATH = 'site/blog-feed.xml'

# Microblog: public URL of the page, path of the rendered page that is
# parsed for entries, and the feed's output path.
_MICROBLOG_URL = _DOMAIN + '/microblog.html'
_MICROBLOG_PATH = 'site/microblog.html'
_MICROBLOG_FEED_PATH = 'site/microblog-feed.xml'


def get_date(org):
    """Return the publication date declared in an org-mode source.

    The date is read from the first line of the form ``#+date: <...>``
    (or ``#+DATE: <...>``); the text between the angle brackets is
    returned unchanged.

    Raises:
        ValueError: if ``org`` contains no such line.
    """
    # Raw string: '\+' in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    match = re.search(r'^#\+(date|DATE): <(.*)>$', org, re.MULTILINE)
    if match is None:
        raise ValueError('no "#+date: <...>" line found in org source')
    return match.group(2)

def make_rss_root():
    """Create the top-level feed element with its configured attributes."""
    root = etree.Element(_RSS_TAGNAME)
    for name, value in _RSS_ATTRIBS.items():
        root.set(name, value)
    return root

def make_post_item(post_html, post_org, post_filename):
    """Build the RSS <item> element for one blog post.

    post_html is the rendered page (parsed with lxml.html), post_org the
    org-mode source the date is read from, and post_filename the name of
    the HTML file, which is appended to the posts base URL to form the
    link.

    The item's children are, in order: the page's <title>, a
    CDATA-wrapped <description> built from the children of
    <div id="content">, <link> and <pubDate>.
    """
    item = etree.Element('item')
    page = html.fromstring(post_html)

    # Appending moves the <title> element out of the parsed page.
    title = page.find('./head/title')
    item.append(title)

    # Rename the content <div> so that wrap_in_cdata emits <description>.
    body = page.find('.//div[@id="content"]')
    body.tag = 'description'
    body.attrib.clear()
    description = wrap_in_cdata(body)
    item.append(description)

    url = _POSTS_BASE_URL + '/' + post_filename
    link = etree.SubElement(item, 'link')
    link.text = url

    date = get_date(post_org)
    published = etree.SubElement(item, 'pubDate')
    published.text = date
    return item

def make_post_items():
    """Build an RSS <item> for every rendered post.

    Each ``*.html`` file in the posts HTML directory is paired with the
    ``.org`` file of the same base name in the org directory, which
    supplies the publication date. The returned list is ordered by
    descending <pubDate> text (newest first, assuming the dates sort
    lexicographically -- TODO confirm the org date format).
    """
    items = []
    # Sort the listing: os.listdir returns names in arbitrary order, and
    # the stable sort below would otherwise leave posts that share a date
    # in a filesystem-dependent order.
    for filename in sorted(os.listdir(_POSTS_HTML_DIR)):
        stem, ext = os.path.splitext(filename)
        if ext != '.html':
            continue
        # Context managers close both files promptly instead of leaving
        # that to the garbage collector.
        with open(_POSTS_HTML_DIR + '/' + filename, 'rb') as html_file:
            post_html = html_file.read()
        with open(_POSTS_ORG_DIR + '/' + stem + '.org') as org_file:
            post_org = org_file.read()
        items.append(make_post_item(post_html, post_org, filename))
    items.sort(key=lambda item: item.find('./pubDate').text, reverse=True)
    return items

def make_blog_channel():
    """Build the <channel> element of the blog feed.

    The channel holds the feed metadata (title, description, link,
    copyright, lastBuildDate) followed by the <item> elements returned by
    make_post_items(), in the order that function returns them.
    """
    # Function-scope import keeps this block self-contained.
    from email.utils import format_datetime

    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'link').text = _DOMAIN + '/blog.html'
    etree.SubElement(channel, 'copyright').text = '2013-2021 Yuchen Pei, licensed under CC BY-SA 4.0'
    # RSS 2.0 requires RFC 822 date-times; str(datetime.now()) produced an
    # ISO-like string ("2021-06-17 12:00:00.123456") instead.
    now = datetime.datetime.now(datetime.timezone.utc)
    etree.SubElement(channel, 'lastBuildDate').text = format_datetime(now, usegmt=True)
    for post in make_post_items():
        channel.append(post)
    return channel

def make_blog_rss():
    """Build the blog feed, write it to the blog feed path, and return it.

    Returns the root feed element that was serialized.
    """
    root = make_rss_root()
    root.append(make_blog_channel())
    # The serialized text carries no XML declaration, so readers will
    # assume UTF-8: write it as UTF-8 explicitly rather than in the locale
    # encoding, and close the file deterministically.
    with open(_BLOG_FEED_PATH, 'w', encoding='utf-8') as feed:
        feed.write(etree.tostring(root, encoding='unicode'))
    return root

def wrap_in_cdata(element):
    """Return a new element holding ``element``'s children as CDATA.

    The result has the same tag as ``element`` and no attributes; its
    content is a CDATA section containing the serialized child elements
    (each serialized together with its tail text). ``element``'s own
    leading text is not included.
    """
    parser = etree.XMLParser(strip_cdata=False)
    inner = ''.join(etree.tostring(child, encoding='unicode')
                    for child in element)
    # A literal "]]>" in the markup (e.g. inside an HTML comment) would
    # end the CDATA section early and corrupt the document; split it
    # across two adjacent CDATA sections instead.
    inner = inner.replace(']]>', ']]]]><![CDATA[>')
    return etree.XML(f"""<{element.tag}><![CDATA[
{inner}
]]>
</{element.tag}>
""", parser)

def make_micropost_item(micropost_html):
    """
    Build the RSS <item> for one microblog entry.

    The entry's first <p> child is its header, laid out as:
    <p>
    <b><a href="#ia-lawsuit">2020-08-02</a></b> - ia lawsuit
    <a id="ia-lawsuit"></a>
    </p>

    The first link's href is appended to the microblog URL to form <link>,
    its text becomes <pubDate>, and the text following </b> (minus its
    first three characters, then stripped) becomes <title>. The header is
    then removed and the remaining children become the CDATA-wrapped
    <description>. Note that micropost_html is modified in place.
    """
    header = micropost_html.find('./p')
    anchor = header.find('.//a')

    fragment = anchor.attrib['href']
    published = anchor.text
    # Drop the " - " separator that follows the bold date.
    heading = header.find('./b').tail[3:].strip()

    item = etree.Element('item')
    link = etree.SubElement(item, 'link')
    link.text = _MICROBLOG_URL + fragment
    pub_date = etree.SubElement(item, 'pubDate')
    pub_date.text = published
    title = etree.SubElement(item, 'title')
    title.text = heading

    # Whatever is left once the header is gone is the entry's body.
    micropost_html.remove(header)
    micropost_html.tag = 'description'
    description = wrap_in_cdata(micropost_html)
    item.append(description)
    return item

def make_and_add_micropost_items(channel):
    """Append one RSS <item> to ``channel`` for every microblog entry.

    Entries are the <li> elements directly under a <ul> child of
    <div id="content"> in the rendered microblog page, taken in document
    order.
    """
    # Read through a context manager so the file is closed promptly
    # rather than whenever the file object happens to be collected.
    with open(_MICROBLOG_PATH, 'rb') as page:
        microblog = html.fromstring(page.read())
    for micropost in microblog.findall('.//div[@id="content"]/ul/li'):
        channel.append(make_micropost_item(micropost))

def make_microblog_channel():
    """Build the <channel> element of the microblog feed.

    The channel holds the feed metadata (title, description, link,
    lastBuildDate) followed by the items appended by
    make_and_add_micropost_items().
    """
    # Function-scope import keeps this block self-contained.
    from email.utils import format_datetime

    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'link').text = _MICROBLOG_URL
    # RSS 2.0 requires RFC 822 date-times; str(datetime.now()) produced an
    # ISO-like string ("2021-06-17 12:00:00.123456") instead.
    now = datetime.datetime.now(datetime.timezone.utc)
    etree.SubElement(channel, 'lastBuildDate').text = format_datetime(now, usegmt=True)
    make_and_add_micropost_items(channel)
    return channel

def make_microblog_rss():
    """Build the microblog feed, write it to its feed path, and return it.

    Returns the root feed element that was serialized.
    """
    root = make_rss_root()
    root.append(make_microblog_channel())
    # The serialized text carries no XML declaration, so readers will
    # assume UTF-8: write it as UTF-8 explicitly rather than in the locale
    # encoding, and close the file deterministically.
    with open(_MICROBLOG_FEED_PATH, 'w', encoding='utf-8') as feed:
        feed.write(etree.tostring(root, encoding='unicode'))
    return root

def main():
    """Generate the blog feed, then the microblog feed, reporting each."""
    make_blog_rss()
    blog_notice = f"Published blog rss to {_BLOG_FEED_PATH}."
    print(blog_notice)

    make_microblog_rss()
    microblog_notice = f"Published microblog rss to {_MICROBLOG_FEED_PATH}."
    print(microblog_notice)

# Script entry point: generate both feeds when the file is run directly.
# NOTE(review): the module text is repeated after this guard, so a script
# run calls main() here and again at the guard that ends the file. This
# looks like an accidental duplication -- confirm and deduplicate.
if __name__ == '__main__':
    main()
# Copyright (C) 2021 Yuchen Pei.

# This file is part of site generator for Yuchen's website (abbreviated to sg4y).

# sg4y is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# sg4y is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with sg4y.  If not, see <https://www.gnu.org/licenses/>.

import datetime
from lxml import etree, html
import os
import re

# Public base URL of the site; the feed links below are built from it.
_DOMAIN = 'https://ypei.me'
# Tag name and attributes of the root element shared by both feeds.
_RSS_TAGNAME = 'rss'
_RSS_ATTRIBS = {'version': '2.0'}

# Blog: directory of rendered post pages, directory of their org-mode
# sources, URL prefix for post links, and the feed's output path.
_POSTS_HTML_DIR = 'site/posts'
_POSTS_ORG_DIR = 'posts'
_POSTS_BASE_URL = _DOMAIN + '/posts'
_BLOG_FEED_PATH = 'site/blog-feed.xml'

# Microblog: public URL of the page, path of the rendered page that is
# parsed for entries, and the feed's output path.
_MICROBLOG_URL = _DOMAIN + '/microblog.html'
_MICROBLOG_PATH = 'site/microblog.html'
_MICROBLOG_FEED_PATH = 'site/microblog-feed.xml'


def get_date(org):
    """Return the publication date declared in an org-mode source.

    The date is read from the first line of the form ``#+date: <...>``
    (or ``#+DATE: <...>``); the text between the angle brackets is
    returned unchanged.

    Raises:
        ValueError: if ``org`` contains no such line.
    """
    # Raw string: '\+' in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    match = re.search(r'^#\+(date|DATE): <(.*)>$', org, re.MULTILINE)
    if match is None:
        raise ValueError('no "#+date: <...>" line found in org source')
    return match.group(2)

def make_rss_root():
    """Create the top-level feed element with its configured attributes."""
    root = etree.Element(_RSS_TAGNAME)
    for name, value in _RSS_ATTRIBS.items():
        root.set(name, value)
    return root

def make_post_item(post_html, post_org, post_filename):
    """Build the RSS <item> element for one blog post.

    post_html is the rendered page (parsed with lxml.html), post_org the
    org-mode source the date is read from, and post_filename the name of
    the HTML file, which is appended to the posts base URL to form the
    link.

    The item's children are, in order: the page's <title>, a
    CDATA-wrapped <description> built from the children of
    <div id="content">, <link> and <pubDate>.
    """
    item = etree.Element('item')
    page = html.fromstring(post_html)

    # Appending moves the <title> element out of the parsed page.
    title = page.find('./head/title')
    item.append(title)

    # Rename the content <div> so that wrap_in_cdata emits <description>.
    body = page.find('.//div[@id="content"]')
    body.tag = 'description'
    body.attrib.clear()
    description = wrap_in_cdata(body)
    item.append(description)

    url = _POSTS_BASE_URL + '/' + post_filename
    link = etree.SubElement(item, 'link')
    link.text = url

    date = get_date(post_org)
    published = etree.SubElement(item, 'pubDate')
    published.text = date
    return item

def make_post_items():
    """Build an RSS <item> for every rendered post.

    Each ``*.html`` file in the posts HTML directory is paired with the
    ``.org`` file of the same base name in the org directory, which
    supplies the publication date. The returned list is ordered by
    descending <pubDate> text (newest first, assuming the dates sort
    lexicographically -- TODO confirm the org date format).
    """
    items = []
    # Sort the listing: os.listdir returns names in arbitrary order, and
    # the stable sort below would otherwise leave posts that share a date
    # in a filesystem-dependent order.
    for filename in sorted(os.listdir(_POSTS_HTML_DIR)):
        stem, ext = os.path.splitext(filename)
        if ext != '.html':
            continue
        # Context managers close both files promptly instead of leaving
        # that to the garbage collector.
        with open(_POSTS_HTML_DIR + '/' + filename, 'rb') as html_file:
            post_html = html_file.read()
        with open(_POSTS_ORG_DIR + '/' + stem + '.org') as org_file:
            post_org = org_file.read()
        items.append(make_post_item(post_html, post_org, filename))
    items.sort(key=lambda item: item.find('./pubDate').text, reverse=True)
    return items

def make_blog_channel():
    """Build the <channel> element of the blog feed.

    The channel holds the feed metadata (title, description, link,
    copyright, lastBuildDate) followed by the <item> elements returned by
    make_post_items(), in the order that function returns them.
    """
    # Function-scope import keeps this block self-contained.
    from email.utils import format_datetime

    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'link').text = _DOMAIN + '/blog.html'
    etree.SubElement(channel, 'copyright').text = '2013-2021 Yuchen Pei, licensed under CC BY-SA 4.0'
    # RSS 2.0 requires RFC 822 date-times; str(datetime.now()) produced an
    # ISO-like string ("2021-06-17 12:00:00.123456") instead.
    now = datetime.datetime.now(datetime.timezone.utc)
    etree.SubElement(channel, 'lastBuildDate').text = format_datetime(now, usegmt=True)
    for post in make_post_items():
        channel.append(post)
    return channel

def make_blog_rss():
    """Build the blog feed, write it to the blog feed path, and return it.

    Returns the root feed element that was serialized.
    """
    root = make_rss_root()
    root.append(make_blog_channel())
    # The serialized text carries no XML declaration, so readers will
    # assume UTF-8: write it as UTF-8 explicitly rather than in the locale
    # encoding, and close the file deterministically.
    with open(_BLOG_FEED_PATH, 'w', encoding='utf-8') as feed:
        feed.write(etree.tostring(root, encoding='unicode'))
    return root

def wrap_in_cdata(element):
    """Return a new element holding ``element``'s children as CDATA.

    The result has the same tag as ``element`` and no attributes; its
    content is a CDATA section containing the serialized child elements
    (each serialized together with its tail text). ``element``'s own
    leading text is not included.
    """
    parser = etree.XMLParser(strip_cdata=False)
    inner = ''.join(etree.tostring(child, encoding='unicode')
                    for child in element)
    # A literal "]]>" in the markup (e.g. inside an HTML comment) would
    # end the CDATA section early and corrupt the document; split it
    # across two adjacent CDATA sections instead.
    inner = inner.replace(']]>', ']]]]><![CDATA[>')
    return etree.XML(f"""<{element.tag}><![CDATA[
{inner}
]]>
</{element.tag}>
""", parser)

def make_micropost_item(micropost_html):
    """
    Build the RSS <item> for one microblog entry.

    The entry's first <p> child is its header, laid out as:
    <p>
    <b><a href="#ia-lawsuit">2020-08-02</a></b> - ia lawsuit
    <a id="ia-lawsuit"></a>
    </p>

    The first link's href is appended to the microblog URL to form <link>,
    its text becomes <pubDate>, and the text following </b> (minus its
    first three characters, then stripped) becomes <title>. The header is
    then removed and the remaining children become the CDATA-wrapped
    <description>. Note that micropost_html is modified in place.
    """
    header = micropost_html.find('./p')
    anchor = header.find('.//a')

    fragment = anchor.attrib['href']
    published = anchor.text
    # Drop the " - " separator that follows the bold date.
    heading = header.find('./b').tail[3:].strip()

    item = etree.Element('item')
    link = etree.SubElement(item, 'link')
    link.text = _MICROBLOG_URL + fragment
    pub_date = etree.SubElement(item, 'pubDate')
    pub_date.text = published
    title = etree.SubElement(item, 'title')
    title.text = heading

    # Whatever is left once the header is gone is the entry's body.
    micropost_html.remove(header)
    micropost_html.tag = 'description'
    description = wrap_in_cdata(micropost_html)
    item.append(description)
    return item

def make_and_add_micropost_items(channel):
    """Append one RSS <item> to ``channel`` for every microblog entry.

    Entries are the <li> elements directly under a <ul> child of
    <div id="content"> in the rendered microblog page, taken in document
    order.
    """
    # Read through a context manager so the file is closed promptly
    # rather than whenever the file object happens to be collected.
    with open(_MICROBLOG_PATH, 'rb') as page:
        microblog = html.fromstring(page.read())
    for micropost in microblog.findall('.//div[@id="content"]/ul/li'):
        channel.append(make_micropost_item(micropost))

def make_microblog_channel():
    """Build the <channel> element of the microblog feed.

    The channel holds the feed metadata (title, description, link,
    lastBuildDate) followed by the items appended by
    make_and_add_micropost_items().
    """
    # Function-scope import keeps this block self-contained.
    from email.utils import format_datetime

    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'link').text = _MICROBLOG_URL
    # RSS 2.0 requires RFC 822 date-times; str(datetime.now()) produced an
    # ISO-like string ("2021-06-17 12:00:00.123456") instead.
    now = datetime.datetime.now(datetime.timezone.utc)
    etree.SubElement(channel, 'lastBuildDate').text = format_datetime(now, usegmt=True)
    make_and_add_micropost_items(channel)
    return channel

def make_microblog_rss():
    """Build the microblog feed, write it to its feed path, and return it.

    Returns the root feed element that was serialized.
    """
    root = make_rss_root()
    root.append(make_microblog_channel())
    # The serialized text carries no XML declaration, so readers will
    # assume UTF-8: write it as UTF-8 explicitly rather than in the locale
    # encoding, and close the file deterministically.
    with open(_MICROBLOG_FEED_PATH, 'w', encoding='utf-8') as feed:
        feed.write(etree.tostring(root, encoding='unicode'))
    return root

def main():
    """Generate the blog feed, then the microblog feed, reporting each."""
    make_blog_rss()
    blog_notice = f"Published blog rss to {_BLOG_FEED_PATH}."
    print(blog_notice)

    make_microblog_rss()
    microblog_notice = f"Published microblog rss to {_MICROBLOG_FEED_PATH}."
    print(microblog_notice)

# Script entry point: generate both feeds when the file is run directly.
if __name__ == '__main__':
    main()