diff options
Diffstat (limited to 'rss.py')
-rw-r--r-- | rss.py | 111 |
1 files changed, 111 insertions, 0 deletions
"""Generate RSS 2.0 feeds for the blog and the microblog.

The blog feed is built from the rendered HTML posts in ``site/posts`` plus
the matching org sources in ``posts`` (used only for the ``#+date:`` line).
The microblog feed is built from the single rendered page
``site/microblog.html``.
"""
import datetime
import os
import re

from lxml import etree, html

_DOMAIN = 'https://ypei.me'
_RSS_TAGNAME = 'rss'
_RSS_ATTRIBS = {'version': '2.0'}

_POSTS_HTML_DIR = 'site/posts'
_POSTS_ORG_DIR = 'posts'
_POSTS_BASE_URL = _DOMAIN + '/posts'
_BLOG_FEED_PATH = 'site/blog-feed.xml'

_MICROBLOG_URL = _DOMAIN + '/microblog.html'
_MICROBLOG_PATH = 'site/microblog.html'
_MICROBLOG_FEED_PATH = 'site/microblog-feed.xml'

# Matches an org keyword line such as ``#+date: <2020-08-02>`` and captures
# the text between the angle brackets.
_ORG_DATE_RE = re.compile(r'^#\+(?:date|DATE): <(.*)>$', re.MULTILINE)


def get_date(org):
    """Return the text inside ``<...>`` of the ``#+date:`` line of *org*.

    Raises:
        ValueError: if *org* contains no ``#+date: <...>`` line.
    """
    match = _ORG_DATE_RE.search(org)
    if match is None:
        raise ValueError('no "#+date: <...>" line found in org source')
    return match.group(1)


def make_rss_root():
    """Return a new, empty ``<rss version="2.0">`` element."""
    return etree.Element(_RSS_TAGNAME, attrib=_RSS_ATTRIBS)


def make_post_item(post_html, post_org, post_filename):
    """Build the ``<item>`` element for one blog post.

    Args:
        post_html: the rendered HTML page of the post (passed to
            ``lxml.html.fromstring``).
        post_org: the org source text of the post; only its date is read.
        post_filename: file name of the HTML page, appended to the posts
            base URL to form the item link.
    """
    post = etree.Element('item')
    tree = html.fromstring(post_html)
    # Appending moves the page's own <title> element into the item.
    post.append(tree.find('./head/title'))
    # Reuse the rendered content div as the description, without its
    # original attributes (id, class, ...).
    content = tree.find('.//div[@id="content"]')
    content.tag = 'description'
    content.attrib.clear()
    post.append(content)
    etree.SubElement(post, 'link').text = _POSTS_BASE_URL + '/' + post_filename
    # NOTE(review): RSS 2.0 specifies RFC 822 dates for pubDate; the org
    # date text is emitted unchanged because make_post_items() sorts on it.
    etree.SubElement(post, 'pubDate').text = get_date(post_org)
    return post


def make_post_items():
    """Return the ``<item>`` elements of all posts, newest first."""
    posts = []
    # Sort the directory listing so that posts sharing the same date come
    # out in a reproducible order (os.listdir order is arbitrary).
    for post in sorted(os.listdir(_POSTS_HTML_DIR)):
        post_sans_ext, ext = os.path.splitext(post)
        if ext != '.html':
            continue
        with open(os.path.join(_POSTS_HTML_DIR, post), 'rb') as html_file:
            post_html = html_file.read()
        org_path = os.path.join(_POSTS_ORG_DIR, post_sans_ext + '.org')
        # presumably the org sources are UTF-8 -- confirm if the site ever
        # carries posts in another encoding.
        with open(org_path, encoding='utf-8') as org_file:
            post_org = org_file.read()
        posts.append(make_post_item(post_html, post_org, post))
    # Plain text comparison of the pubDate strings, descending.
    posts.sort(key=lambda item: item.find('./pubDate').text, reverse=True)
    return posts


def make_blog_channel():
    """Return the blog ``<channel>`` element, populated with all posts."""
    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'link').text = _DOMAIN + '/blog.html'
    etree.SubElement(channel, 'copyright').text = '2013-2021 Yuchen Pei, licensed under CC BY-SA 4.0'
    # NOTE(review): str(datetime) is not the RFC 822 form RSS 2.0 asks for;
    # kept as-is so existing feed output does not change.
    etree.SubElement(channel, 'lastBuildDate').text = str(datetime.datetime.now())
    for post in make_post_items():
        channel.append(post)
    return channel


def _write_feed(root, path):
    """Serialize *root* and write it to *path* as UTF-8 text."""
    # The serialized feed carries no XML declaration, so readers assume
    # UTF-8; write it explicitly as UTF-8 instead of the locale encoding.
    with open(path, 'w', encoding='utf-8') as feed_file:
        feed_file.write(etree.tostring(root, encoding='unicode'))


def make_blog_rss():
    """Build the blog feed, write it to ``_BLOG_FEED_PATH``, return its root."""
    root = make_rss_root()
    root.append(make_blog_channel())
    _write_feed(root, _BLOG_FEED_PATH)
    return root


def make_micropost_item(micropost_html):
    """Build the ``<item>`` element for one micropost.

    *micropost_html* is the element holding the micropost; it is modified
    in place (header removed, tag renamed) and becomes the description.

    The header of a micropost has the following format:
    <p>
      <b><a href="#ia-lawsuit">2020-08-02</a></b> - ia lawsuit
      <a id="ia-lawsuit"></a>
    </p>
    """
    micropost = etree.Element('item')
    header = micropost_html.find('./p')
    anchor = header.find('.//a')
    etree.SubElement(micropost, 'link').text = _MICROBLOG_URL + anchor.attrib['href']
    etree.SubElement(micropost, 'pubDate').text = anchor.text
    # The title is the text following </b>; drop the leading " - ".
    etree.SubElement(micropost, 'title').text = header.find('./b').tail[3:].strip()
    micropost_html.remove(header)
    micropost_html.tag = 'description'
    micropost.append(micropost_html)
    return micropost


def make_and_add_micropost_items(channel):
    """Append one ``<item>`` per micropost in the microblog page to *channel*."""
    with open(_MICROBLOG_PATH, 'rb') as microblog_file:
        microblog = html.fromstring(microblog_file.read())
    for micropost in microblog.findall('.//div[@id="content"]/ul/li'):
        channel.append(make_micropost_item(micropost))


def make_microblog_channel():
    """Return the microblog ``<channel>`` element with all microposts."""
    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'link').text = _MICROBLOG_URL
    # NOTE(review): same non-RFC-822 date form as the blog channel.
    etree.SubElement(channel, 'lastBuildDate').text = str(datetime.datetime.now())
    make_and_add_micropost_items(channel)
    return channel


def make_microblog_rss():
    """Build the microblog feed, write it to ``_MICROBLOG_FEED_PATH``, return its root."""
    root = make_rss_root()
    root.append(make_microblog_channel())
    _write_feed(root, _MICROBLOG_FEED_PATH)
    return root


def main():
    """Generate both feeds and report where they were written."""
    make_blog_rss()
    print(f"Published blog rss to {_BLOG_FEED_PATH}.")
    make_microblog_rss()
    print(f"Published microblog rss to {_MICROBLOG_FEED_PATH}.")


if __name__ == '__main__':
    main()