"""Generate the blog and microblog RSS feeds from the built site.

Reads the rendered post pages and the microblog page, assembles one
``<rss>`` document for each, and writes them to ``_BLOG_FEED_PATH`` and
``_MICROBLOG_FEED_PATH``.
"""
import datetime
import os
import re

from lxml import etree, html

_DOMAIN = 'https://ypei.me'
_RSS_TAGNAME = 'rss'
_RSS_ATTRIBS = {'version': '2.0'}
_POSTS_HTML_DIR = 'site/posts'
_POSTS_ORG_DIR = 'posts'
_POSTS_BASE_URL = _DOMAIN + '/posts'
_BLOG_FEED_PATH = 'site/blog-feed.xml'
_MICROBLOG_URL = _DOMAIN + '/microblog.html'
_MICROBLOG_PATH = 'site/microblog.html'
_MICROBLOG_FEED_PATH = 'site/microblog-feed.xml'

# Matches a ``#+date: <...>`` (or ``#+DATE: <...>``) line of an org file.
# Raw string: ``\+`` must reach the regex engine rather than being treated
# as a (invalid) string escape.
_DATE_RE = re.compile(r'^#\+(date|DATE): <(.*)>$', re.MULTILINE)


def get_date(org):
    """Return the text between ``<`` and ``>`` on the org ``#+date:`` line.

    Args:
        org: Full text of an org file.

    Returns:
        The date text of the first matching line, without the brackets.

    Raises:
        ValueError: If ``org`` has no ``#+date: <...>`` line.
    """
    match = _DATE_RE.search(org)
    if match is None:
        raise ValueError('no "#+date: <...>" line found in org source')
    return match.group(2)


def make_rss_root():
    """Return a new, empty ``<rss version="2.0">`` element."""
    return etree.Element(_RSS_TAGNAME, attrib=_RSS_ATTRIBS)


def _write_feed(root, path):
    """Serialize ``root`` to ``path`` as UTF-8 text."""
    # The serialized document carries no XML declaration, so readers will
    # assume UTF-8; write it explicitly instead of using the locale default.
    with open(path, 'w', encoding='utf-8') as feed_file:
        feed_file.write(etree.tostring(root, encoding='unicode'))


def make_post_item(post_html, post_org, post_filename):
    """Build the RSS ``<item>`` for one blog post.

    Args:
        post_html: Rendered HTML page of the post (as passed to
            ``html.fromstring``).
        post_org: Text of the post's org source; supplies the date.
        post_filename: File name of the HTML page, used to build the link.

    Returns:
        An ``<item>`` element holding the page's ``<title>``, its
        ``div#content`` re-tagged as ``<description>``, a ``<link>`` and a
        ``<pubDate>``.
    """
    post = etree.Element('item')
    tree = html.fromstring(post_html)
    # Appending moves the <title> element out of the parsed page.
    post.append(tree.find('./head/title'))
    content = tree.find('.//div[@id="content"]')
    content.tag = 'description'
    content.attrib.clear()
    post.append(content)
    etree.SubElement(post, 'link').text = _POSTS_BASE_URL + '/' + post_filename
    # NOTE(review): the org date text is emitted verbatim; RSS 2.0 specifies
    # RFC 822 dates for pubDate -- confirm the org date format before
    # converting.
    etree.SubElement(post, 'pubDate').text = get_date(post_org)
    return post


def make_post_items():
    """Return ``<item>`` elements for every post, newest first.

    Every ``.html`` file in ``_POSTS_HTML_DIR`` is paired with the
    same-named ``.org`` file in ``_POSTS_ORG_DIR``. Items are ordered by
    descending ``pubDate`` text (a plain string comparison).
    """
    posts = []
    for post in os.listdir(_POSTS_HTML_DIR):
        post_sans_ext, ext = os.path.splitext(post)
        if ext != '.html':
            continue
        with open(os.path.join(_POSTS_HTML_DIR, post), 'rb') as html_file:
            post_html = html_file.read()
        org_path = os.path.join(_POSTS_ORG_DIR, post_sans_ext + '.org')
        with open(org_path, encoding='utf-8') as org_file:
            post_org = org_file.read()
        posts.append(make_post_item(post_html, post_org, post))
    posts.sort(key=lambda item: item.find('./pubDate').text, reverse=True)
    return posts


def make_blog_channel():
    """Return the blog ``<channel>`` element with all post items appended."""
    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'link').text = _DOMAIN + '/blog.html'
    etree.SubElement(channel, 'copyright').text = (
        '2013-2021 Yuchen Pei, licensed under CC BY-SA 4.0')
    # NOTE(review): str(datetime) is not the RFC 822 form RSS 2.0 specifies;
    # kept as-is so the feed output does not change.
    etree.SubElement(channel, 'lastBuildDate').text = str(
        datetime.datetime.now())
    for post in make_post_items():
        channel.append(post)
    return channel


def make_blog_rss():
    """Build the blog feed, write it to ``_BLOG_FEED_PATH``, return its root."""
    root = make_rss_root()
    root.append(make_blog_channel())
    _write_feed(root, _BLOG_FEED_PATH)
    return root


def make_micropost_item(micropost_html):
    """Build the RSS ``<item>`` for one micropost.

    The header is the first ``<p>`` child of ``micropost_html``. It must
    contain an ``<a>`` whose ``href`` (appended to ``_MICROBLOG_URL``;
    presumably a ``#fragment``) gives the link and whose text gives the
    date, and a ``<b>`` followed by a three-character separator and the
    title. Rendered, the header looks like::

        2020-08-02 - ia lawsuit

    Args:
        micropost_html: Element of one micropost. It is modified in place:
            the header is removed, the element is re-tagged as
            ``<description>`` and moved into the returned item.

    Returns:
        An ``<item>`` element with ``<link>``, ``<pubDate>``, ``<title>``
        and ``<description>`` children.
    """
    micropost = etree.Element('item')
    header = micropost_html.find('./p')
    anchor = header.find('.//a')
    etree.SubElement(micropost, 'link').text = (
        _MICROBLOG_URL + anchor.attrib['href'])
    etree.SubElement(micropost, 'pubDate').text = anchor.text
    # The text after </b> is " - <title>"; drop the 3-character separator.
    etree.SubElement(micropost, 'title').text = (
        header.find('./b').tail[3:].strip())
    micropost_html.remove(header)
    micropost_html.tag = 'description'
    micropost.append(micropost_html)
    return micropost


def make_and_add_micropost_items(channel):
    """Append one ``<item>`` to ``channel`` per micropost in the microblog.

    Microposts are the ``<li>`` elements under ``div#content/ul`` of the
    page at ``_MICROBLOG_PATH``; they are appended in document order.
    """
    with open(_MICROBLOG_PATH, 'rb') as microblog_file:
        microblog = html.fromstring(microblog_file.read())
    for micropost in microblog.findall('.//div[@id="content"]/ul/li'):
        channel.append(make_micropost_item(micropost))


def make_microblog_channel():
    """Return the microblog ``<channel>`` element with all micropost items."""
    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'link').text = _MICROBLOG_URL
    # NOTE(review): str(datetime) is not the RFC 822 form RSS 2.0 specifies;
    # kept as-is so the feed output does not change.
    etree.SubElement(channel, 'lastBuildDate').text = str(
        datetime.datetime.now())
    make_and_add_micropost_items(channel)
    return channel


def make_microblog_rss():
    """Build the microblog feed, write it to ``_MICROBLOG_FEED_PATH``.

    Returns:
        The ``<rss>`` root element that was written.
    """
    root = make_rss_root()
    root.append(make_microblog_channel())
    _write_feed(root, _MICROBLOG_FEED_PATH)
    return root


def main():
    """Generate both feeds and report where each was written."""
    make_blog_rss()
    print(f"Published blog rss to {_BLOG_FEED_PATH}.")
    make_microblog_rss()
    print(f"Published microblog rss to {_MICROBLOG_FEED_PATH}.")


if __name__ == '__main__':
    main()