rss.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

# Copyright (C) 2021 Yuchen Pei.

# This file is part of site generator for Yuchen's website (abbreviated to sg4y).

# sg4y is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# sg4y is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with sg4y.  If not, see <https://www.gnu.org/licenses/>.

import datetime
import email.utils
import os
import re

from lxml import etree, html

# Base URL of the published site; every absolute link in the feeds is built
# from it.
_DOMAIN = 'https://ypei.me'
# Tag name and attributes of the root element shared by both feeds.
_RSS_TAGNAME = 'rss'
_RSS_ATTRIBS = {'version': '2.0'}

# Blog: directory of rendered HTML posts, directory of the matching org
# sources, URL prefix for post links, and output path of the blog feed.
# All filesystem paths here are relative, so they resolve against the current
# working directory.
_POSTS_HTML_DIR = 'site/posts'
_POSTS_ORG_DIR = 'posts'
_POSTS_BASE_URL = _DOMAIN + '/posts'
_BLOG_FEED_PATH = 'site/blog-feed.xml'

# Microblog: public URL of the single microblog page, path of its rendered
# HTML, and output path of the microblog feed.
_MICROBLOG_URL = _DOMAIN + '/microblog.html'
_MICROBLOG_PATH = 'site/microblog.html'
_MICROBLOG_FEED_PATH = 'site/microblog-feed.xml'


def get_date(org):
    """Return the timestamp text of the ``#+date:`` keyword in an org source.

    The keyword must sit on a line of its own, spelled ``#+date:`` or
    ``#+DATE:``, followed by a single space and the timestamp in angle
    brackets.  The text between the brackets is returned unchanged.

    Args:
        org: Full text of an org file.

    Returns:
        The string found between ``<`` and ``>`` on the first matching line.

    Raises:
        ValueError: If the text contains no such line.
    """
    # Raw string: '\+' in a normal string literal is an invalid escape sequence.
    match = re.search(r'^#\+(?:date|DATE): <(.*)>$', org, re.MULTILINE)
    if match is None:
        raise ValueError('no "#+date: <...>" line found in org source')
    return match.group(1)

def make_rss_root():
    """Create the empty root element shared by both feeds.

    The tag name comes from ``_RSS_TAGNAME`` and the attributes from
    ``_RSS_ATTRIBS``.
    """
    root = etree.Element(_RSS_TAGNAME)
    for name, value in _RSS_ATTRIBS.items():
        root.set(name, value)
    return root

def make_post_item(post_html, post_org, post_filename):
    """Build the ``<item>`` element describing one blog post.

    Args:
        post_html: Rendered HTML of the post, as accepted by
            ``lxml.html.fromstring``.
        post_org: Text of the post's org source; only its date line is used.
        post_filename: File name of the rendered post, appended to
            ``_POSTS_BASE_URL`` to form the item link.

    Returns:
        An ``item`` element whose children are, in order: the page's
        ``<title>``, a CDATA-wrapped ``<description>``, ``<link>`` and
        ``<pubDate>``.
    """
    document = html.fromstring(post_html)
    item = etree.Element('item')

    # Appending moves the <title> node out of the parsed page into the item.
    page_title = document.find('./head/title')
    item.append(page_title)

    # Reuse the content <div> as the description: rename it, drop its
    # attributes, then wrap its children in CDATA.
    body = document.find('.//div[@id="content"]')
    body.tag = 'description'
    body.attrib.clear()
    item.append(wrap_in_cdata(body))

    link = etree.SubElement(item, 'link')
    link.text = '/'.join((_POSTS_BASE_URL, post_filename))

    pub_date = etree.SubElement(item, 'pubDate')
    pub_date.text = get_date(post_org)
    return item

def make_post_items():
    """Build one feed item per rendered blog post, newest first.

    Every ``*.html`` file in ``_POSTS_HTML_DIR`` is paired with the org file
    of the same base name in ``_POSTS_ORG_DIR``; other files are ignored.

    Returns:
        A list of ``item`` elements sorted by the text of their ``pubDate``
        child in descending order.

    Raises:
        OSError: If a post's HTML or org file cannot be read.
    """
    items = []
    # Sort the listing so that posts sharing the same date always come out in
    # the same relative order (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(_POSTS_HTML_DIR)):
        stem, ext = os.path.splitext(filename)
        if ext != '.html':
            continue
        # Context managers close the files promptly instead of leaking them.
        with open(_POSTS_HTML_DIR + '/' + filename, 'rb') as html_file:
            post_html = html_file.read()
        # Decode explicitly rather than with the locale's default encoding.
        # NOTE(review): assumes the org sources are UTF-8 -- confirm.
        with open(_POSTS_ORG_DIR + '/' + stem + '.org',
                  encoding='utf-8') as org_file:
            post_org = org_file.read()
        items.append(make_post_item(post_html, post_org, filename))
    # pubDate holds the raw org timestamp text, compared here as a string.
    items.sort(key=lambda item: item.find('./pubDate').text, reverse=True)
    return items

def make_blog_channel():
    """Build the ``<channel>`` element of the blog feed.

    The channel carries the fixed title, description, link and copyright
    metadata, a ``lastBuildDate`` stamped with the current time, and then one
    ``<item>`` per post as returned by ``make_post_items``.

    Returns:
        The populated ``channel`` element.
    """
    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s blog'
    etree.SubElement(channel, 'link').text = _DOMAIN + '/blog.html'
    etree.SubElement(channel, 'copyright').text = '2013-2021 Yuchen Pei, licensed under CC BY-SA 4.0'
    # RSS 2.0 requires RFC 822 date-times; str(datetime.now()) is neither in
    # that format nor timezone-aware, so format an aware UTC timestamp.
    now = datetime.datetime.now(datetime.timezone.utc)
    etree.SubElement(channel, 'lastBuildDate').text = email.utils.format_datetime(now, usegmt=True)
    for post in make_post_items():
        channel.append(post)
    return channel

def make_blog_rss():
    """Generate the blog feed and write it to ``_BLOG_FEED_PATH``.

    Returns:
        The root element of the generated feed.

    Raises:
        OSError: If the feed file cannot be written.
    """
    root = make_rss_root()
    root.append(make_blog_channel())
    # The serialised text has no XML declaration, so readers will assume
    # UTF-8; write it as UTF-8 regardless of the locale, and close the file.
    with open(_BLOG_FEED_PATH, 'w', encoding='utf-8') as feed_file:
        feed_file.write(etree.tostring(root, encoding='unicode'))
    return root

def wrap_in_cdata(element):
    """Return a new element whose content is *element*'s children as CDATA.

    Each child of *element* is serialised to markup and the concatenation is
    placed, verbatim, inside a CDATA section under a fresh element that has
    the same tag as *element*.  Only the children are serialised: the
    element's own attributes and any text before its first child are not
    carried over.

    Args:
        element: Element whose children become the CDATA payload.

    Returns:
        A newly parsed element named ``element.tag`` containing the CDATA
        section.
    """
    # strip_cdata=False keeps the CDATA section as such in the parsed tree
    # instead of turning it into ordinary (escaped) text.
    parser = etree.XMLParser(strip_cdata=False)
    inner = ''.join(etree.tostring(child, encoding='unicode')
                    for child in element)
    # A literal "]]>" in the payload would end the CDATA section early and
    # break the parse below; split it across two adjacent CDATA sections.
    inner = inner.replace(']]>', ']]]]><![CDATA[>')
    return etree.XML(f"""<{element.tag}><![CDATA[
{inner}
]]>
</{element.tag}>
""", parser)

def make_micropost_item(micropost_html):
    """
    Turn one microblog entry into an RSS item.

    The entry is expected to start with a header paragraph shaped like:
    <p>
    <b><a href="#ia-lawsuit">2020-08-02</a></b> - ia lawsuit
    <a id="ia-lawsuit"></a>
    </p>
    The first link's href gives the item link (appended to the microblog
    URL), its text gives the pubDate, and the text following </b> gives the
    title.  The header is then removed from the entry, and what remains
    becomes the CDATA-wrapped description.  The passed-in element is
    modified in place.
    """
    header = micropost_html.find('./p')
    anchor = header.find('.//a')
    # The text after </b> reads " - <title>"; skip the 3-character separator.
    title_text = header.find('./b').tail[3:].strip()

    item = etree.Element('item')
    fields = (
        ('link', _MICROBLOG_URL + anchor.attrib['href']),
        ('pubDate', anchor.text),
        ('title', title_text),
    )
    for tag, text in fields:
        etree.SubElement(item, tag).text = text

    # Drop the header so it does not show up in the description body.
    micropost_html.remove(header)
    micropost_html.tag = 'description'
    item.append(wrap_in_cdata(micropost_html))
    return item

def make_and_add_micropost_items(channel):
    """Append one feed item per microblog entry to *channel*.

    The rendered page at ``_MICROBLOG_PATH`` is parsed, and every ``<li>``
    directly under a ``<ul>`` in the ``content`` div is converted with
    ``make_micropost_item`` and appended in document order.

    Args:
        channel: Element that receives the items; modified in place.

    Raises:
        OSError: If the microblog page cannot be read.
    """
    # Read through a context manager so the file handle is closed promptly.
    with open(_MICROBLOG_PATH, 'rb') as microblog_file:
        microblog = html.fromstring(microblog_file.read())
    for micropost in microblog.findall('.//div[@id="content"]/ul/li'):
        channel.append(make_micropost_item(micropost))

def make_microblog_channel():
    """Build the ``<channel>`` element of the microblog feed.

    The channel carries the fixed title, description and link metadata, a
    ``lastBuildDate`` stamped with the current time, and then the items added
    by ``make_and_add_micropost_items``.

    Returns:
        The populated ``channel`` element.
    """
    channel = etree.Element('channel')
    etree.SubElement(channel, 'title').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'description').text = 'Yuchen Pei\'s microblog'
    etree.SubElement(channel, 'link').text = _MICROBLOG_URL
    # RSS 2.0 requires RFC 822 date-times; str(datetime.now()) is neither in
    # that format nor timezone-aware, so format an aware UTC timestamp.
    now = datetime.datetime.now(datetime.timezone.utc)
    etree.SubElement(channel, 'lastBuildDate').text = email.utils.format_datetime(now, usegmt=True)
    make_and_add_micropost_items(channel)
    return channel

def make_microblog_rss():
    """Generate the microblog feed and write it to ``_MICROBLOG_FEED_PATH``.

    Returns:
        The root element of the generated feed.

    Raises:
        OSError: If the feed file cannot be written.
    """
    root = make_rss_root()
    root.append(make_microblog_channel())
    # The serialised text has no XML declaration, so readers will assume
    # UTF-8; write it as UTF-8 regardless of the locale, and close the file.
    with open(_MICROBLOG_FEED_PATH, 'w', encoding='utf-8') as feed_file:
        feed_file.write(etree.tostring(root, encoding='unicode'))
    return root

def main():
    """Generate the blog feed, then the microblog feed, reporting each one."""
    steps = (
        (make_blog_rss,
         f"Published blog rss to {_BLOG_FEED_PATH}."),
        (make_microblog_rss,
         f"Published microblog rss to {_MICROBLOG_FEED_PATH}."),
    )
    for build_feed, message in steps:
        build_feed()
        # Report only after the feed has been written successfully.
        print(message)

# Generate the feeds only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()