constance

Scripts for generating (an earlier obsolete version of) my personal web site
git clone https://code.djc.id.au/git/constance/
commit 3206cf08a5e9467baf28332a56ecd27422149486
parent ae6182286419cc0d1e4b21cc2130c57d2acdaf43
Author: Dan Callaghan <djc@djc.id.au>
Date:   Wed, 17 Sep 2008 22:53:28 +1000

mostly ditched colubrid in favour of a hand-rolled WSGI callable; mostly cleaned up encoding issues

Diffstat:
M TODO | 8 ++++----
M app.py | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
M blog.py | 4 ++--
M config.defaults | 11 +++++++++++
M config.py | 2 +-
5 files changed, 137 insertions(+), 75 deletions(-)
diff --git a/TODO b/TODO
@@ -17,14 +17,14 @@
   - perhaps as interim step could export from Nokia and embed gmaps directly in constance?
 - document on-disk format for the entry types
 - tests!!!!!!!!
-- escape high bytes in URLs (should work properly since the page is UTF-8 anyway, but it seems to confuse at least MSN-Bot (and probably IE) sigh)
+- escape high bytes in URLs (should work properly since pages are served in the same encoding as what we expect URLs to be in, but it seems to confuse at least MSN-Bot (and probably IE) sigh)
   - better means of generating URLs?
-- fix unicode in comments
-  - really, this means cleaning up the whole self.charset crap (move into config) and apply it consistently!
-    - *really*, this means patching or ditching colubrid ergh ...
+- have almost completely ditched colubrid, just need to replace StaticExports for testing then rm it
+- use encoding from config for blog.py instead of hard-coding utf8
 - handle reCAPTCHA errors (including no captcha fields submitted!!!)
 - invalid offsets (displays every entry at max and 500's on invalid such as alpha)
 - config option to add next/prev links to page (as well as the link rels)
 - markdown typography/smartypants
   - or even just, better markdown
 - generate goog sitemaps
+- prettier error pages
diff --git a/app.py b/app.py
@@ -1,37 +1,67 @@
 
 # vim:encoding=utf-8
 
-import os
+import os, cgi, re
 from itertools import chain
 import wsgiref.util
 from genshi.template import TemplateLoader
-from colubrid import RegexApplication, HttpResponse
-from colubrid.exceptions import PageNotFound, AccessDenied, HttpFound
 from colubrid.server import StaticExports
 from recaptcha.client import captcha
 
 import config
 import blog
 
+class HTTPException(Exception):
+    status = '500 Internal Server Error'
+    headers = []
+
+class ForbiddenError(HTTPException):
+    status = '403 Forbidden'
+
+class NotFoundError(HTTPException):
+    status = '404 Not Found'
+
+class HTTPRedirect(HTTPException):
+    def __init__(self, location):
+        assert isinstance(location, str)
+        self.headers = [('Location', location)]
+
+class HTTPFound(HTTPRedirect):
+    status = '302 Found'
+
+class HTTPTemporaryRedirect(HTTPRedirect):
+    status = '307 Temporary Redirect'
+
+class HTTPPermanentRedirect(HTTPRedirect):
+    status = '301 Moved Permanently'
+
 template_loader = TemplateLoader(
         os.path.join(os.path.dirname(__file__), 'templates'), 
         variable_lookup='strict', 
         auto_reload=True)
 
-class Constance(RegexApplication):
+class Constance(object):
+
+    def __init__(self, environ, start_response):
+        self.environ = environ
+        self.start = start_response
+        # as with SCRIPT_NAME, we want APP_URI *not* to include trailing slash
+        self.environ['APP_URI'] = wsgiref.util.application_uri(self.environ).rstrip('/')
+
+        self.config = config.ConstanceConfigParser(self.environ['constance.config_filename'])
 
-    urls = [(r'^$', 'index'), 
-            (r'^\+tags/$', 'tag_cloud'), 
-            (r'^\+tags/(.+)$', 'tag'), 
-            (r'^\+reading/?$', 'reading'), 
-            (r'^([^+/][^/]*)/?$', 'post'), 
-            (r'^([^+/][^/]*)/comments/\+new$', 'add_post_comment')]
-    charset = 'utf-8'
+        self.encoding = self.config.get('global', 'encoding')
+        self.args = dict((k.decode(self.encoding, 'ignore'), 
+                          v.decode(self.encoding, 'ignore')) 
+                         for k, v in 
+                         cgi.parse_qsl(self.environ.get('QUERY_STRING', ''), True))
+        if self.environ['REQUEST_METHOD'] == 'POST':
+            maxlen = int(self.environ['CONTENT_LENGTH'])
+            self.post_data = self.environ['wsgi.input'].read(maxlen)
+            self.form = dict((k.decode(self.encoding, 'ignore'), 
+                              v.decode(self.encoding, 'ignore')) 
+                             for k, v in cgi.parse_qsl(self.post_data, True))
 
-    def __init__(self, *args, **kwargs):
-        super(Constance, self).__init__(*args, **kwargs)
-        self.request.environ['APP_URI'] = wsgiref.util.application_uri(self.request.environ) # Colubrid ought to do this for us
-        self.config = config.ConstanceConfigParser(self.request.environ['constance.config_filename'])
         self.blog_entries = blog.BlogEntrySet(self.config.getunicode('blog', 'dir'))
         readinglog_filename = self.config.getunicode('readinglog', 'filename')
         if readinglog_filename:
@@ -39,30 +69,55 @@ class Constance(RegexApplication):
         else:
             self.readinglog_entries = frozenset()
 
+    def __iter__(self):
+        try:
+            for patt, method_name in self.urls:
+                match = patt.match(self.environ['PATH_INFO'])
+                if match:
+                    response_body, response_headers = getattr(self, method_name)(
+                            *[x.decode(self.encoding, 'ignore') for x in match.groups()])
+                    status = '200 OK'
+                    self.start(status, response_headers)
+                    return iter([response_body])
+            # no matching URI found, so give a 404
+            raise NotFoundError()
+        except HTTPException, e:
+            # XXX make prettier errors
+            self.start(e.status, [('Content-type', 'text/plain')] + e.headers)
+            return iter([e.status])
+
+    urls = [(r'/$', 'index'), 
+            (r'/\+tags/$', 'tag_cloud'), 
+            (r'/\+tags/(.+)$', 'tag'), 
+            (r'/\+reading/?$', 'reading'), 
+            (r'/([^+/][^/]*)/?$', 'post'), 
+            (r'/([^+/][^/]*)/comments/\+new$', 'add_post_comment')]
+    urls = [(re.compile(patt), method) for patt, method in urls]
+
     def index(self):
-        offset = int(self.request.args.get('offset', 0))
+        offset = int(self.args.get('offset', 0))
         sorted_entries = sorted(chain(self.blog_entries, self.readinglog_entries), 
                 key=lambda e: e.publication_date, reverse=True)
-        format = self.request.args.get('format', 'html')
+        format = self.args.get('format', 'html')
         if format == 'html':
             rendered = template_loader.load('multiple.xml').generate(
                     config=self.config, 
-                    environ=self.request.environ, 
+                    environ=self.environ, 
                     title=None, 
                     sorted_entries=sorted_entries, 
                     offset=offset,
-                    ).render('xhtml')
-            return HttpResponse(rendered, [('Content-Type', 'text/html')], 200)
+                    ).render('xhtml', encoding=self.encoding)
+            return (rendered, [('Content-Type', 'text/html')])
         elif format == 'atom':
             rendered = template_loader.load('multiple_atom.xml').generate(
                     config=self.config, 
-                    environ=self.request.environ, 
+                    environ=self.environ, 
                     title=None, 
-                    self_url='%s/' % self.request.environ['APP_URI'], 
+                    self_url='%s/' % self.environ['APP_URI'], 
                     sorted_entries=sorted_entries[:self.config.getint('global', 'entries_in_feed')], 
                     feed_updated=sorted_entries[0].modified_date
-                    ).render('xml')
-            return HttpResponse(rendered, [('Content-Type', 'application/atom+xml')], 200)
+                    ).render('xml', encoding=self.encoding)
+            return (rendered, [('Content-Type', 'application/atom+xml')])
         else:
             raise PageNotFound('Unknown format %r' % format)
     
@@ -73,109 +128,105 @@ class Constance(RegexApplication):
                 tag_freqs[tag] = tag_freqs.get(tag, 0) + 1
         rendered = template_loader.load('tag_cloud.xml').generate(
                 config=self.config, 
-                environ=self.request.environ, 
+                environ=self.environ, 
                 tag_freqs=tag_freqs
-                ).render('xhtml')
-        return HttpResponse(rendered, [('Content-Type', 'text/html')], 200)
+                ).render('xhtml', encoding=self.encoding)
+        return (rendered, [('Content-Type', 'text/html')])
     
     def post(self, id):
-        id = id.decode(self.charset) # shouldn't Colubrid do this?
         try:
             entry = self.blog_entries[id]
         except KeyError:
-            raise PageNotFound()
+            raise NotFoundError()
         rendered = template_loader.load('single.xml').generate(
                 config=self.config, 
-                environ=self.request.environ, 
+                environ=self.environ, 
                 entry=entry
-                ).render('xhtml')
-        return HttpResponse(rendered, [('Content-Type', 'text/html')], 200)
+                ).render('xhtml', encoding=self.encoding)
+        return (rendered, [('Content-Type', 'text/html')])
     
     def add_post_comment(self, id):
-        id = id.decode(self.charset) # shouldn't Colubrid do this?
         entry = self.blog_entries[id]
-        form_data = self.request.form.as_dict()
 
         if self.config.getboolean('blog', 'require_captcha'):
             # first verify the captcha
             captcha_response = captcha.submit(
-                    form_data['recaptcha_challenge_field'], 
-                    form_data['recaptcha_response_field'], 
+                    self.form['recaptcha_challenge_field'], 
+                    self.form['recaptcha_response_field'], 
                     self.config.get('blog', 'recaptcha_privkey'), 
-                    self.request.environ['REMOTE_ADDR'])
+                    self.environ['REMOTE_ADDR'])
             if not captcha_response.is_valid:
                 raise ValueError(captcha_response.error_code) # XXX handle better
 
         try:
             metadata = {}
-            metadata['From'] = form_data['from'] or 'Anonymous'
-            if form_data['author-url']:
-                metadata['Author-URL'] = form_data['author-url']
-            if form_data['author-email']:
-                metadata['Author-Email'] = form_data['author-email']
-            if self.request.environ['HTTP_USER_AGENT']:
-                metadata['User-Agent'] = self.request.environ['HTTP_USER_AGENT']
-            if self.request.environ['REMOTE_ADDR']:
-                metadata['Received'] = 'from %s' % self.request.environ['REMOTE_ADDR']
-            entry.add_comment(metadata, form_data['comment'])
-            raise HttpFound('%s/%s/' % (self.request.environ.get('SCRIPT_NAME', ''), 
-                    id.encode(self.charset)))
+            metadata['From'] = self.form['from'] or u'Anonymous'
+            if self.form['author-url']:
+                metadata['Author-URL'] = self.form['author-url']
+            if self.form['author-email']:
+                metadata['Author-Email'] = self.form['author-email']
+            if self.environ['HTTP_USER_AGENT']:
+                metadata['User-Agent'] = self.environ['HTTP_USER_AGENT']
+            if self.environ['REMOTE_ADDR']:
+                metadata['Received'] = u'from %s' % self.environ['REMOTE_ADDR']
+            entry.add_comment(metadata, self.form['comment'])
+            raise HTTPFound('%s/%s/' % (self.environ.get('APP_URI', ''), 
+                    id.encode(self.encoding)))
         except blog.CommentingForbiddenError:
-            raise AccessDenied()
+            raise ForbiddenError()
 
     def tag(self, tag):
-        tag = tag.decode(self.charset)
         with_tag = [e for e in self.blog_entries if tag in e.tags]
         if not with_tag:
-            raise PageNotFound()
-        offset = int(self.request.args.get('offset', 0))
+            raise NotFoundError()
+        offset = int(self.args.get('offset', 0))
         sorted_entries = sorted(with_tag, key=lambda e: e.publication_date, reverse=True)
-        format = self.request.args.get('format', 'html')
+        format = self.args.get('format', 'html')
         if format == 'html':
             rendered = template_loader.load('multiple.xml').generate(
                     config=self.config, 
-                    environ=self.request.environ, 
+                    environ=self.environ, 
                     title=u'“%s” tag' % tag, 
                     sorted_entries=sorted_entries, 
                     offset=offset
                     ).render('xhtml')
-            return HttpResponse(rendered, [('Content-Type', 'text/html')], 200)
+            return (rendered, [('Content-Type', 'text/html')])
         elif format == 'atom':
             rendered = template_loader.load('multiple_atom.xml').generate(
                     config=self.config, 
-                    environ=self.request.environ, 
+                    environ=self.environ, 
                     title=u'“%s” tag' % tag, 
-                    self_url='%s/+tags/%s' % (self.request.environ['APP_URI'], tag.encode(self.charset)), 
+                    self_url='%s/+tags/%s' % (self.environ['APP_URI'], tag.encode(self.encoding)), 
                     sorted_entries=sorted_entries[:self.config.getint('global', 'entries_in_feed')], 
                     feed_updated=sorted_entries[0].modified_date
                     ).render('xml')
-            return HttpResponse(rendered, [('Content-Type', 'application/atom+xml')], 200)
+            return (rendered, [('Content-Type', 'application/atom+xml')])
         else:
             raise PageNotFound('Unknown format %r' % format)
 
     def reading(self):
         sorted_entries = sorted(self.readinglog_entries, key=lambda e: e.publication_date, reverse=True)
-        format = self.request.args.get('format', 'html')
+        format = self.args.get('format', 'html')
         if format == 'html':
             rendered = template_loader.load('multiple.xml').generate(
                     config=self.config, 
-                    environ=self.request.environ, 
+                    environ=self.environ, 
                     title=u'reading log', 
                     sorted_entries=sorted_entries, 
-                    ).render('xhtml')
-            return HttpResponse(rendered, [('Content-Type', 'text/html')], 200)
+                    ).render('xhtml', encoding=self.encoding)
+            return (rendered, [('Content-Type', 'text/html')])
         elif format == 'atom':
             rendered = template_loader.load('multiple_atom.xml').generate(
                     config=self.config, 
-                    environ=self.request.environ, 
+                    environ=self.environ, 
                     title=u'reading log', 
-                    self_url='%s/+reading/' % self.request.environ['APP_URI'], 
+                    self_url='%s/+reading/' % self.environ['APP_URI'], 
                     sorted_entries=sorted_entries[:self.config.getint('global', 'entries_in_feed')], 
                     feed_updated=sorted_entries[0].modified_date
-                    ).render('xml')
-            return HttpResponse(rendered, [('Content-Type', 'application/atom+xml')], 200)
+                    ).render('xml', encoding=self.encoding)
+            return (rendered, [('Content-Type', 'application/atom+xml')])
         else:
-            raise PageNotFound('Unknown format %r' % format)
+            raise NotFoundError('Unknown format %r' % format)
 
 application = Constance
 
diff --git a/blog.py b/blog.py
@@ -126,9 +126,9 @@ class BlogEntry(object):
         guid = uuid.uuid4().get_hex()
         f = open(os.path.join(self.comments_dir, guid), 'w')
         for k, v in metadata.iteritems():
-            f.write('%s: %s\n' % (k, v))
+            f.write('%s: %s\n' % (k, v.encode('utf8'))) # XXX encoding
         f.write('\n')
-        f.write(content)
+        f.write(content.encode('utf8')) # XXX encoding
 
 
 class BlogEntrySet(DirectoryEntrySet):
diff --git a/config.defaults b/config.defaults
@@ -16,6 +16,17 @@ entries_per_page = 20
 # The maximum number of entries to be included in feeds.
 entries_in_feed = 20
 
+# Character encoding to be used everywhere. That is, for:
+#   * all data read from disk (including this config)
+#   * URL components and query string arguments
+#   * POST data
+#   * rendered templates
+# and anywhere else I have forgotten. Really whenever we are converting between 
+# Unicode data and bytestrings, this is the encoding that is used.
+# It is *highly* recommended that you not change this value from its default of 
+# utf8!
+encoding = utf8
+
 [blog]
 
 # The directory containing blog entries.
diff --git a/config.py b/config.py
@@ -12,4 +12,4 @@ class ConstanceConfigParser(SafeConfigParser):
         self.readfp(open(filename, 'r'))
 
     def getunicode(self, section, option):
-        return self.get(section, option).decode('utf8') # XXX make codec configurable?
+        return self.get(section, option).decode(self.get('global', 'encoding'))