constance

Scripts for generating (an earlier obsolete version of) my personal web site
git clone https://code.djc.id.au/git/constance/

tools/export_wp.py (5607B) - raw

      1 #!/usr/bin/env python
      2 
      3 import os, time, re, urllib, uuid, codecs
      4 import MySQLdb
      5 
      6 def html2md(s):
      7     s = s.replace('<p>', '')
      8     s = s.replace('</p>', '')
      9     # XXX
     10     return s
     11 
     12 def export(options):
     13     base_dir = options.base_dir
     14     if not os.path.exists(base_dir):
     15         os.mkdir(base_dir)
     16 
     17     accepted_post_types = ['post']
     18     if options.include_pages:
     19         accepted_post_types.append('page')
     20 
     21     cn = MySQLdb.connect(host=options.host, user=options.username, 
     22             passwd=options.password, db=options.db, use_unicode=True)
     23 
     24     cur = cn.cursor()
     25     cur.execute('SELECT id, post_name, post_title, post_date, post_modified, '
     26             'guid, post_content, post_type FROM wp_posts WHERE post_status = %s', 
     27             ('publish',))
     28     for row in cur.fetchall():
     29         id, post_name, post_title, post_date, post_modified, \
     30                 guid, post_content, post_type = row
     31 
     32         if post_type not in accepted_post_types: continue
     33         
     34         # Wordpress stores these URL-encoded
     35         post_name = urllib.unquote(str(post_name)).decode('utf8')
     36         guid = urllib.unquote(str(guid)).decode('utf8')
     37 
     38         subcur = cn.cursor()
     39         subcur.execute('SELECT wp_terms.name FROM wp_term_relationships '
     40                 'INNER JOIN wp_term_taxonomy ON '
     41                 'wp_term_relationships.term_taxonomy_id = '
     42                 'wp_term_taxonomy.term_taxonomy_id INNER JOIN wp_terms '
     43                 'ON wp_term_taxonomy.term_id = wp_terms.term_id '
     44                 'WHERE taxonomy = %s AND object_id = %s', ('category', id,))
     45         categories = [category for category, in subcur.fetchall()]
     46         subcur = cn.cursor()
     47         subcur.execute('SELECT wp_terms.name FROM wp_term_relationships '
     48                 'INNER JOIN wp_term_taxonomy ON '
     49                 'wp_term_relationships.term_taxonomy_id = '
     50                 'wp_term_taxonomy.term_taxonomy_id INNER JOIN wp_terms '
     51                 'ON wp_term_taxonomy.term_id = wp_terms.term_id '
     52                 'WHERE taxonomy = %s AND object_id = %s', ('post_tag', id,))
     53         tags = [tag for tag, in subcur.fetchall()]
     54 
     55         # XXX
     56         if 'Reading' in categories: continue
     57 
     58         if options.convert_categories:
     59             tags = [category.lower() for category in categories] + tags
     60 
     61         os.mkdir(os.path.join(base_dir, post_name))
     62         f = codecs.open(os.path.join(base_dir, post_name, 'content.txt'), 
     63                 'w', 'utf8')
     64         f.write('Title: %s\n' % post_title)
     65         f.write('Publication-Date: %s\n' % 
     66                 post_date.strftime('%Y-%m-%d %H:%M:%S'))
     67         f.write('GUID: %s\n' % guid)
     68         if not options.convert_categories:
     69             f.write('Categories: %s\n' % ', '.join(categories))
     70         f.write('Tags: %s\n' % ', '.join(tags))
     71         f.write('\n')
     72         f.write(post_content)
     73         del f
     74         os.utime(os.path.join(base_dir, post_name, 'content.txt'), 
     75                 (time.mktime(post_modified.timetuple()), 
     76                  time.mktime(post_modified.timetuple())))
     77 
     78         # comments
     79         subcur = cn.cursor()
     80         subcur.execute('SELECT comment_author, comment_author_email, '
     81                 'comment_author_url, comment_author_ip, comment_date, '
     82                 'comment_agent, comment_content FROM wp_comments WHERE '
     83                 'comment_post_id = %s AND comment_approved LIKE %s', (id, 1))
     84         os.mkdir(os.path.join(base_dir, post_name, 'comments'))
     85         # XXX dir perms
     86         for subrow in subcur.fetchall():
     87             author, email, url, ip_addr, date, user_agent, content = subrow
     88             id = str(uuid.uuid4()).replace('-', '')
     89             filename = os.path.join(base_dir, post_name, 'comments', id)
     90             f = open(filename, 'w')
     91             if author:
     92                 f.write('From: %s\n' % author)
     93             f.write('Date: %s\n' % date.strftime('%Y-%m-%d %H:%M:%S'))
     94             if email:
     95                 f.write('Author-Email: %s\n' % email)
     96             if url:
     97                 f.write('Author-URL: %s\n' % url)
     98             if user_agent:
     99                 f.write('User-Agent: %s\n' % user_agent)
    100             if ip_addr:
    101                 f.write('Received: from %s\n' % ip_addr)
    102             f.write('\n')
    103             f.write(html2md(content)) # Wordpress HTMLifies comments >_<
    104             del f
    105             os.utime(filename, 
    106                     (time.mktime(date.timetuple()), 
    107                      time.mktime(date.timetuple())))
    108 
    109 if __name__ == '__main__':
    110     from optparse import OptionParser
    111     parser = OptionParser()
    112     parser.add_option('-H', '--host', 
    113             help='connect to MySQL server HOST [default: %default]')
    114     parser.add_option('-u', '--username', 
    115             help='use USERNAME when connecting [default: %default]')
    116     parser.add_option('-p', '--password', 
    117             help='use PASSWORD when connecting [default: no password]')
    118     parser.add_option('-d', '--db', 
    119             help='name of the Wordpress database [default: %default]')
    120     parser.set_defaults(host='localhost', username='root', 
    121             password=None, db='wordpress', convert_categories=False)
    122     parser.add_option('-b', '--base-dir', metavar='BASE', 
    123             help='create entries as subdirectories of BASE')
    124     parser.add_option('--convert-categories', action='store_true', 
    125             help='convert categories to tags')
    126     parser.add_option('--include-pages', action='store_true', 
    127             help='include pages as well as posts')
    128     options, args = parser.parse_args()
    129     if options.base_dir is None:
    130         parser.error('--base-dir must be specified')
    131     export(options)