tools/export_wp.py (5607B) - raw
1 #!/usr/bin/env python
2
3 import os, time, re, urllib, uuid, codecs
4 import MySQLdb
5
def html2md(s):
    """Return `s` with every literal `<p>` and `</p>` tag removed.

    Only these two exact, lowercase tag strings are stripped; all other
    markup is passed through untouched.
    """
    # XXX: only paragraph tags are handled so far
    for paragraph_tag in ('<p>', '</p>'):
        s = s.replace(paragraph_tag, '')
    return s
11
def _fetch_terms(cn, post_id, taxonomy):
    """Return the names of the `taxonomy` terms attached to post `post_id`."""
    cur = cn.cursor()
    try:
        cur.execute('SELECT wp_terms.name FROM wp_term_relationships '
                    'INNER JOIN wp_term_taxonomy ON '
                    'wp_term_relationships.term_taxonomy_id = '
                    'wp_term_taxonomy.term_taxonomy_id INNER JOIN wp_terms '
                    'ON wp_term_taxonomy.term_id = wp_terms.term_id '
                    'WHERE taxonomy = %s AND object_id = %s',
                    (taxonomy, post_id))
        return [name for name, in cur.fetchall()]
    finally:
        cur.close()


def _set_mtime(path, when):
    """Set both the access and modification time of `path` to `when`."""
    # time.mktime() interprets the naive datetime as local time.
    stamp = time.mktime(when.timetuple())
    os.utime(path, (stamp, stamp))


def _write_comment(comments_dir, row):
    """Write one comment row as a header/body file inside `comments_dir`."""
    author, email, url, ip_addr, date, user_agent, content = row
    # The file name is a random UUID with the dashes removed.
    comment_id = str(uuid.uuid4()).replace('-', '')
    filename = os.path.join(comments_dir, comment_id)
    # Encode as UTF-8 exactly like content.txt; a plain open() would choke
    # on non-ASCII unicode values under Python 2.
    with codecs.open(filename, 'w', 'utf8') as f:
        if author:
            f.write('From: %s\n' % author)
        f.write('Date: %s\n' % date.strftime('%Y-%m-%d %H:%M:%S'))
        if email:
            f.write('Author-Email: %s\n' % email)
        if url:
            f.write('Author-URL: %s\n' % url)
        if user_agent:
            f.write('User-Agent: %s\n' % user_agent)
        if ip_addr:
            f.write('Received: from %s\n' % ip_addr)
        f.write('\n')
        f.write(html2md(content))  # Wordpress HTMLifies comments >_<
    _set_mtime(filename, date)


def _export_post(cn, base_dir, row, convert_categories):
    """Write one post (content.txt plus its approved comments) under `base_dir`."""
    (post_id, post_name, post_title, post_date, post_modified,
     guid, post_content) = row

    # Wordpress stores these URL-encoded
    post_name = urllib.unquote(str(post_name)).decode('utf8')
    guid = urllib.unquote(str(guid)).decode('utf8')

    categories = _fetch_terms(cn, post_id, 'category')
    tags = _fetch_terms(cn, post_id, 'post_tag')

    # XXX: posts in the 'Reading' category are skipped (hard-coded)
    if 'Reading' in categories:
        return

    if convert_categories:
        tags = [category.lower() for category in categories] + tags

    # os.mkdir raises OSError if the directory already exists, so a name
    # collision stops the export instead of overwriting an earlier entry.
    post_dir = os.path.join(base_dir, post_name)
    os.mkdir(post_dir)

    content_path = os.path.join(post_dir, 'content.txt')
    with codecs.open(content_path, 'w', 'utf8') as f:
        f.write('Title: %s\n' % post_title)
        f.write('Publication-Date: %s\n' %
                post_date.strftime('%Y-%m-%d %H:%M:%S'))
        f.write('GUID: %s\n' % guid)
        if not convert_categories:
            f.write('Categories: %s\n' % ', '.join(categories))
        f.write('Tags: %s\n' % ', '.join(tags))
        f.write('\n')
        f.write(post_content)
    _set_mtime(content_path, post_modified)

    # comments
    cur = cn.cursor()
    try:
        cur.execute('SELECT comment_author, comment_author_email, '
                    'comment_author_url, comment_author_ip, comment_date, '
                    'comment_agent, comment_content FROM wp_comments WHERE '
                    'comment_post_id = %s AND comment_approved LIKE %s',
                    (post_id, 1))
        comment_rows = cur.fetchall()
    finally:
        cur.close()

    # XXX dir perms
    comments_dir = os.path.join(post_dir, 'comments')
    os.mkdir(comments_dir)
    for comment_row in comment_rows:
        _write_comment(comments_dir, comment_row)


def export(options):
    """Export published Wordpress entries from MySQL into a directory tree.

    For every row of `wp_posts` with post_status 'publish' whose type is
    accepted, a directory named after the (URL-decoded) post_name is
    created under `options.base_dir`.  It contains `content.txt` (header
    lines, a blank line, then the post content) and a `comments`
    directory holding one file per approved comment.  File modification
    times are set to the post's post_modified / the comment's date.

    `options` must provide:
      base_dir            -- output directory; created if missing
      host, username,
      password, db        -- MySQL connection settings
      include_pages       -- if true, export 'page' entries as well as 'post'
      convert_categories  -- if true, lower-cased category names are
                             prepended to the tags and no Categories
                             header is written

    Raises OSError if an entry's directory already exists.
    """
    base_dir = options.base_dir
    if not os.path.exists(base_dir):
        # makedirs also copes with a BASE whose parents do not exist yet.
        os.makedirs(base_dir)

    accepted_post_types = ['post']
    if options.include_pages:
        accepted_post_types.append('page')

    connect_args = dict(host=options.host, user=options.username,
                        db=options.db, use_unicode=True)
    # Leave passwd out instead of handing the driver None.
    # NOTE(review): some MySQLdb versions appear to reject passwd=None --
    # confirm against the installed driver.
    if options.password is not None:
        connect_args['passwd'] = options.password
    cn = MySQLdb.connect(**connect_args)

    try:
        cur = cn.cursor()
        cur.execute('SELECT id, post_name, post_title, post_date, post_modified, '
                    'guid, post_content, post_type FROM wp_posts WHERE post_status = %s',
                    ('publish',))
        for row in cur.fetchall():
            # post_type is the last selected column.
            if row[-1] not in accepted_post_types:
                continue
            _export_post(cn, base_dir, row[:-1], options.convert_categories)
    finally:
        cn.close()
108
if __name__ == '__main__':
    from optparse import OptionParser

    # Command-line options as (flags, keyword arguments), listed in the
    # order they are registered (and therefore shown by --help).
    option_table = [
        (('-H', '--host'),
         dict(help='connect to MySQL server HOST [default: %default]')),
        (('-u', '--username'),
         dict(help='use USERNAME when connecting [default: %default]')),
        (('-p', '--password'),
         dict(help='use PASSWORD when connecting [default: no password]')),
        (('-d', '--db'),
         dict(help='name of the Wordpress database [default: %default]')),
        (('-b', '--base-dir'),
         dict(metavar='BASE',
              help='create entries as subdirectories of BASE')),
        (('--convert-categories',),
         dict(action='store_true',
              help='convert categories to tags')),
        (('--include-pages',),
         dict(action='store_true',
              help='include pages as well as posts')),
    ]

    cli = OptionParser()
    for flags, settings in option_table:
        cli.add_option(*flags, **settings)
    cli.set_defaults(host='localhost', username='root',
                     password=None, db='wordpress', convert_categories=False)

    options, args = cli.parse_args()
    # --base-dir has no default, so it stays None unless given explicitly.
    if options.base_dir is None:
        cli.error('--base-dir must be specified')
    export(options)