tools/export_wp.py (5607B) - raw
#!/usr/bin/env python
"""Export published Wordpress posts (and optionally pages) to a directory tree.

For every exported entry a directory named after the post slug is created
under the base directory.  It contains:

* ``content.txt`` -- a block of ``Header: value`` lines, a blank line, then
  the raw post content; its mtime is set to the post's modification date.
* ``comments/`` -- one file per approved comment, named with a random
  32-character hex id, in the same header/blank-line/body layout; its mtime
  is set to the comment date.
"""

import os
import time
import re
import urllib
import uuid
import codecs

try:
    import MySQLdb
except ImportError:
    # Keep the module importable (``--help``, html2md) without the driver;
    # export() raises ImportError when it is actually needed.
    MySQLdb = None

try:
    from urllib import unquote as _unquote  # Python 2
except ImportError:
    from urllib.parse import unquote as _unquote  # Python 3

_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

_POSTS_QUERY = (
    'SELECT id, post_name, post_title, post_date, post_modified, '
    'guid, post_content, post_type FROM wp_posts WHERE post_status = %s')

# Names of the terms of one taxonomy ('category' or 'post_tag') attached to
# one post.  Parameters: (taxonomy, post id).
_TERMS_QUERY = (
    'SELECT wp_terms.name FROM wp_term_relationships '
    'INNER JOIN wp_term_taxonomy ON '
    'wp_term_relationships.term_taxonomy_id = '
    'wp_term_taxonomy.term_taxonomy_id INNER JOIN wp_terms '
    'ON wp_term_taxonomy.term_id = wp_terms.term_id '
    'WHERE taxonomy = %s AND object_id = %s')

_COMMENTS_QUERY = (
    'SELECT comment_author, comment_author_email, '
    'comment_author_url, comment_author_ip, comment_date, '
    'comment_agent, comment_content FROM wp_comments WHERE '
    'comment_post_id = %s AND comment_approved LIKE %s')


def html2md(s):
    """Return *s* with every ``<p>`` and ``</p>`` tag removed.

    Only paragraph tags are stripped; any other markup is returned untouched.
    """
    s = s.replace('<p>', '')
    s = s.replace('</p>', '')
    # XXX
    return s


def _unquote_utf8(value):
    """Percent-decode *value* and return the result as unicode text."""
    text = _unquote(str(value))
    # Python 2's unquote() yields a byte string; Python 3's already decodes
    # the escapes as UTF-8 and returns text.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    return text


def _fetch_all(cn, query, params):
    """Run *query* with *params* on connection *cn* and return all rows.

    The cursor is closed even if the query fails.
    """
    cur = cn.cursor()
    try:
        cur.execute(query, params)
        return cur.fetchall()
    finally:
        cur.close()


def _fetch_terms(cn, taxonomy, post_id):
    """Return the term names of *taxonomy* attached to post *post_id*."""
    return [name for name, in _fetch_all(cn, _TERMS_QUERY,
                                         (taxonomy, post_id))]


def _set_mtime(path, when):
    """Set both atime and mtime of *path* to the datetime *when*."""
    stamp = time.mktime(when.timetuple())
    os.utime(path, (stamp, stamp))


def _write_comments(cn, post_id, post_dir):
    """Write every approved comment of *post_id* into ``post_dir/comments``."""
    rows = _fetch_all(cn, _COMMENTS_QUERY, (post_id, 1))
    comments_dir = os.path.join(post_dir, 'comments')
    os.mkdir(comments_dir)
    # XXX dir perms
    for author, email, url, ip_addr, date, user_agent, content in rows:
        comment_id = uuid.uuid4().hex
        filename = os.path.join(comments_dir, comment_id)
        # The connection returns unicode, so the file must be opened with an
        # explicit encoding; a plain open() fails on non-ASCII text under
        # Python 2.
        with codecs.open(filename, 'w', 'utf8') as f:
            if author:
                f.write('From: %s\n' % author)
            f.write('Date: %s\n' % date.strftime(_TIMESTAMP_FORMAT))
            if email:
                f.write('Author-Email: %s\n' % email)
            if url:
                f.write('Author-URL: %s\n' % url)
            if user_agent:
                f.write('User-Agent: %s\n' % user_agent)
            if ip_addr:
                f.write('Received: from %s\n' % ip_addr)
            f.write('\n')
            f.write(html2md(content))  # Wordpress HTMLifies comments >_<
        # The file is closed (and flushed) before its timestamp is set.
        _set_mtime(filename, date)


def export(options):
    """Export the Wordpress database described by *options*.

    *options* must provide the attributes ``base_dir``, ``host``,
    ``username``, ``password``, ``db``, ``include_pages`` and
    ``convert_categories`` (as produced by the option parser in ``main``).

    ``base_dir`` is created if it does not exist.  Only rows with
    ``post_status = 'publish'`` are read; pages are exported only when
    ``include_pages`` is true.  When ``convert_categories`` is true the
    lower-cased category names are prepended to the tag list and no
    ``Categories`` header is written.

    Raises ImportError if the MySQLdb driver is not installed, and OSError
    if an entry directory already exists.
    """
    if MySQLdb is None:
        raise ImportError('the MySQLdb module is required to export')

    base_dir = options.base_dir
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)

    accepted_post_types = ['post']
    if options.include_pages:
        accepted_post_types.append('page')

    cn = MySQLdb.connect(host=options.host, user=options.username,
                         passwd=options.password, db=options.db,
                         use_unicode=True)
    try:
        for row in _fetch_all(cn, _POSTS_QUERY, ('publish',)):
            (post_id, post_name, post_title, post_date, post_modified,
             guid, post_content, post_type) = row

            if post_type not in accepted_post_types:
                continue

            # Wordpress stores these URL-encoded
            post_name = _unquote_utf8(post_name)
            guid = _unquote_utf8(guid)

            categories = _fetch_terms(cn, 'category', post_id)
            tags = _fetch_terms(cn, 'post_tag', post_id)

            # XXX
            if 'Reading' in categories:
                continue

            if options.convert_categories:
                tags = [category.lower() for category in categories] + tags

            post_dir = os.path.join(base_dir, post_name)
            os.mkdir(post_dir)
            content_path = os.path.join(post_dir, 'content.txt')
            with codecs.open(content_path, 'w', 'utf8') as f:
                f.write('Title: %s\n' % post_title)
                f.write('Publication-Date: %s\n' %
                        post_date.strftime(_TIMESTAMP_FORMAT))
                f.write('GUID: %s\n' % guid)
                if not options.convert_categories:
                    f.write('Categories: %s\n' % ', '.join(categories))
                f.write('Tags: %s\n' % ', '.join(tags))
                f.write('\n')
                f.write(post_content)
            # The file is closed (and flushed) before its timestamp is set.
            _set_mtime(content_path, post_modified)

            # comments
            _write_comments(cn, post_id, post_dir)
    finally:
        cn.close()


def main():
    """Parse the command line and run the export."""
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-H', '--host',
                      help='connect to MySQL server HOST [default: %default]')
    parser.add_option('-u', '--username',
                      help='use USERNAME when connecting [default: %default]')
    parser.add_option('-p', '--password',
                      help='use PASSWORD when connecting '
                           '[default: no password]')
    parser.add_option('-d', '--db',
                      help='name of the Wordpress database '
                           '[default: %default]')
    parser.set_defaults(host='localhost', username='root',
                        password=None, db='wordpress',
                        convert_categories=False)
    parser.add_option('-b', '--base-dir', metavar='BASE',
                      help='create entries as subdirectories of BASE')
    parser.add_option('--convert-categories', action='store_true',
                      help='convert categories to tags')
    parser.add_option('--include-pages', action='store_true',
                      help='include pages as well as posts')
    options, args = parser.parse_args()
    if options.base_dir is None:
        parser.error('--base-dir must be specified')
    export(options)


if __name__ == '__main__':
    main()