lib/markdown2.py
#!/usr/bin/env python
# Copyright (c) 2007-2008 ActiveState Corp.
# License: MIT (http://www.opensource.org/licenses/mit-license.php)

r"""A fast and complete Python implementation of Markdown.

[from http://daringfireball.net/projects/markdown/]
> Markdown is a text-to-HTML filter; it translates an easy-to-read /
> easy-to-write structured text format into HTML. Markdown's text
> format is most similar to that of plain text email, and supports
> features such as headers, *emphasis*, code blocks, blockquotes, and
> links.
>
> Markdown's syntax is designed not as a generic markup language, but
> specifically to serve as a front-end to (X)HTML. You can use span-level
> HTML tags anywhere in a Markdown document, and you can use block level
> HTML tags (like <div> and <table> as well).

Module usage:

    >>> import markdown2
    >>> markdown2.markdown("*boo!*")  # or use `html = markdown_path(PATH)`
    u'<p><em>boo!</em></p>\n'

    >>> markdowner = Markdown()
    >>> markdowner.convert("*boo!*")
    u'<p><em>boo!</em></p>\n'
    >>> markdowner.convert("**boom!**")
    u'<p><strong>boom!</strong></p>\n'

This implementation of Markdown implements the full "core" syntax plus a
number of extras (e.g., code syntax coloring, footnotes) as described on
<http://code.google.com/p/python-markdown2/wiki/Extras>.
"""

cmdln_desc = """A fast and complete Python implementation of Markdown, a
text-to-HTML conversion tool for web writers.
"""

# Dev Notes:
# - There is already a Python markdown processor
#   (http://www.freewisdom.org/projects/python-markdown/).
# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
#   not yet sure if there are implications with this. Compare 'pydoc sre'
#   and 'perldoc perlre'.

__version_info__ = (1, 0, 1, 9)  # first three nums match Markdown.pl
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"

import os
import sys
from pprint import pprint
import re
import logging
try:
    from hashlib import md5
except ImportError:
    from md5 import md5
import optparse
from random import random
import codecs



#---- Python version compat

if sys.version_info[:2] < (2,4):
    from sets import Set as set
    def reversed(sequence):
        for i in sequence[::-1]:
            yield i
    def _unicode_decode(s, encoding, errors='xmlcharrefreplace'):
        return unicode(s, encoding, errors)
else:
    def _unicode_decode(s, encoding, errors='strict'):
        return s.decode(encoding, errors)


#---- globals

DEBUG = False
log = logging.getLogger("markdown")

DEFAULT_TAB_WIDTH = 4

# Table of hash values for escaped characters:
def _escape_hash(s):
    # Lame attempt to avoid possible collision with someone actually
    # using the MD5 hexdigest of one of these chars in their text.
    # Other ideas: random.random(), uuid.uuid()
    #return md5(s).hexdigest()   # Markdown.pl effectively does this.
    return 'md5:'+md5(s).hexdigest()
g_escape_table = dict([(ch, _escape_hash(ch))
                       for ch in '\\`*_{}[]()>#+-.!'])
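
# Illustrative note (not in the original comments): escaped characters
# survive the span/block transforms as opaque hashes. For example, '\*'
# in the source is replaced early with g_escape_table['*'] (the string
# 'md5:' + md5('*').hexdigest()) and is only swapped back to a literal
# '*' at the very end, by _unescape_special_chars().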


#---- exceptions

class MarkdownError(Exception):
    pass



#---- public api

def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    text = codecs.open(path, 'r', encoding).read()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)

def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
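
# Example usage (illustrative; the sample input is hypothetical):
#
#     import markdown2
#     html = markdown2.markdown("*boo!*", extras=["footnotes"])
#
# `extras` may be a list of names or a dict mapping an extra's name to
# its argument (None for extras that take no argument), e.g.
# markdown2.markdown(text, extras={"demote-headers": 2}).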

class Markdown(object):
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    urls = None
    titles = None
    html_blocks = None
    html_spans = None
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)

    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
                 extras=None, link_patterns=None, use_file_vars=False):
        if html4tags:
            self.empty_element_suffix = ">"
        else:
            self.empty_element_suffix = " />"
        self.tab_width = tab_width

        # For compatibility with earlier markdown2.py and with
        # markdown.py's safe_mode being a boolean,
        #   safe_mode == True -> "replace"
        if safe_mode is True:
            self.safe_mode = "replace"
        else:
            self.safe_mode = safe_mode

        if self.extras is None:
            self.extras = {}
        elif not isinstance(self.extras, dict):
            self.extras = dict([(e, None) for e in self.extras])
        if extras:
            if not isinstance(extras, dict):
                extras = dict([(e, None) for e in extras])
            self.extras.update(extras)
        assert isinstance(self.extras, dict)
        self._instance_extras = self.extras.copy()
        self.link_patterns = link_patterns
        self.use_file_vars = use_file_vars
        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)

    def reset(self):
        self.urls = {}
        self.titles = {}
        self.html_blocks = {}
        self.html_spans = {}
        self.list_level = 0
        self.extras = self._instance_extras.copy()
        if "footnotes" in self.extras:
            self.footnotes = {}
            self.footnote_ids = []

    def convert(self, text):
        """Convert the given text."""
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        text = self._unescape_special_chars(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        text += "\n"
        return text

    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #     PREFIX Local Variables: SUFFIX
    #     PREFIX mode: Tcl SUFFIX
    #     PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

    def _get_emacs_vars(self, text):
        """Return a dictionary of emacs-style local variables.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
        """
        emacs_vars = {}
        SIZE = pow(2, 13)  # 8kB

        # Search near the start for a '-*-'-style one-liner of variables.
        head = text[:SIZE]
        if "-*-" in head:
            match = self._emacs_oneliner_vars_pat.search(head)
            if match:
                emacs_vars_str = match.group(1)
                assert '\n' not in emacs_vars_str
                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
                                  if s.strip()]
                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
                    # While not in the spec, this form is allowed by emacs:
                    #   -*- Tcl -*-
                    # where the implied "variable" is "mode". This form
                    # is only allowed if there are no other variables.
                    emacs_vars["mode"] = emacs_var_strs[0].strip()
                else:
                    for emacs_var_str in emacs_var_strs:
                        try:
                            variable, value = emacs_var_str.strip().split(':', 1)
                        except ValueError:
                            log.debug("emacs variables error: malformed -*- "
                                      "line: %r", emacs_var_str)
                            continue
                        # Lowercase the variable name because Emacs allows "Mode"
                        # or "mode" or "MoDe", etc.
                        emacs_vars[variable.lower()] = value.strip()

        tail = text[-SIZE:]
        if "Local Variables" in tail:
            match = self._emacs_local_vars_pat.search(tail)
            if match:
                prefix = match.group("prefix")
                suffix = match.group("suffix")
                lines = match.group("content").splitlines(0)
                #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
                #      % (prefix, suffix, match.group("content"), lines)

                # Validate the Local Variables block: proper prefix and suffix
                # usage.
                for i, line in enumerate(lines):
                    if not line.startswith(prefix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper prefix '%s'"
                                  % (line, prefix))
                        return {}
                    # Don't validate suffix on last line. Emacs doesn't care,
                    # neither should we.
                    if i != len(lines)-1 and not line.endswith(suffix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper suffix '%s'"
                                  % (line, suffix))
                        return {}

                # Parse out one emacs var per line.
                continued_for = None
                for line in lines[:-1]:  # no var on the last line ("PREFIX End:")
                    if prefix: line = line[len(prefix):]  # strip prefix
                    if suffix: line = line[:-len(suffix)]  # strip suffix
                    line = line.strip()
                    if continued_for:
                        variable = continued_for
                        if line.endswith('\\'):
                            line = line[:-1].rstrip()
                        else:
                            continued_for = None
                        emacs_vars[variable] += ' ' + line
                    else:
                        try:
                            variable, value = line.split(':', 1)
                        except ValueError:
                            log.debug("local variables error: missing colon "
                                      "in local variables entry: '%s'" % line)
                            continue
                        # Do NOT lowercase the variable name, because Emacs only
                        # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
                        value = value.strip()
                        if value.endswith('\\'):
                            value = value[:-1].rstrip()
                            continued_for = variable
                        else:
                            continued_for = None
                        emacs_vars[variable] = value

        # Unquote values.
        for var, val in emacs_vars.items():
            if len(val) > 1 and val.startswith('"') and val.endswith('"'):
                emacs_vars[var] = val[1:-1]

        return emacs_vars
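
    # Example (illustrative): with use_file_vars=True, a Markdown source
    # file may turn extras on for itself via an Emacs-style one-liner,
    #
    #     <!-- -*- markdown-extras: footnotes,code-color -*- -->
    #
    # or via a "Local Variables:" block near the end of the file;
    # convert() reads the "markdown-extras" variable from either form.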

    # Cribbed from a post by Bart Lateur:
    # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
    _detab_re = re.compile(r'(.*?)\t', re.M)
    def _detab_sub(self, match):
        g1 = match.group(1)
        return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
    def _detab(self, text):
        r"""Remove (leading?) tabs from a file.

            >>> m = Markdown()
            >>> m._detab("\tfoo")
            '    foo'
            >>> m._detab("  \tfoo")
            '    foo'
            >>> m._detab("\t  foo")
            '      foo'
            >>> m._detab("  foo")
            '  foo'
            >>> m._detab("  foo\n\tbar\tblam")
            '  foo\n    bar blam'
        """
        if '\t' not in text:
            return text
        return self._detab_re.subn(self._detab_sub, text)[0]

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    # Save for usage in coming 'xml' extra.
    XXX_liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            (?:
                <(%s|\w+:\w+)   # start tag = \2
                \b              # word break
                (?:.*\n)*?      # any number of lines, minimally matching
                .*</\2>         # the matching end tag
            |
                <(\w+:)?\w+     # single tag-start
                \b              # word break
                .*?             # any content on one line, minimally matching
                />              # end of tag
            |
                <\?\w+          # start of processing instruction
                \b              # word break
                .*?             # any content on one line, minimally matching
                \?>             # the PI end tag
            )
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    def _hash_html_block_sub(self, match, raw=False):
        html = match.group(1)
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        key = _hash_text(html)
        self.html_blocks[key] = html
        return "\n\n" + key + "\n\n"

    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError, ex:
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError, ex:
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be followed by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        return text

    def _strip_link_definitions(self, text):
        # Strips link definitions from text, stores the URLs and titles in
        # hash references.
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
              [ \t]*
              \n?               # maybe *one* newline
              [ \t]*
            <?(.+?)>?           # url = \2
              [ \t]*
            (?:
                \n?             # maybe one newline
                [ \t]*
                (?<=\s)         # lookbehind for whitespace
                ['"(]
                ([^\n]*)        # title = \3
                ['")]
                [ \t]*
            )?  # title is optional
            (?:\n+|\Z)
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)

    def _extract_link_def_sub(self, match):
        id, url, title = match.groups()
        key = id.lower()    # Link IDs are case-insensitive
        self.urls[key] = self._encode_amps_and_angles(url)
        if title:
            self.titles[key] = title.replace('"', '&quot;')
        return ""
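
    # Example (illustrative): after _strip_link_definitions() runs over
    #
    #     [google]: http://www.google.com/ "Google"
    #
    # the definition line is removed from the text and stored as
    # self.urls["google"] = "http://www.google.com/" and
    # self.titles["google"] = "Google", for later use by reference-style
    # links such as [a search engine][google].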

    def _extract_footnote_def_sub(self, match):
        id, text = match.groups()
        text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
        normed_id = re.sub(r'\W', '-', id)
        # Ensure footnote text ends with a couple newlines (for some
        # block gamut matches).
        self.footnotes[normed_id] = text + "\n\n"
        return ""

    def _strip_footnote_definitions(self, text):
        """A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
                Text of the note.
        """
        less_than_tab = self.tab_width - 1
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
            [ \t]*
            (                       # footnote text = \2
              # First line need not start with the spaces.
              (?:\s*.*\n+)
              (?:
                (?:[ ]{%d} | \t)    # Subsequent lines must be indented.
                .*\n+
              )*
            )
            # Lookahead for non-space at line-start, or end of doc.
            (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
            re.X | re.M)
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)


    _hr_res = [
        re.compile(r"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", re.M),
        re.compile(r"^[ ]{0,2}([ ]?\-[ ]?){3,}[ \t]*$", re.M),
        re.compile(r"^[ ]{0,2}([ ]?\_[ ]?){3,}[ \t]*$", re.M),
    ]

    def _run_block_gamut(self, text):
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.

        text = self._do_headers(text)

        # Do Horizontal Rules:
        hr = "\n<hr"+self.empty_element_suffix+"\n"
        for hr_re in self._hr_res:
            text = hr_re.sub(hr, text)

        text = self._do_lists(text)

        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text

    def _pyshell_block_sub(self, match):
        lines = match.group(0).splitlines(0)
        _dedentlines(lines)
        indent = ' ' * self.tab_width
        s = ('\n'  # separate from possible cuddled paragraph
             + indent + ('\n'+indent).join(lines)
             + '\n\n')
        return s

    def _prepare_pyshell_blocks(self, text):
        """Ensure that Python interactive shell sessions are put in
        code blocks -- even if not properly indented.
        """
        if ">>>" not in text:
            return text

        less_than_tab = self.tab_width - 1
        _pyshell_block_re = re.compile(r"""
            ^([ ]{0,%d})>>>[ ].*\n  # first line
            ^(\1.*\S+.*\n)*         # any number of subsequent lines
            ^\n                     # ends with a blank line
            """ % less_than_tab, re.M | re.X)

        return _pyshell_block_re.sub(self._pyshell_block_sub, text)
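
    # Example (illustrative): with the "pyshell" extra enabled, an
    # *unindented* interactive session in the source, e.g.
    #
    #     >>> 1 + 1
    #     2
    #
    # (followed by a blank line) is re-indented by tab_width so that the
    # normal code-block processing renders it inside <pre><code>.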

    def _run_span_gamut(self, text):
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.

        text = self._do_code_spans(text)

        text = self._escape_special_chars(text)

        # Process anchor and image tags.
        text = self._do_links(text)

        # Make links out of things like `<http://example.com/>`
        # Must come after _do_links(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        text = self._do_auto_links(text)

        if "link-patterns" in self.extras:
            text = self._do_link_patterns(text)

        text = self._encode_amps_and_angles(text)

        text = self._do_italics_and_bold(text)

        # Do hard breaks:
        text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)

        return text

    # "Sorta" because auto-links are identified as "tag" tokens.
    _sorta_html_tokenize_re = re.compile(r"""
        (
            # tag
            </?
            (?:\w+)                                     # tag name
            (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
            \s*/?>
            |
            # auto-link (e.g., <http://www.activestate.com/>)
            <\w+[^>]*>
            |
            <!--.*?-->      # comment
            |
            <\?.*?\?>       # processing instruction
        )
        """, re.X)

    def _escape_special_chars(self, text):
        # Python markdown note: the HTML tokenization here differs from
        # that in Markdown.pl, hence the behaviour for subtle cases can
        # differ (I believe the tokenizer here does a better job because
        # it isn't susceptible to unmatched '<' and '>' in HTML tags).
        # Note, however, that '>' is not allowed in an auto-link URL
        # here.
        escaped = []
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup:
                # Within tags/HTML-comments/auto-links, encode * and _
                # so they don't conflict with their use in Markdown for
                # italics and strong. We're replacing each such
                # character with its corresponding MD5 checksum value;
                # this is likely overkill, but it should prevent us from
                # colliding with the escape values by accident.
                escaped.append(token.replace('*', g_escape_table['*'])
                                    .replace('_', g_escape_table['_']))
            else:
                escaped.append(self._encode_backslash_escapes(token))
            is_html_markup = not is_html_markup
        return ''.join(escaped)

    def _hash_html_spans(self, text):
        # Used for safe_mode.

        def _is_auto_link(s):
            if ':' in s and self._auto_link_re.match(s):
                return True
            elif '@' in s and self._auto_email_link_re.match(s):
                return True
            return False

        tokens = []
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup and not _is_auto_link(token):
                sanitized = self._sanitize_html(token)
                key = _hash_text(sanitized)
                self.html_spans[key] = sanitized
                tokens.append(key)
            else:
                tokens.append(token)
            is_html_markup = not is_html_markup
        return ''.join(tokens)

    def _unhash_html_spans(self, text):
        for key, sanitized in self.html_spans.items():
            text = text.replace(key, sanitized)
        return text

    def _sanitize_html(self, s):
        if self.safe_mode == "replace":
            return self.html_removed_text
        elif self.safe_mode == "escape":
            replacements = [
                ('&', '&amp;'),
                ('<', '&lt;'),
                ('>', '&gt;'),
            ]
            for before, after in replacements:
                s = s.replace(before, after)
            return s
        else:
            raise MarkdownError("invalid value for 'safe_mode': %r (must be "
                                "'escape' or 'replace')" % self.safe_mode)
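
    # Example (illustrative):
    #
    #     markdown("<script>alert('hi')</script>", safe_mode="escape")
    #
    # emits the tag with &lt;/&gt;/&amp; escaping, while
    # safe_mode="replace" substitutes "[HTML_REMOVED]" for the raw HTML.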

    _tail_of_inline_link_re = re.compile(r'''
        # Match tail of: [text](/url/) or [text](/url/ "title")
        \(            # literal paren
          [ \t]*
          (?P<url>            # \1
            <.*?>
            |
            .*?
          )
          [ \t]*
          (                   # \2
            (['"])            # quote char = \3
            (?P<title>.*?)
            \3                # matching quote
          )?                  # title is optional
        \)
        ''', re.X | re.S)
    _tail_of_reference_link_re = re.compile(r'''
        # Match tail of: [text][id]
          [ ]?          # one optional space
          (?:\n[ ]*)?   # one optional newline followed by spaces
        \[
          (?P<id>.*?)
        \]
        ''', re.X | re.S)

    def _do_links(self, text):
        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.

        This is a combination of Markdown.pl's _DoAnchors() and
        _DoImages(). They are done together because that simplified the
        approach. It was necessary to use a different approach than
        Markdown.pl because of the lack of atomic matching support in
        Python's regex engine used in $g_nested_brackets.
        """
        MAX_LINK_TEXT_SENTINEL = 300

        # `anchor_allowed_pos` is used to support img links inside
        # anchors, but not anchors inside anchors. An anchor's start
        # pos must be `>= anchor_allowed_pos`.
        anchor_allowed_pos = 0

        curr_pos = 0
        while True:  # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - a reference anchor: [text][id]
            # - an inline img:      ![text](url "title")
            # - a reference img:    ![text][id]
            # - a footnote ref:     [^id]
            #   (Only if 'footnotes' extra enabled)
            # - a footnote defn:    [^id]: ...
            #   (Only if 'footnotes' extra enabled) These have already
            #   been stripped in _strip_footnote_definitions() so no
            #   need to watch for them.
            # - a link definition:  [id]: url "title"
            #   These have already been stripped in
            #   _strip_link_definitions() so no need to watch for them.
            # - not markup:         [...anything else...
            try:
                start_idx = text.index('[', curr_pos)
            except ValueError:
                break
            text_length = len(text)

            # Find the matching closing ']'.
            # Markdown.pl allows *matching* brackets in link text so we
            # will here too. Markdown.pl *doesn't* currently allow
            # matching brackets in img alt text -- we'll differ in that
            # regard.
            bracket_depth = 0
            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
                                            text_length)):
                ch = text[p]
                if ch == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif ch == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length.
                # This isn't markup.
                curr_pos = start_idx + 1
                continue
            link_text = text[start_idx+1:p]

            # Possibly a footnote ref?
            if "footnotes" in self.extras and link_text.startswith("^"):
                normed_id = re.sub(r'\W', '-', link_text[1:])
                if normed_id in self.footnotes:
                    self.footnote_ids.append(normed_id)
                    result = '<sup class="footnote-ref" id="fnref-%s">' \
                             '<a href="#fn-%s">%s</a></sup>' \
                             % (normed_id, normed_id, len(self.footnote_ids))
                    text = text[:start_idx] + result + text[p+1:]
                else:
                    # This id isn't defined, leave the markup alone.
                    curr_pos = p+1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p == text_length:
                return text

            # Inline anchor or img?
            if text[p] == '(':  # attempt at perf improvement
                match = self._tail_of_inline_link_re.match(text, p)
                if match:
                    # Handle an inline anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1

                    url, title = match.group("url"), match.group("title")
                    if url and url[0] == '<':
                        url = url[1:-1]  # '<url>' -> 'url'
                    # We've got to encode these to avoid conflicting
                    # with italics/bold.
                    url = url.replace('*', g_escape_table['*']) \
                             .replace('_', g_escape_table['_'])
                    if title:
                        title_str = ' title="%s"' \
                            % title.replace('*', g_escape_table['*']) \
                                   .replace('_', g_escape_table['_']) \
                                   .replace('"', '&quot;')
                    else:
                        title_str = ''
                    if is_img:
                        result = '<img src="%s" alt="%s"%s%s' \
                            % (url, link_text.replace('"', '&quot;'),
                               title_str, self.empty_element_suffix)
                        curr_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[match.end():]
                    elif start_idx >= anchor_allowed_pos:
                        result_head = '<a href="%s"%s>' % (url, title_str)
                        result = '%s%s</a>' % (result_head, link_text)
                        # <img> allowed from curr_pos on, <a> from
                        # anchor_allowed_pos on.
                        curr_pos = start_idx + len(result_head)
                        anchor_allowed_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[match.end():]
                    else:
                        # Anchor not allowed here.
                        curr_pos = start_idx + 1
                    continue

            # Reference anchor or img?
            else:
                match = self._tail_of_reference_link_re.match(text, p)
                if match:
                    # Handle a reference-style anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1
                    link_id = match.group("id").lower()
                    if not link_id:
                        link_id = link_text.lower()  # for links like [this][]
                    if link_id in self.urls:
                        url = self.urls[link_id]
                        # We've got to encode these to avoid conflicting
                        # with italics/bold.
                        url = url.replace('*', g_escape_table['*']) \
                                 .replace('_', g_escape_table['_'])
                        title = self.titles.get(link_id)
                        if title:
                            title = title.replace('*', g_escape_table['*']) \
                                         .replace('_', g_escape_table['_'])
                            title_str = ' title="%s"' % title
                        else:
                            title_str = ''
                        if is_img:
                            result = '<img src="%s" alt="%s"%s%s' \
                                % (url, link_text.replace('"', '&quot;'),
                                   title_str, self.empty_element_suffix)
                            curr_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        elif start_idx >= anchor_allowed_pos:
                            result_head = '<a href="%s"%s>' % (url, title_str)
                            result = '%s%s</a>' % (result_head, link_text)
                            # <img> allowed from curr_pos on, <a> from
                            # anchor_allowed_pos on.
                            curr_pos = start_idx + len(result_head)
                            anchor_allowed_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        else:
                            # Anchor not allowed here.
                            curr_pos = start_idx + 1
                    else:
                        # This id isn't defined, leave the markup alone.
                        curr_pos = match.end()
                    continue

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1

        return text


    _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
    def _setext_h_sub(self, match):
        n = {"=": 1, "-": 2}[match.group(2)[0]]
        demote_headers = self.extras.get("demote-headers")
        if demote_headers:
            n = min(n + demote_headers, 6)
        return "<h%d>%s</h%d>\n\n" \
               % (n, self._run_span_gamut(match.group(1)), n)

    _atx_h_re = re.compile(r'''
        ^(\#{1,6})  # \1 = string of #'s
        [ \t]*
        (.+?)       # \2 = Header text
        [ \t]*
        (?<!\\)     # ensure not an escaped trailing '#'
        \#*         # optional closing #'s (not counted)
        \n+
        ''', re.X | re.M)
    def _atx_h_sub(self, match):
        n = len(match.group(1))
        demote_headers = self.extras.get("demote-headers")
        if demote_headers:
            n = min(n + demote_headers, 6)
        return "<h%d>%s</h%d>\n\n" \
               % (n, self._run_span_gamut(match.group(2)), n)

    def _do_headers(self, text):
        # Setext-style headers:
        #     Header 1
        #     ========
        #
        #     Header 2
        #     --------
        text = self._setext_h_re.sub(self._setext_h_sub, text)

        # atx-style headers:
        #   # Header 1
        #   ## Header 2
        #   ## Header 2 with closing hashes ##
        #   ...
        #   ###### Header 6
        text = self._atx_h_re.sub(self._atx_h_sub, text)

        return text
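
    # Example (illustrative): with extras={"demote-headers": 2}, the
    # source line "# Title" produces <h3>Title</h3> rather than
    # <h1>Title</h1>; demotion is capped at <h6>.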

    _marker_ul_chars = '*+-'
    _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
    _marker_ul = '(?:[%s])' % _marker_ul_chars
    _marker_ol = r'(?:\d+\.)'

    def _list_sub(self, match):
        lst = match.group(1)
        lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
        result = self._process_list_items(lst)
        if self.list_level:
            return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
        else:
            return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)

    def _do_lists(self, text):
        # Form HTML ordered (numbered) and unordered (bulleted) lists.

        for marker_pat in (self._marker_ul, self._marker_ol):
            # Re-usable pattern to match any entire ul or ol list:
            less_than_tab = self.tab_width - 1
            whole_list = r'''
                (                   # \1 = whole list
                  (                 # \2
                    [ ]{0,%d}
                    (%s)            # \3 = first list item marker
                    [ \t]+
                  )
                  (?:.+?)
                  (                 # \4
                      \Z
                    |
                      \n{2,}
                      (?=\S)
                      (?!           # Negative lookahead for another list item marker
                        [ \t]*
                        %s[ \t]+
                      )
                  )
                )
            ''' % (less_than_tab, marker_pat, marker_pat)

            # We use a different prefix before nested lists than top-level lists.
            # See extended comment in _process_list_items().
            #
            # Note: There's a bit of duplication here. My original implementation
            # created a scalar regex pattern as the conditional result of the test on
            # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
            # substitution once, using the scalar as the pattern. This worked,
            # everywhere except when running under MT on my hosting account at Pair
            # Networks. There, this caused all rebuilds to be killed by the reaper (or
            # perhaps they crashed, but that seems incredibly unlikely given that the
            # same script on the same server ran fine *except* under MT). I've spent
            # more time trying to figure out why this is happening than I'd like to
            # admit. My only guess, backed up by the fact that this workaround works,
            # is that Perl optimizes the substitution when it can figure out that the
            # pattern will never change, and when this optimization isn't on, we run
            # afoul of the reaper. Thus, the slightly redundant code that uses two
            # static s/// patterns rather than one conditional pattern.

            if self.list_level:
                sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
                text = sub_list_re.sub(self._list_sub, text)
            else:
                list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
                                     re.X | re.M | re.S)
                text = list_re.sub(self._list_sub, text)

        return text

    _list_item_re = re.compile(r'''
        (\n)?                   # leading line = \1
        (^[ \t]*)               # leading whitespace = \2
        (%s) [ \t]+             # list marker = \3
        ((?:.+?)                # list item text = \4
         (\n{1,2}))             # eols = \5
        (?= \n* (\Z | \2 (%s) [ \t]+))
        ''' % (_marker_any, _marker_any),
        re.M | re.X | re.S)

    _last_li_endswith_two_eols = False
    def _list_item_sub(self, match):
        item = match.group(4)
        leading_line = match.group(1)
        leading_space = match.group(2)
        if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
            item = self._run_block_gamut(self._outdent(item))
        else:
            # Recursion for sub-lists:
            item = self._do_lists(self._outdent(item))
            if item.endswith('\n'):
                item = item[:-1]
            item = self._run_span_gamut(item)
        self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
        return "<li>%s</li>\n" % item

    def _process_list_items(self, list_str):
        # Process the contents of a single ordered or unordered list,
        # splitting it into individual list items.

        # The $g_list_level global keeps track of when we're inside a list.
        # Each time we enter a list, we increment it; when we leave a list,
        # we decrement. If it's zero, we're not in a list anymore.
        #
        # We do this because when we're not inside a list, we want to treat
        # something like this:
        #
        #       I recommend upgrading to version
        #       8. Oops, now this line is treated
        #       as a sub-list.
        #
        # As a single paragraph, despite the fact that the second line starts
        # with a digit-period-space sequence.
        #
        # Whereas when we're inside a list (or sub-list), that line will be
        # treated as the start of a sub-list. What a kludge, huh? This is
        # an aspect of Markdown's syntax that's hard to parse perfectly
        # without resorting to mind-reading. Perhaps the solution is to
        # change the syntax rules such that sub-lists must start with a
        # starting cardinal number; e.g. "1." or "a.".
        self.list_level += 1
        self._last_li_endswith_two_eols = False
        list_str = list_str.rstrip('\n') + '\n'
        list_str = self._list_item_re.sub(self._list_item_sub, list_str)
        self.list_level -= 1
        return list_str

    def _get_pygments_lexer(self, lexer_name):
        try:
            from pygments import lexers, util
        except ImportError:
            return None
        try:
            return lexers.get_lexer_by_name(lexer_name)
        except util.ClassNotFound:
            return None

    def _color_with_pygments(self, codeblock, lexer):
        import pygments
        import pygments.formatters

        class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
            def _wrap_code(self, inner):
                """A function for use in a Pygments Formatter which
                wraps in <code> tags.
1209 """ 1210 yield 0, "<code>" 1211 for tup in inner: 1212 yield tup 1213 yield 0, "</code>" 1214 1215 def wrap(self, source, outfile): 1216 """Return the source with a code, pre, and div.""" 1217 return self._wrap_div(self._wrap_pre(self._wrap_code(source))) 1218 1219 formatter = HtmlCodeFormatter(cssclass="codehilite") 1220 return pygments.highlight(codeblock, lexer, formatter) 1221 1222 def _code_block_sub(self, match): 1223 codeblock = match.group(1) 1224 codeblock = self._outdent(codeblock) 1225 codeblock = self._detab(codeblock) 1226 codeblock = codeblock.lstrip('\n') # trim leading newlines 1227 codeblock = codeblock.rstrip() # trim trailing whitespace 1228 1229 if "code-color" in self.extras and codeblock.startswith(":::"): 1230 lexer_name, rest = codeblock.split('\n', 1) 1231 lexer_name = lexer_name[3:].strip() 1232 lexer = self._get_pygments_lexer(lexer_name) 1233 codeblock = rest.lstrip("\n") # Remove lexer declaration line. 1234 if lexer: 1235 colored = self._color_with_pygments(codeblock, lexer) 1236 return "\n\n%s\n\n" % colored 1237 1238 codeblock = self._encode_code(codeblock) 1239 return "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock 1240 1241 def _do_code_blocks(self, text): 1242 """Process Markdown `<pre><code>` blocks.""" 1243 code_block_re = re.compile(r''' 1244 (?:\n\n|\A) 1245 ( # $1 = the code block -- one or more lines, starting with a space/tab 1246 (?: 1247 (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces 1248 .*\n+ 1249 )+ 1250 ) 1251 ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc 1252 ''' % (self.tab_width, self.tab_width), 1253 re.M | re.X) 1254 1255 return code_block_re.sub(self._code_block_sub, text) 1256 1257 1258 # Rules for a code span: 1259 # - backslash escapes are not interpreted in a code span 1260 # - to include one or or a run of more backticks the delimiters must 1261 # be a longer run of backticks 1262 # - cannot start or end a code span with a backtick; pad with a 1263 # space and that space will be removed in the emitted HTML 1264 # See `test/tm-cases/escapes.text` for a number of edge-case 1265 # examples. 1266 _code_span_re = re.compile(r''' 1267 (?<!\\) 1268 (`+) # \1 = Opening run of ` 1269 (?!`) # See Note A test/tm-cases/escapes.text 1270 (.+?) # \2 = The code block 1271 (?<!`) 1272 \1 # Matching closer 1273 (?!`) 1274 ''', re.X | re.S) 1275 1276 def _code_span_sub(self, match): 1277 c = match.group(2).strip(" \t") 1278 c = self._encode_code(c) 1279 return "<code>%s</code>" % c 1280 1281 def _do_code_spans(self, text): 1282 # * Backtick quotes are used for <code></code> spans. 1283 # 1284 # * You can use multiple backticks as the delimiters if you want to 1285 # include literal backticks in the code span. So, this input: 1286 # 1287 # Just type ``foo `bar` baz`` at the prompt. 1288 # 1289 # Will translate to: 1290 # 1291 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p> 1292 # 1293 # There's no arbitrary limit to the number of backticks you 1294 # can use as delimters. If you need three consecutive backticks 1295 # in your code, use four for delimiters, etc. 1296 # 1297 # * You can use spaces to get literal backticks at the edges: 1298 # 1299 # ... type `` `bar` `` ... 1300 # 1301 # Turns to: 1302 # 1303 # ... type <code>`bar`</code> ... 1304 return self._code_span_re.sub(self._code_span_sub, text) 1305 1306 def _encode_code(self, text): 1307 """Encode/escape certain characters inside Markdown code runs. 

    # Rules for a code span:
    # - backslash escapes are not interpreted in a code span
    # - to include one or a run of more backticks the delimiters must
    #   be a longer run of backticks
    # - cannot start or end a code span with a backtick; pad with a
    #   space and that space will be removed in the emitted HTML
    # See `test/tm-cases/escapes.text` for a number of edge-case
    # examples.
    _code_span_re = re.compile(r'''
            (?<!\\)
            (`+)        # \1 = Opening run of `
            (?!`)       # See Note A test/tm-cases/escapes.text
            (.+?)       # \2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        ''', re.X | re.S)

    def _code_span_sub(self, match):
        c = match.group(2).strip(" \t")
        c = self._encode_code(c)
        return "<code>%s</code>" % c

    def _do_code_spans(self, text):
        #   *   Backtick quotes are used for <code></code> spans.
        #
        #   *   You can use multiple backticks as the delimiters if you want to
        #       include literal backticks in the code span. So, this input:
        #
        #         Just type ``foo `bar` baz`` at the prompt.
        #
        #       Will translate to:
        #
        #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
        #
        #       There's no arbitrary limit to the number of backticks you
        #       can use as delimiters. If you need three consecutive backticks
        #       in your code, use four for delimiters, etc.
        #
        #   *   You can use spaces to get literal backticks at the edges:
        #
        #         ... type `` `bar` `` ...
        #
        #       Turns to:
        #
        #         ... type <code>`bar`</code> ...
        return self._code_span_re.sub(self._code_span_sub, text)

    def _encode_code(self, text):
        """Encode/escape certain characters inside Markdown code runs.
        The point is that in code, these characters are literals,
        and lose their special Markdown meanings.
        """
        replacements = [
            # Encode all ampersands; HTML entities are not
            # entities within a Markdown code span.
            ('&', '&amp;'),
            # Do the angle bracket song and dance:
            ('<', '&lt;'),
            ('>', '&gt;'),
            # Now, escape characters that are magic in Markdown:
            ('*', g_escape_table['*']),
            ('_', g_escape_table['_']),
            ('{', g_escape_table['{']),
            ('}', g_escape_table['}']),
            ('[', g_escape_table['[']),
            (']', g_escape_table[']']),
            ('\\', g_escape_table['\\']),
        ]
        for before, after in replacements:
            text = text.replace(before, after)
        return text

    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
    _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
    _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
    def _do_italics_and_bold(self, text):
        # <strong> must go first:
        if "code-friendly" in self.extras:
            text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
            text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
        else:
            text = self._strong_re.sub(r"<strong>\2</strong>", text)
            text = self._em_re.sub(r"<em>\2</em>", text)
        return text


    _block_quote_re = re.compile(r'''
        (                           # Wrap whole match in \1
          (
            ^[ \t]*>[ \t]?          # '>' at the start of a line
              .+\n                  # rest of the first line
            (.+\n)*                 # subsequent consecutive lines
            \n*                     # blanks
          )+
        )
        ''', re.M | re.X)
    _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)

    _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
    def _dedent_two_spaces_sub(self, match):
        return re.sub(r'(?m)^  ', '', match.group(1))

    def _block_quote_sub(self, match):
        bq = match.group(1)
        bq = self._bq_one_level_re.sub('', bq)  # trim one level of quoting
        bq = self._ws_only_line_re.sub('', bq)  # trim whitespace-only lines
        bq = self._run_block_gamut(bq)          # recurse

        bq = re.sub('(?m)^', '  ', bq)
        # These leading spaces screw with <pre> content, so we need to fix that:
        bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)

        return "<blockquote>\n%s\n</blockquote>\n\n" % bq

    def _do_block_quotes(self, text):
        if '>' not in text:
            return text
        return self._block_quote_re.sub(self._block_quote_sub, text)

    def _form_paragraphs(self, text):
        # Strip leading and trailing lines:
        text = text.strip('\n')

        # Wrap <p> tags.
        grafs = re.split(r"\n{2,}", text)
        for i, graf in enumerate(grafs):
            if graf in self.html_blocks:
                # Unhashify HTML blocks
                grafs[i] = self.html_blocks[graf]
            else:
                # Wrap <p> tags.
                graf = self._run_span_gamut(graf)
                grafs[i] = "<p>" + graf.lstrip(" \t") + "</p>"

        return "\n\n".join(grafs)

    def _add_footnotes(self, text):
        if self.footnotes:
            footer = [
                '<div class="footnotes">',
                '<hr' + self.empty_element_suffix,
                '<ol>',
            ]
            for i, id in enumerate(self.footnote_ids):
                if i != 0:
                    footer.append('')
                footer.append('<li id="fn-%s">' % id)
                footer.append(self._run_block_gamut(self.footnotes[id]))
                backlink = ('<a href="#fnref-%s" '
                            'class="footnoteBackLink" '
                            'title="Jump back to footnote %d in the text.">'
                            '&#8617;</a>' % (id, i+1))
                if footer[-1].endswith("</p>"):
                    footer[-1] = footer[-1][:-len("</p>")] \
                        + '&#160;' + backlink + "</p>"
                else:
                    footer.append("\n<p>%s</p>" % backlink)
                footer.append('</li>')
            footer.append('</ol>')
            footer.append('</div>')
            return text + '\n\n' + '\n'.join(footer)
        else:
            return text

    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    #   http://bumppo.net/projects/amputator/
    _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
    _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)

    def _encode_amps_and_angles(self, text):
        # Smart processing for ampersands and angle brackets that need
        # to be encoded.
        text = self._ampersand_re.sub('&amp;', text)

        # Encode naked <'s
        text = self._naked_lt_re.sub('&lt;', text)
        return text

    def _encode_backslash_escapes(self, text):
        for ch, escape in g_escape_table.items():
            text = text.replace("\\"+ch, escape)
        return text

    _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
    def _auto_link_sub(self, match):
        g1 = match.group(1)
        return '<a href="%s">%s</a>' % (g1, g1)

    _auto_email_link_re = re.compile(r"""
          <
           (?:mailto:)?
          (
              [-.\w]+
              \@
              [-\w]+(\.[-\w]+)*\.[a-zA-Z]+
          )
          >
        """, re.I | re.X | re.U)
    def _auto_email_link_sub(self, match):
        return self._encode_email_address(
            self._unescape_special_chars(match.group(1)))

    def _do_auto_links(self, text):
        text = self._auto_link_re.sub(self._auto_link_sub, text)
        text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
        return text

    def _encode_email_address(self, addr):
        #  Input: an email address, e.g. "foo@example.com"
        #
        #  Output: the email address as a mailto link, with each character
        #      of the address encoded as either a decimal or hex entity, in
        #      the hopes of foiling most address harvesting spam bots. E.g.:
        #
        #      <a href="&#109;&#97;&#105;&#108;&#116;&#111;:&#102;&#111;&#111;&#64;&#101;
        #          x&#97;&#109;&#112;&#108;&#101;&#46;&#99;&#111;&#109;">&#102;&#111;&#111;
        #          &#64;&#101;x&#97;&#109;&#112;&#108;&#101;&#46;&#99;&#111;&#109;</a>
        #
        #  Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
        #  mailing list: <http://tinyurl.com/yu7ue>
        chars = [_xml_encode_email_char_at_random(ch)
                 for ch in "mailto:" + addr]
        # Strip the mailto: from the visible part.
        addr = '<a href="%s">%s</a>' \
               % (''.join(chars), ''.join(chars[7:]))
        return addr

    def _do_link_patterns(self, text):
        """Caveat emptor: there isn't much guarding against link
        patterns being formed inside other standard Markdown links, e.g.
        inside a [link def][like this].

        Dev Notes: *Could* consider prefixing regexes with a negative
        lookbehind assertion to attempt to guard against this.
        """
        link_from_hash = {}
        for regex, href in self.link_patterns:
            replacements = []
            for match in regex.finditer(text):
                replacements.append((match.span(), match.expand(href)))
            for (start, end), href in reversed(replacements):
                escaped_href = (
                    href.replace('"', '&quot;')  # b/c of attr quote
                        # To avoid markdown <em> and <strong>:
                        .replace('*', g_escape_table['*'])
                        .replace('_', g_escape_table['_']))
                link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
                hash = md5(link).hexdigest()
                link_from_hash[hash] = link
                text = text[:start] + hash + text[end:]
        for hash, link in link_from_hash.items():
            text = text.replace(hash, link)
        return text
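
    # Example (illustrative; the pattern and URL are hypothetical): with
    # the "link-patterns" extra enabled and
    #
    #     link_patterns = [
    #         (re.compile(r"recipe\s+#(\d+)", re.I),
    #          r"http://example.com/recipes/\1"),
    #     ]
    #
    # the text "Recipe #123" becomes a link to
    # http://example.com/recipes/123. The same (pattern, href) pairs can
    # be given on the command line via --link-patterns-file; see main()
    # below for the one-pair-per-line file format it parses.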
1494 """ 1495 link_from_hash = {} 1496 for regex, href in self.link_patterns: 1497 replacements = [] 1498 for match in regex.finditer(text): 1499 replacements.append((match.span(), match.expand(href))) 1500 for (start, end), href in reversed(replacements): 1501 escaped_href = ( 1502 href.replace('"', '"') # b/c of attr quote 1503 # To avoid markdown <em> and <strong>: 1504 .replace('*', g_escape_table['*']) 1505 .replace('_', g_escape_table['_'])) 1506 link = '<a href="%s">%s</a>' % (escaped_href, text[start:end]) 1507 hash = md5(link).hexdigest() 1508 link_from_hash[hash] = link 1509 text = text[:start] + hash + text[end:] 1510 for hash, link in link_from_hash.items(): 1511 text = text.replace(hash, link) 1512 return text 1513 1514 def _unescape_special_chars(self, text): 1515 # Swap back in all the special characters we've hidden. 1516 for ch, hash in g_escape_table.items(): 1517 text = text.replace(hash, ch) 1518 return text 1519 1520 def _outdent(self, text): 1521 # Remove one level of line-leading tabs or spaces 1522 return self._outdent_re.sub('', text) 1523 1524 1525 class MarkdownWithExtras(Markdown): 1526 """A markdowner class that enables most extras: 1527 1528 - footnotes 1529 - code-color (only has effect if 'pygments' Python module on path) 1530 1531 These are not included: 1532 - pyshell (specific to Python-related documenting) 1533 - code-friendly (because it *disables* part of the syntax) 1534 - link-patterns (because you need to specify some actual 1535 link-patterns anyway) 1536 """ 1537 extras = ["footnotes", "code-color"] 1538 1539 1540 #---- internal support functions 1541 1542 # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 1543 def _curry(*args, **kwargs): 1544 function, args = args[0], args[1:] 1545 def result(*rest, **kwrest): 1546 combined = kwargs.copy() 1547 combined.update(kwrest) 1548 return function(*args + rest, **combined) 1549 return result 1550 1551 # Recipe: regex_from_encoded_pattern (1.0) 1552 def _regex_from_encoded_pattern(s): 1553 """'foo' -> re.compile(re.escape('foo')) 1554 '/foo/' -> re.compile('foo') 1555 '/foo/i' -> re.compile('foo', re.I) 1556 """ 1557 if s.startswith('/') and s.rfind('/') != 0: 1558 # Parse it: /PATTERN/FLAGS 1559 idx = s.rfind('/') 1560 pattern, flags_str = s[1:idx], s[idx+1:] 1561 flag_from_char = { 1562 "i": re.IGNORECASE, 1563 "l": re.LOCALE, 1564 "s": re.DOTALL, 1565 "m": re.MULTILINE, 1566 "u": re.UNICODE, 1567 } 1568 flags = 0 1569 for char in flags_str: 1570 try: 1571 flags |= flag_from_char[char] 1572 except KeyError: 1573 raise ValueError("unsupported regex flag: '%s' in '%s' " 1574 "(must be one of '%s')" 1575 % (char, s, ''.join(flag_from_char.keys()))) 1576 return re.compile(s[1:idx], flags) 1577 else: # not an encoded regex 1578 return re.compile(re.escape(s)) 1579 1580 # Recipe: dedent (0.1.2) 1581 def _dedentlines(lines, tabsize=8, skip_first_line=False): 1582 """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines 1583 1584 "lines" is a list of lines to dedent. 1585 "tabsize" is the tab width to use for indent width calculations. 1586 "skip_first_line" is a boolean indicating if the first line should 1587 be skipped for calculating the indent width and for dedenting. 1588 This is sometimes useful for docstrings and similar. 1589 1590 Same as dedent() except operates on a sequence of lines. Note: the 1591 lines list is modified **in-place**. 
1592 """ 1593 DEBUG = False 1594 if DEBUG: 1595 print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ 1596 % (tabsize, skip_first_line) 1597 indents = [] 1598 margin = None 1599 for i, line in enumerate(lines): 1600 if i == 0 and skip_first_line: continue 1601 indent = 0 1602 for ch in line: 1603 if ch == ' ': 1604 indent += 1 1605 elif ch == '\t': 1606 indent += tabsize - (indent % tabsize) 1607 elif ch in '\r\n': 1608 continue # skip all-whitespace lines 1609 else: 1610 break 1611 else: 1612 continue # skip all-whitespace lines 1613 if DEBUG: print "dedent: indent=%d: %r" % (indent, line) 1614 if margin is None: 1615 margin = indent 1616 else: 1617 margin = min(margin, indent) 1618 if DEBUG: print "dedent: margin=%r" % margin 1619 1620 if margin is not None and margin > 0: 1621 for i, line in enumerate(lines): 1622 if i == 0 and skip_first_line: continue 1623 removed = 0 1624 for j, ch in enumerate(line): 1625 if ch == ' ': 1626 removed += 1 1627 elif ch == '\t': 1628 removed += tabsize - (removed % tabsize) 1629 elif ch in '\r\n': 1630 if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line 1631 lines[i] = lines[i][j:] 1632 break 1633 else: 1634 raise ValueError("unexpected non-whitespace char %r in " 1635 "line %r while removing %d-space margin" 1636 % (ch, line, margin)) 1637 if DEBUG: 1638 print "dedent: %r: %r -> removed %d/%d"\ 1639 % (line, ch, removed, margin) 1640 if removed == margin: 1641 lines[i] = lines[i][j+1:] 1642 break 1643 elif removed > margin: 1644 lines[i] = ' '*(removed-margin) + lines[i][j+1:] 1645 break 1646 else: 1647 if removed: 1648 lines[i] = lines[i][removed:] 1649 return lines 1650 1651 def _dedent(text, tabsize=8, skip_first_line=False): 1652 """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text 1653 1654 "text" is the text to dedent. 1655 "tabsize" is the tab width to use for indent width calculations. 1656 "skip_first_line" is a boolean indicating if the first line should 1657 be skipped for calculating the indent width and for dedenting. 1658 This is sometimes useful for docstrings and similar. 1659 1660 textwrap.dedent(s), but don't expand tabs to spaces 1661 """ 1662 lines = text.splitlines(1) 1663 _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) 1664 return ''.join(lines) 1665 1666 1667 class _memoized(object): 1668 """Decorator that caches a function's return value each time it is called. 1669 If called later with the same arguments, the cached value is returned, and 1670 not re-evaluated. 1671 1672 http://wiki.python.org/moin/PythonDecoratorLibrary 1673 """ 1674 def __init__(self, func): 1675 self.func = func 1676 self.cache = {} 1677 def __call__(self, *args): 1678 try: 1679 return self.cache[args] 1680 except KeyError: 1681 self.cache[args] = value = self.func(*args) 1682 return value 1683 except TypeError: 1684 # uncachable -- for instance, passing a list as an argument. 1685 # Better to not cache than to blow up entirely. 1686 return self.func(*args) 1687 def __repr__(self): 1688 """Return the function's docstring.""" 1689 return self.func.__doc__ 1690 1691 1692 def _hr_tag_re_from_tab_width(tab_width): 1693 return re.compile(r""" 1694 (?: 1695 (?<=\n\n) # Starting after a blank line 1696 | # or 1697 \A\n? # the beginning of the doc 1698 ) 1699 ( # save in \1 1700 [ ]{0,%d} 1701 <(hr) # start tag = \2 1702 \b # word break 1703 ([^<>])*? 
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)


def _xml_encode_email_char_at_random(ch):
    r = random()
    # Roughly 10% raw, 45% hex, 45% dec.
    # '@' *must* be encoded. I [John Gruber] insist.
    if r > 0.9 and ch != "@":
        return ch
    elif r < 0.45:
        # The [1:] is to drop leading '0': 0x63 -> x63
        return '&#%s;' % hex(ord(ch))[1:]
    else:
        return '&#%s;' % ord(ch)

def _hash_text(text):
    return 'md5:'+md5(text.encode("utf-8")).hexdigest()


#---- mainline

class _NoReflowFormatter(optparse.IndentedHelpFormatter):
    """An optparse formatter that does NOT reflow the description."""
    def format_description(self, description):
        return description or ""

def _test():
    import doctest
    doctest.testmod()

def main(argv=sys.argv):
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). Supported values: "
                           "'code-friendly' disables _/__ for emphasis; "
                           "'code-color' adds code-block syntax coloring; "
                           "'link-patterns' adds auto-linking based on patterns; "
                           "'footnotes' adds the footnotes syntax; "
                           "'pyshell' puts unindented Python interactive "
                           "shell sessions in a <code> block.")
    parser.add_option("--use-file-vars", action="store_true",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
See " 1770 "<http://code.google.com/p/python-markdown2/wiki/Extras>.") 1771 parser.add_option("--link-patterns-file", 1772 help="path to a link pattern file") 1773 parser.add_option("--self-test", action="store_true", 1774 help="run internal self-tests (some doctests)") 1775 parser.add_option("--compare", action="store_true", 1776 help="run against Markdown.pl as well (for testing)") 1777 parser.set_defaults(log_level=logging.INFO, compare=False, 1778 encoding="utf-8", safe_mode=None, use_file_vars=False) 1779 opts, paths = parser.parse_args() 1780 log.setLevel(opts.log_level) 1781 1782 if opts.self_test: 1783 return _test() 1784 1785 if opts.extras: 1786 extras = {} 1787 for s in opts.extras: 1788 splitter = re.compile("[,;: ]+") 1789 for e in splitter.split(s): 1790 if '=' in e: 1791 ename, earg = e.split('=', 1) 1792 try: 1793 earg = int(earg) 1794 except ValueError: 1795 pass 1796 else: 1797 ename, earg = e, None 1798 extras[ename] = earg 1799 else: 1800 extras = None 1801 1802 if opts.link_patterns_file: 1803 link_patterns = [] 1804 f = open(opts.link_patterns_file) 1805 try: 1806 for i, line in enumerate(f.readlines()): 1807 if not line.strip(): continue 1808 if line.lstrip().startswith("#"): continue 1809 try: 1810 pat, href = line.rstrip().rsplit(None, 1) 1811 except ValueError: 1812 raise MarkdownError("%s:%d: invalid link pattern line: %r" 1813 % (opts.link_patterns_file, i+1, line)) 1814 link_patterns.append( 1815 (_regex_from_encoded_pattern(pat), href)) 1816 finally: 1817 f.close() 1818 else: 1819 link_patterns = None 1820 1821 from os.path import join, dirname, abspath 1822 markdown_pl = join(dirname(dirname(abspath(__file__))), "test", 1823 "Markdown.pl") 1824 for path in paths: 1825 if opts.compare: 1826 print "==== Markdown.pl ====" 1827 perl_cmd = 'perl %s "%s"' % (markdown_pl, path) 1828 o = os.popen(perl_cmd) 1829 perl_html = o.read() 1830 o.close() 1831 sys.stdout.write(perl_html) 1832 print "==== markdown2.py ====" 1833 html = markdown_path(path, encoding=opts.encoding, 1834 html4tags=opts.html4tags, 1835 safe_mode=opts.safe_mode, 1836 extras=extras, link_patterns=link_patterns, 1837 use_file_vars=opts.use_file_vars) 1838 sys.stdout.write( 1839 html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')) 1840 if opts.compare: 1841 print "==== match? %r ====" % (perl_html == html) 1842 1843 1844 if __name__ == "__main__": 1845 sys.exit( main(sys.argv) ) 1846