Viewing file: preprocessors.py (6.97 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
"""
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated.
"""
import re
from collections import deque

import markdown
# Prefix shared by every raw-HTML placeholder: markdown.STX followed by a
# fixed marker string.
# NOTE(review): STX/ETX are presumably delimiter characters that cannot occur
# in ordinary source text -- confirm in the markdown package.
HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
# Template for a single placeholder: the prefix, a "%d" slot (HtmlStash.store
# fills it with the stash counter) and markdown.ETX as terminator.
HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX
class Processor:
    """Common base for processors that may be bound to a Markdown instance."""

    def __init__(self, markdown_instance=None):
        """Remember *markdown_instance* as ``self.markdown`` when one is given.

        When the argument is omitted (or is otherwise falsy) the ``markdown``
        attribute is not created at all.
        """
        if not markdown_instance:
            return
        self.markdown = markdown_instance
class Preprocessor(Processor):
    """
    Base class for preprocessors, which are run after the text has been
    broken into lines.

    A preprocessor implements a ``run`` method that receives the list of
    lines of the document, modifies it as necessary and returns either that
    same list or a new one.

    Preprocessors must extend markdown.Preprocessor.

    """

    def run(self, lines):
        """
        Process the document; meant to be overridden by every subclass.

        `lines` is the document as a list of strings split by newlines.  An
        override returns the (possibly modified) list of lines.  This base
        implementation does nothing and returns None.

        """
class HtmlStash:
    """
    Holds the HTML segments that are pulled out of the document early on and
    swapped for place-holders.
    """

    def __init__(self):
        """ Start with an empty stash. """
        self.rawHtmlBlocks = []
        self.html_counter = 0  # number of segments stored so far

    def store(self, html, safe=False):
        """
        Keep an HTML segment so it can be reinserted later.

        Keyword arguments:

        * html: an html segment
        * safe: label an html segment as safe for safemode

        Returns : the placeholder string that has to be put into the
        document in place of the segment.

        """
        self.rawHtmlBlocks.append((html, safe))
        index = self.html_counter
        token = HTML_PLACEHOLDER % index
        self.html_counter = index + 1
        return token

    def reset(self):
        """ Forget everything stored so far and restart the counter at 0. """
        # A fresh list is bound on purpose; the previous list object is left
        # untouched for anyone still holding a reference to it.
        self.rawHtmlBlocks = []
        self.html_counter = 0
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    The text is cut into blocks at blank lines.  A block that opens with a
    tag accepted by ``markdown.isBlockLevel`` (or with ``<!``, ``<?``, ``<@``
    or ``<%``) is handed to ``self.markdown.htmlStash`` and replaced by the
    placeholder that ``store`` returns; all other blocks pass through
    unchanged.
    """

    # Templates used to look for the tag that closes ``left_tag``: first a
    # regular closing tag, then a bare "name>" ending (with left_tag "--" the
    # latter is the "-->" that ends a comment).
    right_tag_patterns = ["</%s>", "%s>"]

    def _get_left_tag(self, block):
        """Return the lower-cased name of the tag that opens *block*.

        Only the tag name is returned: everything after the first whitespace
        inside the opening tag (i.e. its attributes) is dropped, so that the
        result can be compared with names such as 'div' or 'hr' and used to
        build the closing-tag patterns.  ``<div class="x">`` gives ``div``,
        ``<hr/>`` gives ``hr/``.  An empty string is returned when there is
        no name at all (``<>``).
        """
        head = block[1:].split(">", 1)[0]
        words = head.split(None, 1)
        if not words:
            return ""
        return words[0].lower()

    def _get_right_tag(self, left_tag, block):
        """Find the tag that closes *left_tag* inside *block*.

        Returns a ``(right_tag, data_index)`` tuple.  When one of
        ``right_tag_patterns`` is found (last occurrence, at an index greater
        than 2) ``right_tag`` is that tag without its leading "<" and
        trailing ">", and ``data_index`` is the index just past it.
        Otherwise ``right_tag`` is taken from the tail of the right-stripped
        block and ``data_index`` is ``len(block)``.
        """
        for pattern in self.right_tag_patterns:
            tag = pattern % left_tag
            i = block.rfind(tag)
            if i > 2:
                # len(tag) == len(pattern) - 2 + len(left_tag): "%s" is two
                # characters wide.
                return tag.lstrip("<").rstrip(">"), i + len(tag)
        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True when *right_tag* is accepted as closing *left_tag*."""
        # Slices instead of indexing so that empty tags cannot raise.
        if left_tag == 'div' or left_tag[:1] in ['?', '@', '%']:
            # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if right_tag == "--" and left_tag == "--":
            return True
        return left_tag == right_tag[1:] and right_tag[:1] != "<"

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag (``hr``)."""
        return tag in ('hr', 'hr/')

    def run(self, lines):
        """Stash raw HTML blocks and return the document as a list of lines.

        *lines* is the document as a list of strings.  The returned list is
        the processed text split at newlines, with every stashed block
        replaced by its placeholder.
        """
        # A deque gives O(1) removal/insertion at the front of the queue.
        pending = deque("\n".join(lines).split("\n\n"))
        new_blocks = []
        items = []        # blocks collected for an HTML element still open
        left_tag = ''
        right_tag = ''
        in_tag = False    # True while inside an unclosed HTML block

        while pending:
            block = pending.popleft()
            # Drop up to two leading newlines.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if in_tag:
                # Keep collecting until a block closes the open tag.
                items.append(block.strip())
                right_tag, data_index = self._get_right_tag(left_tag, block)
                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag
                    in_tag = False
                    new_blocks.append(
                        self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []
                continue

            if not (block.startswith("<") and len(block.strip()) > 1):
                new_blocks.append(block)
                continue

            left_tag = self._get_left_tag(block)
            right_tag, data_index = self._get_right_tag(left_tag, block)

            if block[1] == "!":
                # is a comment block
                left_tag = "--"
                right_tag, data_index = self._get_right_tag(left_tag, block)
                # keep checking conditions below and maybe just append

            if data_index < len(block) and markdown.isBlockLevel(left_tag):
                # Text follows the closing tag: queue it as its own block.
                pending.appendleft(block[data_index:])
                block = block[:data_index]

            if not (markdown.isBlockLevel(left_tag)
                    or block[1] in ["!", "?", "@", "%"]):
                new_blocks.append(block)
                continue

            if self._is_oneliner(left_tag):
                new_blocks.append(block.strip())
                continue

            if (block.rstrip().endswith(">")
                    and self._equal_tags(left_tag, right_tag)):
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))
                continue

            # The block is not complete.  `and` binds tighter than `or`; the
            # inner parentheses only spell out the grouping the expression
            # already had.
            if (markdown.isBlockLevel(left_tag)
                    or (left_tag == "--"
                        and not block.rstrip().endswith(">"))):
                items.append(block.strip())
                in_tag = True
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))

        if items:
            # The document ended while a tag was still open: stash what was
            # collected anyway.
            new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
class ReferencePreprocessor(Preprocessor):
    """ Strip reference definitions out of the text and keep them for later. """

    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)

    def run(self, lines):
        """
        Record reference definitions and return the remaining lines.

        A line matching ``RE`` is stored in ``self.markdown.references``
        under its stripped, lower-cased id as a ``(link, title)`` tuple and
        removed from the output.  The text after the link counts as a title
        only when it is wrapped in double quotes, single quotes or
        parentheses; a matching line whose trailing text is not wrapped that
        way is left in the output untouched.
        """
        remaining = []
        for line in lines:
            found = self.RE.match(line)
            if found is None:
                remaining.append(line)
                continue

            ref_key = found.group(2).strip().lower()
            title = found.group(4).strip()  # potential title

            if not title:
                self.markdown.references[ref_key] = (found.group(3), title)
                continue

            wrapped = len(title) >= 2 and (
                title[0] == title[-1] == "\""
                or title[0] == title[-1] == "\'"
                or (title[0] == "(" and title[-1] == ")")
            )
            if wrapped:
                self.markdown.references[ref_key] = (found.group(3),
                                                     title[1:-1])
            else:
                remaining.append(line)

        return remaining
|