# Source: GitHub gist gasman/e640f6e12609ffe43718403203fd8d65 (created May 13, 2020 01:24).
# Note: GitHub flagged this file as containing hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than it appears. To review, open the file in an editor
# that reveals hidden Unicode characters.
from collections import defaultdict | |
from collections.abc import Mapping | |
import html | |
import re | |
import warnings | |
from django.utils.html import conditional_escape, format_html | |
from django.utils.safestring import mark_safe | |
# --- Selector patterns -------------------------------------------------------
# Each pattern matches one complete selector string and captures its parts.

# "a"  ->  (element name)
ELEMENT_SELECTOR = re.compile(r'^([\w-]+)$')
# "a[href]"  ->  (element name, attribute name)
ELEMENT_WITH_ATTR_SELECTOR = re.compile(r'^([\w-]+)\[([\w-]+)\]$')
# "a[linktype='page']"  ->  (element name, attribute name, attribute value)
ELEMENT_WITH_ATTR_EXACT_SINGLE_QUOTE_SELECTOR = re.compile(r"^([\w-]+)\[([\w-]+)='(.*)'\]$")
# 'a[linktype="page"]'  ->  (element name, attribute name, attribute value)
ELEMENT_WITH_ATTR_EXACT_DOUBLE_QUOTE_SELECTOR = re.compile(r'^([\w-]+)\[([\w-]+)="(.*)"\]$')
# "a[linktype=page]"  ->  (element name, attribute name, attribute value)
ELEMENT_WITH_ATTR_EXACT_UNQUOTED_SELECTOR = re.compile(r"^([\w-]+)\[([\w-]+)=([\w-]+)\]$")

# --- Attribute patterns ------------------------------------------------------
# Each alternative captures (name, value); combined below they occupy groups
# (1, 2), (3, 4) and (5, 6) respectively.
DOUBLE_QUOTED_ATTRIBUTE = r'([\w-]+)\="([^"]*)"'
# The value class must exclude the single quote. An empty negated class
# ("[^]") is parsed by Python as a class beginning with a literal "]", which
# swallows the following group and leaves ATTRIBUTE with five groups instead
# of six.
SINGLE_QUOTED_ATTRIBUTE = r"([\w-]+)\='([^']*)'"
UNQUOTED_ATTRIBUTE = r'([\w-]+)\=([\w-]+)'

# Matches a single name=value attribute in any of the three quoting styles.
ATTRIBUTE = re.compile(
    r'(?:%s|%s|%s)' % (DOUBLE_QUOTED_ATTRIBUTE, SINGLE_QUOTED_ATTRIBUTE, UNQUOTED_ATTRIBUTE)
)
class Rule:
    """A CSS-like rule that an HTML element can match or not.

    Subclasses set ``priority`` to an integer; a numerically smaller value
    sorts first, so it is tried earlier when rules are kept in sorted order.
    """

    # Overridden by subclasses; None here means "no ordering defined".
    priority = None

    def __lt__(self, other):
        """Order Rule objects by ``priority`` so that ``sort()`` works on a list of them.

        Returns ``NotImplemented`` when ``other`` is not a Rule, letting Python
        raise its usual ``TypeError`` for unsupported comparisons.
        """
        if not isinstance(other, Rule):
            return NotImplemented
        return self.priority < other.priority
class ElementRule(Rule):
    """Rule selecting an element purely by tag name, whatever its attributes."""

    # numerically larger than the attribute-based rules (priority 1), so it
    # sorts after them
    priority = 2

    def __init__(self, name, rewriter):
        self.name, self.rewriter = name, rewriter

    def attributes_match(self, attrs):
        """Return True unconditionally: attributes play no part in this rule."""
        return True
class ElementWithAttributeRule(Rule):
    """Rule selecting an element by tag name plus the presence of one attribute."""

    # numerically smaller than ElementRule's priority (2), so it sorts first
    priority = 1

    def __init__(self, name, attr, rewriter):
        self.name, self.attr = name, attr
        self.rewriter = rewriter

    def attributes_match(self, attrs):
        """Return whether the required attribute name is a key of ``attrs``."""
        is_present = self.attr in attrs
        return is_present
class ElementWithAttributeExactRule(Rule):
    """Rule selecting an element by tag name plus an exact attribute value."""

    # numerically smaller than ElementRule's priority (2), so it sorts first
    priority = 1

    def __init__(self, name, attr, value, rewriter):
        self.name, self.attr, self.value = name, attr, value
        self.rewriter = rewriter

    def attributes_match(self, attrs):
        """Return whether ``attrs`` holds the attribute with exactly the expected value."""
        if self.attr not in attrs:
            return False
        return attrs[self.attr] == self.value
class HTMLRewriter:
    """Rewrite selected elements of an HTML string according to CSS-like rules.

    Each rule pairs a selector (``name``, ``name[attr]`` or ``name[attr=value]``
    with optional single/double quotes) with a rewriter object that implements
    either ``rewrite_attributes(name, attrs)`` (returns the replacement
    attribute dict; the tag itself is kept) or
    ``rewrite_element(name, attrs, content)`` (returns replacement markup for
    the whole element).

    Parsing is regex-based: only attributes written as ``name=value`` are
    recognised, and a ``>`` inside an attribute value ends the tag early.
    """

    def __init__(self, rules):
        """Create a rewriter from a ``{selector: rewriter}`` mapping or a list of pairs."""
        self.rules_by_element = defaultdict(list)
        self.add_rules(rules)

    def add_rules(self, rules):
        """Register several rules at once.

        Accepts either a dict of ``{selector: rewriter}``, or an iterable of
        ``(selector, rewriter)`` tuples.
        """
        if isinstance(rules, Mapping):
            rules = rules.items()
        for selector, rewriter in rules:
            self.add_rule(selector, rewriter)

    def add_rule(self, selector, rewriter):
        """Register one rule; emit a warning if the selector syntax is not supported."""
        rule = self._build_rule(selector, rewriter)
        if rule is None:
            warnings.warn("Unsupported selector format: %r" % selector)
            return
        element_rules = self.rules_by_element[rule.name]
        element_rules.append(rule)
        # keep rules ordered by priority; list.sort() is stable, so rules of
        # equal priority stay in the order they were added
        element_rules.sort()

    @staticmethod
    def _build_rule(selector, rewriter):
        """Return the Rule object for ``selector``, or None if its format is unsupported."""
        match = ELEMENT_SELECTOR.match(selector)
        if match:
            return ElementRule(match.group(1), rewriter)

        match = ELEMENT_WITH_ATTR_SELECTOR.match(selector)
        if match:
            name, attr = match.groups()
            return ElementWithAttributeRule(name, attr, rewriter)

        for regex in (
            ELEMENT_WITH_ATTR_EXACT_SINGLE_QUOTE_SELECTOR,
            ELEMENT_WITH_ATTR_EXACT_DOUBLE_QUOTE_SELECTOR,
            ELEMENT_WITH_ATTR_EXACT_UNQUOTED_SELECTOR,
        ):
            match = regex.match(selector)
            if match:
                name, attr, value = match.groups()
                return ElementWithAttributeExactRule(name, attr, value, rewriter)

        return None

    def rewrite(self, html):
        """
        Rewrite a string of HTML according to the configured rules
        """
        result, _ = self._rewrite(html)
        return result

    def _compile_tag_re(self, until_tag):
        """Compile the regexp that finds the next tag of interest.

        The pattern is structured as ``<(tagname)(attrs)>`` or, when
        ``until_tag`` is given, ``<(tagname)(attrs)>|</(until_tag)>`` - so
        groups 1 and 2 describe an opening tag and group 3 a closing tag.
        """
        # matches any element name in our rule list, e.g. "h1|a|embed"
        element_names = '|'.join(re.escape(name) for name in self.rules_by_element)
        # Any opening tag with one of these names and any set of attributes.
        # The lookahead rejects longer names that merely start with a rule's
        # name: selectors allow hyphens, so a plain \b would let a rule for
        # "a" match "<a-b>".
        opening_tag_re = r'<(%s)((?![\w-])[^>]*)>' % element_names
        if not until_tag:
            # no until_tag specified, so we're just looking for opening tags
            return re.compile(opening_tag_re)
        closing_tag_re = r'</(%s)>' % re.escape(until_tag)
        return re.compile(r'(?:%s|%s)' % (opening_tag_re, closing_tag_re))

    def _rewrite(self, html, *, start=0, until_tag=None):
        """
        Internal method used recursively by `rewrite`.

        Rewrites a string of HTML, starting from offset `start`, until it encounters either the end
        of the string, or a closing tag as specified in `until_tag` that is NOT balanced by an
        opening tag seen in the current invocation of `_rewrite`.

        In other words,
            some text with <span>an extra span</span> in it</span>
                                                           ^^^^^^^ this one.

        Upon reaching either end condition (closing tag or end of string), it will return a tuple
        of the rewritten HTML string (marked safe) and the offset of any subsequent
        not-yet-processed HTML.

        Raises TypeError if a matching rule's rewriter implements neither `rewrite_attributes`
        nor `rewrite_element`.
        """
        if not self.rules_by_element:
            # no rewrite rules, so nothing to do. Yay!
            return mark_safe(html[start:]), len(html)

        position = start  # current position within the html string
        chunks = []  # pieces of rewritten HTML, joined once at the end
        # the number of pending occurrences of until_tag that we will skip past before treating it
        # as the ACTUAL closing tag
        ignored_closing_tag_count = 0
        tag_re = self._compile_tag_re(until_tag)

        while True:
            match = tag_re.search(html, position)
            if not match:
                # we have reached the end of the HTML string with no more elements to rewrite.
                # This shouldn't happen if we were expecting a closing tag, so warn in that case
                if until_tag:
                    warnings.warn(
                        "Reached end of string without encountering closing %r tag" % until_tag
                    )
                # either way, the remaining non-rewritten HTML goes on the end and we're done
                chunks.append(html[position:])
                return mark_safe(''.join(chunks)), len(html)

            # we've found a tag that we're interested in, but first keep all of the
            # non-rewritten HTML up to that point
            chunks.append(html[position:match.start()])
            # once we've finished handling this tag, we'll continue from the offset after it
            position = match.end()
            tag = match.group(0)

            if tag.startswith('</'):
                if ignored_closing_tag_count == 0:
                    # this is the real closing tag, so we're done with this invocation
                    return mark_safe(''.join(chunks)), position
                # this tag closes an element that was opened (and emitted) earlier in this
                # invocation - it isn't the *real* closing tag, so it belongs in the output
                ignored_closing_tag_count -= 1
                chunks.append(tag)
                continue

            # opening tag: parse it into its components
            tag_name = match.group(1)
            attr_string = match.group(2)
            is_self_closing = attr_string.endswith('/')
            if is_self_closing:
                attr_string = attr_string[:-1]
            attrs = self.unpack_attr_string(attr_string)

            # rules are kept sorted by priority, so the first match wins
            matching_rule = next(
                (
                    rule for rule in self.rules_by_element[tag_name]
                    if rule.attributes_match(attrs)
                ),
                None,
            )
            # an element emitted with its own tag keeps its closing tag in the output; if it
            # shares its name with until_tag, that closing tag must be skipped over rather than
            # treated as the *real* one
            opens_nested_until_tag = tag_name == until_tag and not is_self_closing

            if matching_rule is None:
                # no matching rewrite rule, so output this tag unchanged
                chunks.append(tag)
                if opens_nested_until_tag:
                    ignored_closing_tag_count += 1

            elif hasattr(matching_rule.rewriter, 'rewrite_attributes'):
                # output the original tag with the updated attributes; the closing tag is
                # left unchanged in the output
                new_attrs = matching_rule.rewriter.rewrite_attributes(tag_name, attrs)
                new_attr_string = ' '.join(
                    '%s="%s"' % (conditional_escape(key), conditional_escape(val))
                    for key, val in new_attrs.items()
                )
                tag_end = '/>' if is_self_closing else '>'
                chunks.append('<%s %s%s' % (tag_name, new_attr_string, tag_end))
                if opens_nested_until_tag:
                    ignored_closing_tag_count += 1

            elif hasattr(matching_rule.rewriter, 'rewrite_element'):
                if is_self_closing:
                    # this element has no content, so pass an empty string as content
                    content = ''
                else:
                    # consume the element content (and its closing tag) with a recursive
                    # call to _rewrite
                    content, position = self._rewrite(
                        html, start=position, until_tag=tag_name
                    )
                rewritten_element = matching_rule.rewriter.rewrite_element(
                    tag_name, attrs, content
                )
                # escape the result of rewrite_element, unless it's a safe string
                # (e.g. the output of format_html)
                chunks.append(conditional_escape(rewritten_element))

            else:
                raise TypeError(
                    "Invalid ElementRewriter: %r. An ElementRewriter must implement "
                    "either rewrite_element or rewrite_attributes"
                    % matching_rule.rewriter
                )

    @staticmethod
    def unpack_attr_string(attr_string):
        """Unpack a string of HTML attributes into a dict of unescaped strings"""
        attributes = {}
        for match in ATTRIBUTE.finditer(attr_string):
            # ATTRIBUTE holds three alternatives (double-quoted, single-quoted, unquoted);
            # whichever one matched fills its own (name, value) pair of groups
            for name_group in (1, 3, 5):
                name = match.group(name_group)
                if name:
                    attributes[name] = html.unescape(match.group(name_group + 1))
                    break
        return attributes
class ElementRewriter:
    """Empty base class for the rewriter objects attached to HTMLRewriter rules."""
class ImageRewriter(ElementRewriter):
    """Replaces the whole element with an ``<img>`` tag built from its attributes."""

    def rewrite_element(self, name, attributes, content):
        """Return an ``<img>`` tag using the ``id`` and ``alt`` attributes.

        ``name`` and ``content`` are ignored. Raises KeyError if ``id`` or
        ``alt`` is missing from ``attributes``.
        """
        image_id = attributes['id']
        alt_text = attributes['alt']
        # format_html escapes both substituted values
        template = '<img src="/images/image-{}.jpg" alt="{}" />'
        return format_html(template, image_id, alt_text)
class PageLinkRewriter(ElementRewriter):
    """Swaps an element's attributes for a page ``href`` and a fixed ``class``."""

    def rewrite_attributes(self, name, attributes):
        """Return the replacement attribute dict, derived from the ``id`` attribute.

        ``name`` is ignored. Raises KeyError if ``attributes`` has no ``id``.
        """
        page_url = '/pages/page-%s' % attributes['id']
        replacement = {'href': page_url}
        replacement['class'] = 'page'
        return replacement
class IntroParagraphRewriter(ElementRewriter):
    """Replaces the element with a ``<p class="intro">`` wrapping its content."""

    def rewrite_element(self, name, attributes, content):
        """Return the content wrapped in an intro paragraph.

        ``name`` and ``attributes`` are ignored. ``content`` goes through
        format_html, which escapes it unless it is already a safe string.
        """
        wrapper = '<p class="intro">{}</p>'
        return format_html(wrapper, content)
class ROT13Rewriter(ElementRewriter):
    """Replaces the element with the ROT13 encoding of its content."""

    def rewrite_element(self, name, attributes, content):
        """Return ``content`` passed through the ``rot_13`` codec.

        ``name`` and ``attributes`` are ignored; the element's own tags are
        not part of the returned value.
        """
        from codecs import encode as codec_encode

        scrambled = codec_encode(content, 'rot_13')
        return scrambled
# Demo: build a rewriter with one rule per example rewriter class, run it over
# a sample document and print the result.
rewriter = HTMLRewriter({
    "embed[embedtype='image']": ImageRewriter(),
    "a[linktype='page']": PageLinkRewriter(),
    "intro": IntroParagraphRewriter(),
    "rot13": ROT13Rewriter(),
})

db_html = """
<h1>HTML Rewriter example</h1>
<intro><strong>First</strong>, we have a paragraph with <a linktype="page" id="123">a <em>wonderful</em> link</a> in it.</intro>
<p>This <a href="https://torchbox.com/">external link</a> doesn't get rewritten though. Now for an image:</p>
<embed embedtype="image" id="456" alt="I <3 kittens" />
<p>But all we really care about is that <rot13>Darth Vader is Luke's father</rot13>.</p>
"""

print(rewriter.rewrite(db_html))

# Expected output. Values that pass through format_html / conditional_escape
# are HTML-escaped, so the "<" in the alt text is emitted as "&lt;". The ROT13
# result is a plain (non-safe) string, so its apostrophe is escaped as well.
# NOTE(review): "&#x27;" is what Django 3.0+ emits for an apostrophe; older
# versions emit "&#39;" - confirm against the Django version in use.
"""
Output:
<h1>HTML Rewriter example</h1>
<p class="intro"><strong>First</strong>, we have a paragraph with <a href="/pages/page-123" class="page">a <em>wonderful</em> link</a> in it.</p>
<p>This <a href="https://torchbox.com/">external link</a> doesn't get rewritten though. Now for an image:</p>
<img src="/images/image-456.jpg" alt="I &lt;3 kittens" />
<p>But all we really care about is that Qnegu Inqre vf Yhxr&#x27;f sngure.</p>
"""
# (GitHub page footer, not part of the script: "Sign up for free to join this
# conversation on GitHub. Already have an account? Sign in to comment.")