/usr/share/pyshared/bleach/sanitizer.py is in python-bleach 1.4-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
from __future__ import unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from html5lib.constants import tokenTypes
from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer
# PROTOS is bound to the very same object as
# HTMLSanitizerMixin.acceptable_protocols -- no copy is made here.
PROTOS = HTMLSanitizerMixin.acceptable_protocols
# Because of that aliasing, remove() mutates the attribute on
# HTMLSanitizerMixin in place: 'feed' disappears for everything that reads
# that object, not only for this module. remove() raises if 'feed' is not
# present (e.g. if this module body is executed a second time).
# NOTE(review): presumably the in-place mutation is relied upon so that the
# sanitizer classes below stop accepting 'feed' URLs -- confirm before
# replacing this with a copy.
PROTOS.remove('feed')
class BleachSanitizerMixin(HTMLSanitizerMixin):
    """Mixin to replace sanitize_token() and sanitize_css().

    The methods read several attributes from ``self`` that are not defined
    in this class (``allowed_elements``, ``allowed_attributes``,
    ``allowed_protocols``, ``allowed_css_properties``, ``attr_val_is_uri``,
    ``svg_attr_val_allows_ref``, ``svg_allow_local_href``,
    ``strip_disallowed_elements``, ``strip_html_comments``); they are
    expected to come from HTMLSanitizerMixin or from the concrete subclass.
    """

    allowed_svg_properties = []

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name
        and value. It should return true or false.

        Also gives the option to strip tags instead of encoding.

        Returns the (possibly modified) token, or None when the token is
        dropped: a disallowed element while ``strip_disallowed_elements``
        is set, or a comment while ``strip_html_comments`` is set.
        """
        # Cache the '*' entry once per instance; only meaningful when
        # allowed_attributes is a dict.
        if (getattr(self, 'wildcard_attributes', None) is None and
                isinstance(self.allowed_attributes, dict)):
            self.wildcard_attributes = self.allowed_attributes.get('*', [])

        token_type = token['type']

        if token_type in (tokenTypes['StartTag'], tokenTypes['EndTag'],
                          tokenTypes['EmptyTag']):
            if token['name'] in self.allowed_elements:
                if 'data' in token:
                    token['data'] = self._sanitize_attributes(token)
                return token

            if self.strip_disallowed_elements:
                # Drop the tag entirely.
                return None

            # Otherwise turn the disallowed tag into plain character data
            # holding its textual form.
            if token_type == tokenTypes['EndTag']:
                token['data'] = '</{0!s}>'.format(token['name'])
            elif token['data']:
                attr = ' {0!s}="{1!s}"'
                attrs = ''.join([attr.format(k, escape(v)) for k, v in
                                 token['data']])
                token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
            else:
                token['data'] = '<{0!s}>'.format(token['name'])
            if token['selfClosing']:
                token['data'] = token['data'][:-1] + '/>'
            token['type'] = tokenTypes['Characters']
            del token["name"]
            return token

        if token_type == tokenTypes['Comment']:
            if not self.strip_html_comments:
                return token
            return None

        return token

    def _sanitize_attributes(self, token):
        """Return the filtered list of (name, value) pairs for a tag token."""
        if isinstance(self.allowed_attributes, dict):
            allowed_attributes = self.allowed_attributes.get(
                token['name'], [])
            if not callable(allowed_attributes):
                # Build a NEW list. The previous "+=" extended the list
                # stored in self.allowed_attributes in place, so the
                # caller's configuration grew with every token and kept
                # the wildcard entries permanently.
                allowed_attributes = (list(allowed_attributes) +
                                      list(self.wildcard_attributes))
        else:
            allowed_attributes = self.allowed_attributes

        # Iterate in reverse so that, for a duplicated attribute name, the
        # first occurrence in the source is the one that ends up in the dict.
        pairs = token['data'][::-1]
        if callable(allowed_attributes):
            attrs = dict((name, val) for name, val in pairs
                         if allowed_attributes(name, val))
        else:
            attrs = dict((name, val) for name, val in pairs
                         if name in allowed_attributes)

        for attr in self.attr_val_is_uri:
            if attr not in attrs:
                continue
            # Strip backticks, control characters and whitespace before
            # looking for a protocol prefix. The octal escapes are
            # interpreted by the re module (raw string).
            val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+", '',
                                   unescape(attrs[attr])).lower()
            # Remove replacement characters from unescaped
            # characters.
            val_unescaped = val_unescaped.replace("\ufffd", "")
            if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
                    and (val_unescaped.split(':')[0] not in
                         self.allowed_protocols)):
                del attrs[attr]

        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                # Blank out url(...) references whose target does not start
                # with '#'.
                attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(attrs[attr]))

        # Drop xlink:href values that do not start with '#'.
        if (token['name'] in self.svg_allow_local_href and
                'xlink:href' in attrs and
                re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
            del attrs['xlink:href']

        if 'style' in attrs:
            attrs['style'] = self.sanitize_css(attrs['style'])

        return [(name, val) for name, val in attrs.items()]

    def sanitize_css(self, style):
        """HTMLSanitizerMixin.sanitize_css replacement.

        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
        border-*, margin-*, and padding-*. We only whitelist what's in
        the whitelist.

        Returns the cleaned declarations joined by single spaces, or '' when
        the style string fails validation.
        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        # TODO: Make sure this does what it's meant to - I *think* it wants to
        # validate style attribute contents.
        parts = style.split(';')
        gauntlet = re.compile(r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
                              r"""\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
        for part in parts:
            if not gauntlet.match(part):
                return ''

        # The whole string must look like "prop: value; prop: value ...".
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue
            name = prop.lower()
            if (name in self.allowed_css_properties or
                    name in self.allowed_svg_properties):
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
    """HTMLTokenizer whose token stream is filtered by sanitize_token()."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        """Forward every argument unchanged to HTMLTokenizer.__init__."""
        tokenizer_args = (stream, encoding, parseMeta, useChardet,
                          lowercaseElementName, lowercaseAttrName)
        HTMLTokenizer.__init__(self, *tokenizer_args, **kwargs)

    def __iter__(self):
        """Yield the sanitized tokens, skipping any that come back falsy."""
        for raw_token in HTMLTokenizer.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned
|