/usr/lib/python3/dist-packages/bleach/sanitizer.py is in python3-bleach 1.4.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147  | from __future__ import unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from html5lib.constants import tokenTypes
from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer
# PROTOS is bound to the very same object as
# HTMLSanitizerMixin.acceptable_protocols (no copy is made), so the remove()
# call below drops 'feed' from that shared object in place, as a side effect
# of importing this module.
PROTOS = HTMLSanitizerMixin.acceptable_protocols
PROTOS.remove('feed')
class BleachSanitizerMixin(HTMLSanitizerMixin):
    """Mixin to replace sanitize_token() and sanitize_css().

    The methods read their configuration from attributes on ``self``:
    ``allowed_elements``, ``allowed_attributes``, ``allowed_protocols``,
    ``allowed_css_properties``, ``allowed_svg_properties``,
    ``attr_val_is_uri``, ``svg_attr_val_allows_ref``,
    ``svg_allow_local_href``, ``strip_disallowed_elements`` and
    ``strip_html_comments``. Only ``allowed_svg_properties`` is defined
    here; the others are presumably supplied by HTMLSanitizerMixin or by the
    code that configures the sanitizer -- verify against the caller.
    """

    allowed_svg_properties = []

    # Patterns are compiled once, and written as raw strings so the regex
    # escapes (\s, \w, \d, \(, ...) are not treated as (invalid) Python
    # string escapes.

    # Backticks, code points U+0000-U+0020 and U+007F-U+00A0, and any other
    # whitespace: all removed from a URI value before its scheme is examined.
    _uri_strip_re = re.compile(r'[`\x00-\x20\x7f-\xa0\s]+')
    # A leading "scheme:" prefix.
    _uri_scheme_re = re.compile(r'^[a-z0-9][-+.a-z0-9]*:')
    # url(...) whose argument does not start with '#'.
    _svg_url_ref_re = re.compile(r'url\s*\(\s*[^#\s][^)]+?\)')
    # A value whose first non-whitespace character is not '#'.
    _non_local_href_re = re.compile(r'^\s*[^#\s].*')
    # Any url(...) inside a style attribute.
    _css_url_re = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*')
    # TODO: Make sure this does what it's meant to - I *think* it wants to
    # validate style attribute contents.
    _css_gauntlet_re = re.compile(
        r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
        r"""\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
    # The whole style value must be a sequence of "prop: value;" items.
    _css_decl_list_re = re.compile(r'^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$')
    # One "prop: value" declaration.
    _css_decl_re = re.compile(r'([-\w]+)\s*:\s*([^:;]*)')

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name
        and value. It should return true or false.

        Also gives the option to strip tags instead of encoding.

        Returns the (mutated) token, or None when the token is dropped: a
        disallowed tag while ``strip_disallowed_elements`` is true, or a
        comment while ``strip_html_comments`` is true.
        """
        # Cache the '*' entry on the instance the first time it is needed.
        if (getattr(self, 'wildcard_attributes', None) is None and
                isinstance(self.allowed_attributes, dict)):
            self.wildcard_attributes = self.allowed_attributes.get('*', [])

        token_type = token['type']
        if token_type in (tokenTypes['StartTag'], tokenTypes['EndTag'],
                          tokenTypes['EmptyTag']):
            if token['name'] in self.allowed_elements:
                if 'data' in token:
                    token['data'] = self._sanitize_attributes(
                        token['name'], token['data'])
                return token
            if self.strip_disallowed_elements:
                return None
            return self._escape_disallowed_tag(token)

        if token_type == tokenTypes['Comment']:
            return None if self.strip_html_comments else token

        return token

    def _sanitize_attributes(self, tag_name, attr_pairs):
        """Return the filtered list of (name, value) pairs for an allowed tag.

        ``attr_pairs`` is the token's list of (name, value) tuples.
        """
        if isinstance(self.allowed_attributes, dict):
            allowed = self.allowed_attributes.get(tag_name, [])
            if not callable(allowed):
                # Build a fresh list. An in-place ``+=`` here would extend
                # the list stored in self.allowed_attributes on every token.
                allowed = list(allowed) + list(self.wildcard_attributes)
        else:
            allowed = self.allowed_attributes

        # Iterate in reverse so that, when an attribute name is repeated,
        # the first occurrence is the one left in the dict.
        reversed_pairs = attr_pairs[::-1]
        if callable(allowed):
            attrs = dict((name, val) for name, val in reversed_pairs
                         if allowed(name, val))
        else:
            attrs = dict((name, val) for name, val in reversed_pairs
                         if name in allowed)

        # Drop URI-valued attributes whose scheme is not an allowed protocol.
        for attr in self.attr_val_is_uri:
            if attr not in attrs:
                continue
            val_unescaped = self._uri_strip_re.sub(
                '', unescape(attrs[attr])).lower()
            # Remove replacement characters from unescaped
            # characters.
            val_unescaped = val_unescaped.replace("\ufffd", "")
            if (self._uri_scheme_re.match(val_unescaped) and
                    (val_unescaped.split(':')[0] not in
                     self.allowed_protocols)):
                del attrs[attr]

        # Blank out url(...) references that do not start with '#'.
        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                attrs[attr] = self._svg_url_ref_re.sub(
                    ' ', unescape(attrs[attr]))

        # For these tags, remove an xlink:href that does not start with '#'.
        if (tag_name in self.svg_allow_local_href and
                'xlink:href' in attrs and
                self._non_local_href_re.search(attrs['xlink:href'])):
            del attrs['xlink:href']

        if 'style' in attrs:
            attrs['style'] = self.sanitize_css(attrs['style'])

        return list(attrs.items())

    def _escape_disallowed_tag(self, token):
        """Turn a disallowed tag token into a Characters token of its markup.

        The token is modified in place (its 'name' key is removed) and
        returned.
        """
        tag_name = token['name']
        if token['type'] == tokenTypes['EndTag']:
            markup = '</{0!s}>'.format(tag_name)
        elif token['data']:
            attr = ' {0!s}="{1!s}"'
            attrs = ''.join([attr.format(k, escape(v)) for k, v in
                             token['data']])
            markup = '<{0!s}{1!s}>'.format(tag_name, attrs)
        else:
            markup = '<{0!s}>'.format(tag_name)

        # .get() treats a token without the key as not self-closing.
        if token.get('selfClosing'):
            markup = markup[:-1] + '/>'

        token['data'] = markup
        token['type'] = tokenTypes['Characters']
        del token["name"]
        return token

    def sanitize_css(self, style):
        """HTMLSanitizerMixin.sanitize_css replacement.

        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
        border-*, margin-*, and padding-*. We only whitelist what's in
        the whitelist.

        Returns the surviving declarations as "prop: value;" items joined by
        single spaces, or '' when the style value fails validation.
        """
        # disallow urls
        style = self._css_url_re.sub(' ', style)

        # gauntlet: every ';'-separated part must pass, else reject it all.
        for part in style.split(';'):
            if not self._css_gauntlet_re.match(part):
                return ''

        if not self._css_decl_list_re.match(style):
            return ''

        clean = []
        for prop, value in self._css_decl_re.findall(style):
            if not value:
                continue
            prop_name = prop.lower()
            if (prop_name in self.allowed_css_properties or
                    prop_name in self.allowed_svg_properties):
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
    """Tokenizer whose token stream is filtered through sanitize_token()."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        """Pass every argument through, unchanged, to HTMLTokenizer."""
        tokenizer_args = (stream, encoding, parseMeta, useChardet,
                          lowercaseElementName, lowercaseAttrName)
        HTMLTokenizer.__init__(self, *tokenizer_args, **kwargs)

    def __iter__(self):
        """Yield each sanitized token, skipping falsy sanitizer results."""
        for raw_token in HTMLTokenizer.__iter__(self):
            sanitized = self.sanitize_token(raw_token)
            if not sanitized:
                # sanitize_token dropped this token.
                continue
            yield sanitized
 |