/usr/lib/python2.7/dist-packages/skytools-3.0/skytools/utf8.py is in python-skytools3 3.2.6-4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
r"""UTF-8 sanitizer.

Python's UTF-8 parser is quite relaxed; this creates problems when
talking with other software that uses stricter parsers.
>>> safe_utf8_decode("foobar")
(True, u'foobar')
>>> safe_utf8_decode('X\xed\xa0\x80Y\xed\xb0\x89Z')
(False, u'X\ufffdY\ufffdZ')
>>> safe_utf8_decode('X\xed\xa0\x80\xed\xb0\x89Z')
(False, u'X\U00010009Z')
>>> safe_utf8_decode('X\0Z')
(False, u'X\ufffdZ')
>>> safe_utf8_decode('OK')
(True, u'OK')
>>> safe_utf8_decode('X\xF1Y')
(False, u'X\ufffdY')
"""
import re, codecs
__all__ = ['safe_utf8_decode']
# Character substituted for invalid input.  U+FFFD is the default so the
# output uses the same symbol as the builtin 'replace' error handler.
REPLACEMENT_SYMBOL = u"\uFFFD"
def _fix_utf8(m):
    """Regex substitution callback: merge UTF16 surrogates, replace others.

    A two-character match is treated as a surrogate pair (high, low) and
    merged into the single code point it encodes.  Any other match is
    swapped for REPLACEMENT_SYMBOL.

    @param m: match object whose matched text is being substituted
    @return: unicode string to put in place of the match
    """
    matched = m.group()
    if len(matched) != 2:
        # not a pair, nothing to merge
        return REPLACEMENT_SYMBOL

    # Each half contributes its low 10 bits; the combined 20-bit value
    # is an offset from U+10000.
    # NOTE(review): unichr() of a value above 0xFFFF presumably needs a
    # wide (UCS-4) Python build - confirm before using elsewhere.
    high, low = [ord(ch) & 0x3FF for ch in matched]
    return unichr(0x10000 + (high << 10) + low)
# compiled pattern, built on the first call to sanitize_unicode()
_urc = None


def sanitize_unicode(u):
    """Repair invalid symbols in an already-decoded unicode string.

    Every match of the invalid-character pattern is passed through
    _fix_utf8.  When the pattern does not match at all, the very same
    object is returned rather than a copy.

    @param u: unicode string to check
    @return: unicode string
    """
    global _urc
    assert isinstance(u, unicode)

    if _urc is None:
        # Verbose-mode pattern (whitespace is ignored): a high surrogate
        # with an optional low surrogate after it, or a lone NUL / low
        # surrogate.
        pattern = u"[\uD800-\uDBFF] [\uDC00-\uDFFF]? | [\0\uDC00-\uDFFF]"
        _urc = re.compile(pattern, re.X)

    # Substitute only when something actually matches; otherwise hand
    # the input back untouched.
    if _urc.search(u) is None:
        return u
    return _urc.sub(_fix_utf8, u)
def safe_replace(exc):
    """Codec error handler that replaces only one symbol at a time.

    Builtin .decode('xxx', 'replace') replaces several symbols
    together, which is unsafe.

    @param exc: exception object handed over by the codec
    @return: tuple of (replacement unicode string, position to resume at)
    @raise: exc itself, when it is not a UnicodeDecodeError
    """
    if not isinstance(exc, UnicodeDecodeError):
        raise exc

    # Substitute just the item at exc.start and resume right after it.
    # (An alternative would be to assume latin1 and map that byte with
    # unichr(ord(...)); that variant was never enabled.)
    return REPLACEMENT_SYMBOL, exc.start + 1


# register, it will be globally available
codecs.register_error("safe_replace", safe_replace)
def safe_utf8_decode(s):
    """Decode a UTF-8 byte string, repairing whatever is invalid.

    Works like str.decode('utf8', 'replace'), except that UTF16
    surrogates and NUL bytes are fixed as well, which Python's default
    decoder does not do.

    @param s: utf8-encoded byte string
    @return: tuple of (was_valid_utf8, unicode_string)
    """
    try:
        # common case: the input decodes without complaint
        decoded = s.decode('utf8')
        well_formed = True
    except UnicodeDecodeError:
        # decode again, this time through the 'safe_replace' handler
        decoded = s.decode('utf8', 'safe_replace')
        well_formed = False

    cleaned = sanitize_unicode(decoded)
    # sanitize_unicode() returns its argument itself when it had nothing
    # to fix, so an identity test is enough to detect a repair
    if cleaned is not decoded:
        well_formed = False
    return (well_formed, cleaned)
if __name__ == '__main__':
    # executed as a script: run the doctests found in this module
    import doctest
    doctest.testmod()
|