/usr/share/pyshared/allmydata/util/encodingutil.py is in tahoe-lafs 1.9.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | """
Functions used to convert inputs from whatever encoding used in the system to
unicode and back.
"""
import sys
import os
import re
from allmydata.util.assertutil import precondition
from twisted.python import usage
import locale
from allmydata.util import log
from allmydata.util.fileutil import abspath_expanduser_unicode
def canonical_encoding(encoding):
if encoding is None:
log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
encoding = 'utf-8'
encoding = encoding.lower()
if encoding == "cp65001":
encoding = 'utf-8'
elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
encoding = 'ascii'
return encoding
def check_encoding(encoding):
# sometimes Python returns an encoding name that it doesn't support for conversion
# fail early if this happens
try:
u"test".encode(encoding)
except (LookupError, AttributeError):
raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
filesystem_encoding = None
io_encoding = None
is_unicode_platform = False
def _reload():
global filesystem_encoding, io_encoding, is_unicode_platform
filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
check_encoding(filesystem_encoding)
if sys.platform == 'win32':
# On Windows we install UTF-8 stream wrappers for sys.stdout and
# sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
io_encoding = 'utf-8'
else:
ioenc = None
if hasattr(sys.stdout, 'encoding'):
ioenc = sys.stdout.encoding
if ioenc is None:
try:
ioenc = locale.getpreferredencoding()
except Exception:
pass # work around <http://bugs.python.org/issue1443504>
io_encoding = canonical_encoding(ioenc)
check_encoding(io_encoding)
is_unicode_platform = sys.platform in ["win32", "darwin"]
_reload()
def get_filesystem_encoding():
"""
Returns expected encoding for local filenames.
"""
return filesystem_encoding
def get_io_encoding():
"""
Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv.
"""
return io_encoding
def argv_to_unicode(s):
"""
Decode given argv element to unicode. If this fails, raise a UsageError.
"""
precondition(isinstance(s, str), s)
try:
return unicode(s, io_encoding)
except UnicodeDecodeError:
raise usage.UsageError("Argument %s cannot be decoded as %s." %
(quote_output(s), io_encoding))
def argv_to_abspath(s):
"""
Convenience function to decode an argv element to an absolute path, with ~ expanded.
If this fails, raise a UsageError.
"""
return abspath_expanduser_unicode(argv_to_unicode(s))
def unicode_to_argv(s, mangle=False):
"""
Encode the given Unicode argument as a bytestring.
If the argument is to be passed to a different process, then the 'mangle' argument
should be true; on Windows, this uses a mangled encoding that will be reversed by
code in runner.py.
"""
precondition(isinstance(s, unicode), s)
if mangle and sys.platform == "win32":
# This must be the same as 'mangle' in bin/tahoe-script.template.
return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
else:
return s.encode(io_encoding)
def unicode_to_url(s):
"""
Encode an unicode object used in an URL.
"""
# According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
# FIXME
return to_str(s)
#precondition(isinstance(s, unicode), s)
#return s.encode('utf-8')
def to_str(s):
if s is None or isinstance(s, str):
return s
return s.encode('utf-8')
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
def is_printable_ascii(s):
return PRINTABLE_ASCII.search(s) is not None
def unicode_to_output(s):
"""
Encode an unicode object for representation on stdout or stderr.
"""
precondition(isinstance(s, unicode), s)
try:
out = s.encode(io_encoding)
except (UnicodeEncodeError, UnicodeDecodeError):
raise UnicodeEncodeError(io_encoding, s, 0, 0,
"A string could not be encoded as %s for output to the terminal:\n%r" %
(io_encoding, repr(s)))
if PRINTABLE_8BIT.search(out) is None:
raise UnicodeEncodeError(io_encoding, s, 0, 0,
"A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
(io_encoding, repr(s)))
return out
def _unicode_escape(m):
u = m.group(0)
if u == '"' or u == '$' or u == '`' or u == '\\':
return u'\\' + u
if len(u) == 2:
codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
else:
codepoint = ord(u)
if codepoint > 0xFFFF:
return u'\\U%08x' % (codepoint,)
elif codepoint > 0xFF:
return u'\\u%04x' % (codepoint,)
else:
return u'\\x%02x' % (codepoint,)
def _str_escape(m):
c = m.group(0)
if c == '"' or c == '$' or c == '`' or c == '\\':
return '\\' + c
else:
return '\\x%02x' % (ord(c),)
MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
# if we must double-quote, then we have to escape ", $ and `, but need not escape '
ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs
ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
re.DOTALL)
ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
def quote_output(s, quotemarks=True, encoding=None):
"""
Encode either a Unicode string or a UTF-8-encoded bytestring for representation
on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
control bytes in the output.
Quoting may use either single or double quotes. Within single quotes, all
characters stand for themselves, and ' will not appear. Within double quotes,
Python-compatible backslash escaping is used.
"""
precondition(isinstance(s, (str, unicode)), s)
if isinstance(s, str):
try:
s = s.decode('utf-8')
except UnicodeDecodeError:
return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
if MUST_DOUBLE_QUOTE.search(s) is None:
try:
out = s.encode(encoding or io_encoding)
if quotemarks or out.startswith('"'):
return "'%s'" % (out,)
else:
return out
except (UnicodeDecodeError, UnicodeEncodeError):
pass
escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
def quote_path(path, quotemarks=True):
return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
def unicode_platform():
"""
Does the current platform handle Unicode filenames natively?
"""
return is_unicode_platform
class FilenameEncodingError(Exception):
"""
Filename cannot be encoded using the current encoding of your filesystem
(%s). Please configure your locale correctly or rename this file.
"""
pass
def listdir_unicode_fallback(path):
"""
This function emulates a fallback Unicode API similar to one available
under Windows or MacOS X.
If badly encoded filenames are encountered, an exception is raised.
"""
precondition(isinstance(path, unicode), path)
try:
byte_path = path.encode(filesystem_encoding)
except (UnicodeEncodeError, UnicodeDecodeError):
raise FilenameEncodingError(path)
try:
return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)]
except UnicodeDecodeError:
raise FilenameEncodingError(fn)
def listdir_unicode(path):
"""
Wrapper around listdir() which provides safe access to the convenient
Unicode API even under platforms that don't provide one natively.
"""
precondition(isinstance(path, unicode), path)
# On Windows and MacOS X, the Unicode API is used
# On other platforms (ie. Unix systems), the byte-level API is used
if is_unicode_platform:
return os.listdir(path)
else:
return listdir_unicode_fallback(path)
|