/usr/share/pyshared/wader/common/encoding.py is in python-wader 0.5.12-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 | # -*- coding: utf-8 -*-
# Copyright (C) 2006-2011 Vodafone España, S.A.
# Copyright (C) 2008-2009 Warp Networks, S.L.
# Author: Pablo Martí
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Helper methods for dealing with encoded strings"""
import messaging.sms.gsm0338 # imports GSM7 codec
CONTROL_0, CONTROL_1, LATIN_EX_A, LATIN_EX_B = range(4)
ZIP_BASE64_MAGIC = '{ZIP+BASE64}'
def pack_ucs2_bytes(s):
"""
Converts string ``s`` to UCS2
:rtype: str
"""
return to_u(s).encode('utf_16_be').encode('hex').upper()
def unpack_ucs2_bytes(s):
"""
Unpacks string ``s`` from UCS2
:rtype: unicode
"""
return s.decode('hex').decode('utf_16_be', 'ignore')
def unpack_ucs2_bytes_in_ts31101_80(s):
"""
Returns a string from ``s`` which is encoded in TS 31.101 (Annex A) type 80
Check out the function comments
"""
# Below is the detail from there, but we expect the first two hex
# chars(80) to have been removed already.
# If the first byte in the alpha string is '80', then the remaining
# bytes are 16 bit UCS2 characters, with the more significant byte (MSB)
# of the UCS2 character coded in the lower numbered byte of the alpha
# field, and the less significant byte (LSB) of the UCS2 character is
# coded in the higher numbered alpha field byte, i.e. byte 2 of the
# alpha field contains the more significant byte (MSB) of the first UCS2
# character, and byte 3 of the alpha field contains the less significant
# byte (LSB) of the first UCS2 character (as shown below). Unused bytes
# shall be set to 'FF', and if the alpha field is an even number of bytes
# in length, then the last (unusable) byte shall be set to 'FF'.
#
# example string '058300440586FF'
vl = len(s) - len(s) % 4
vs = s[:vl]
try:
t = unpack_ucs2_bytes(vs)
except:
t = vs # show the invalid unicode
return t
def unpack_ucs2_bytes_in_ts31101_81(s):
"""
Returns a string from ``s`` which is encoded in TS 31.101 (Annex A) type 81
Check out the function comments
"""
# Below is the detail from there, but we expect the first two hex
# chars(81) to have been removed already.
# If the first byte of the alpha string is set to '81', then the second
# byte contains a value indicating the number of characters in the string,
# and the third byte contains an 8 bit number which defines bits 15 to 8
# of a 16 bit base pointer, where bit 16 is set to zero, and bits 7 to 1
# are also set to zero. These sixteen bits constitute a base pointer
# to a "half-page" in the UCS2 code space, to be used with some or all of
# the remaining bytes in the string. The fourth and subsequent bytes in
# the string contain codings as follows; if bit 8 of the byte is set to
# zero, the remaining 7 bits of the byte contain a GSM Default Alphabet
# character, whereas if bit 8 of the byte is set to one, then the
# remaining seven bits are an offset value added to the 16 bit base
# pointer defined earlier, and the resultant 16 bit value is a UCS2 code
# point, and completely defines a UCS2 character.
#
# example string '0602A46563746F72FF'
num = ord(s[:2].decode('hex'))
base = (ord(s[2:4].decode('hex')) & 0x7f) << 7 # bits 15..8
chars = s[4:4 + num * 2]
t = ''
for i in range(num):
j = i * 2
c_hex = chars[j:j + 2]
c_chr = c_hex.decode('hex')
c_ord = ord(c_chr)
if c_ord & 0x80 == 0:
t += c_chr.decode('gsm0338', 'replace')
else:
t += unichr(base + (c_ord & 0x7f))
return t
def unpack_ucs2_bytes_in_ts31101_82(s):
"""
Returns a string from ``s`` which is encoded in TS 31.101 (Annex A) type 82
Check out the function comments
"""
# Below is the detail from there, but we expect the first two hex
# chars(82) to have been removed already.
# If the first byte of the alpha string is set to '82', then the
# second byte contains a value indicating the number of characters
# in the string, and the third and fourth bytes contain a 16 bit number
# which defines the complete 16 bit base pointer to a "half-page" in the
# UCS2 code space, for use with some or all of the remaining bytes in the
# string. The fifth and subsequent bytes in the string contain codings as
# follows; if bit 8 of the byte is set to zero, the remaining 7 bits of
# the byte contain a GSM Default Alphabet character, whereas if bit 8 of
# the byte is set to one, the remaining seven bits are an offset value
# added to the base pointer defined in bytes three and four, and the
# resultant 16 bit value is a UCS2 code point, and defines a UCS2 character
#
# example string '0505302D82D32D31'
num = ord(s[:2].decode('hex'))
base = ord(s[2:4].decode('hex')) << 8 # bits 16..9
base += ord(s[4:6].decode('hex')) # bits 8..1
chars = s[6:6 + num * 2]
t = ''
for i in range(num):
j = i * 2
c_hex = chars[j:j + 2]
c_chr = c_hex.decode('hex')
c_ord = ord(c_chr)
if c_ord & 0x80 == 0:
t += c_chr.decode('gsm0338', 'replace')
else:
t += unichr(base + (c_ord & 0x7f))
return t
def from_8bit_in_gsm(s):
"""
Converts a string from ``s`` from GSM7 encoding using 8 bits.
"""
t = s.decode('hex')
index = t.find('\xff')
if index != -1:
t = t[:index]
return t.decode('gsm0338', 'replace')
def from_8bit_in_gsm_or_ts31101(s):
"""
Returns a string from ``s`` to GSM7 or ts31101 encodings.
It checks if first byte indicates ts31101_80, 81 or 82,
if not it defaults to GSM7.
"""
encoding = s[:2]
hexbytes = s[2:]
if encoding == '80':
return unpack_ucs2_bytes_in_ts31101_80(hexbytes).rstrip(u'\uffff')
elif encoding == '81':
return unpack_ucs2_bytes_in_ts31101_81(hexbytes)
elif encoding == '82':
return unpack_ucs2_bytes_in_ts31101_82(hexbytes)
else:
# default to GSM7 8bits, zero first bit.
return from_8bit_in_gsm(s)
def check_if_ucs2(text, limit=None):
"""
Test whether ``text`` is a UCS2 encoded string
:rtype: bool
"""
# XXX: Returns false positives
if text and isinstance(text, basestring) and (len(text) % 4 == 0):
try:
unpack_ucs2_bytes(text)
except (UnicodeDecodeError, TypeError):
return False
else:
if limit is None:
return True
s = text
while len(s):
val = int(s[:4], 16)
if limit == LATIN_EX_B and val > 0x024F:
return False
if limit == LATIN_EX_A and val > 0x017F:
return False
if limit == CONTROL_1 and val > 0x00FF:
return False
if limit == CONTROL_0 and val > 0x007F:
return False
s = s[4:]
return True
return False
def from_u(s):
"""
Encodes ``s`` to utf-8 if its not already encoded
:rtype: str
"""
return (s.encode('utf8') if isinstance(s, unicode) else s)
def from_ucs2(s):
"""
Converts ``s`` from UCS2 if not already converted
:rtype: str
"""
return (unpack_ucs2_bytes(s) if check_if_ucs2(s) else s)
def to_u(s):
"""
Converts ``s`` to unicode if not already converted
:rtype: unicode
"""
return (s if isinstance(s, unicode) else unicode(s, 'utf8'))
def pack_dbus_safe_string(s):
"""
Converts string ``s`` to BASE64 encoded ZIP compressed string
:rtype: str
"""
return '%s\n%s' % (ZIP_BASE64_MAGIC, s.encode('zip').encode('base64'))
def unpack_dbus_safe_string(s):
"""
Converts BASE64 encoded ZIP compressed string ``s`` to string
:rtype: str
"""
if not s.startswith(ZIP_BASE64_MAGIC):
raise ValueError
try:
return s[len(ZIP_BASE64_MAGIC):].decode('base64').decode('zip')
except:
raise ValueError
|