/usr/lib/python2.7/dist-packages/dicom/charset.py is in python-dicom 0.9.9-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# charset.py
"""Handle alternate character sets for character strings."""
#
# Copyright (c) 2008-2012 Darcy Mason
# This file is part of pydicom, released under a modified MIT license.
# See the file license.txt included with this distribution, also
# available at http://pydicom.googlecode.com
#
import logging
logger = logging.getLogger('pydicom')
from dicom.valuerep import PersonNameUnicode, text_VRs
from dicom import in_py3
# Map DICOM Specific Character Set to python equivalent
python_encoding = {
    '': 'iso8859',                  # default character set for DICOM
    'ISO_IR 6': 'iso8859',          # alias for latin_1 too
    'ISO_IR 100': 'latin_1',
    'ISO 2022 IR 87': 'iso2022_jp',
    'ISO 2022 IR 13': 'shift_jis',
    'ISO 2022 IR 149': 'euc_kr',    # needs cleanup via clean_escseq()
    'ISO_IR 192': 'UTF8',           # from Chinese example, 2008 PS3.5 Annex J p1-4
    'GB18030': 'GB18030',
    'ISO_IR 126': 'iso_ir_126',     # Greek
    'ISO_IR 127': 'iso_ir_127',     # Arab
    'ISO_IR 138': 'iso_ir_138',     # Hebrew
    'ISO_IR 144': 'iso_ir_144',     # Russian
}

# Same codec name as the '' (blank character set) entry above.
default_encoding = "iso8859"
def clean_escseq(element, encodings):
    """Remove escape sequences that Python does not remove from
    Korean encoding ISO 2022 IR 149 due to the G1 code element.

    element -- the already decoded string to clean
    encodings -- collection of Python codec names in use for the element

    Returns element unchanged unless 'euc_kr' is one of the encodings;
    in that case every occurrence of ESC $ ) C and ESC ( B is stripped.
    """
    if 'euc_kr' not in encodings:
        return element
    # ESC $ ) C
    without_g1 = element.replace("\x1b\x24\x29\x43", "")
    # ESC ( B
    return without_g1.replace("\x1b\x28\x42", "")
# DICOM PS3.5-2008 6.1.1 (p 18) says:
# default is ISO-IR 6 G0, equiv to common chr set of ISO 8859 (PS3.5 6.1.2.1)
# (0008,0005) value 1 can *replace* the default encoding...
# for VRs of SH, LO, ST, LT, PN and UT (PS3.5 6.1.2.3)...
# with a single-byte character encoding
# if (0008,0005) is multi-valued, then value 1 (or default if blank)...
# is used until code extension escape sequence is hit,
# which can be at start of string, or after CR/LF, FF, or
# in Person Name PN, after ^ or =
# NOTE also that 7.5.3 SEQUENCE INHERITANCE states that if (0008,0005)
# is not present in a sequence item then it is inherited from its parent.
def convert_encodings(encodings):
    """Convert DICOM character set names into Python codec names.

    encodings -- the value of Specific Character Set (0008,0005): either a
                 single string or a sequence of strings.  A blank first
                 value, an empty sequence or None all stand for the DICOM
                 default, 'ISO_IR 6'.

    Returns a new list; the argument is never modified.  Names found in
    python_encoding are translated; any other name is passed through
    unchanged on the assumption that it already is a Python codec name.
    A one-entry result is repeated to give three entries, and a two-entry
    result has its second entry repeated to give three.
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring

    if isinstance(encodings, string_types):
        encodings = [encodings]
    else:
        # If a sequence is passed, copy it so that it is not modified in
        # place; an empty sequence or None becomes one blank (default) entry.
        encodings = list(encodings) if encodings else ['']

    if not encodings[0]:
        encodings[0] = 'ISO_IR 6'

    # Translate entry by entry so that one unrecognised name does not stop
    # the recognised DICOM names next to it from being converted.
    encodings = [python_encoding.get(name, name) for name in encodings]

    if len(encodings) == 1:
        encodings = [encodings[0]] * 3
    elif len(encodings) == 2:
        encodings.append(encodings[1])
    return encodings
def decode(data_element, dicom_character_set):
    """Apply the DICOM character encoding to the data element

    data_element -- DataElement instance containing a value to convert
    dicom_character_set -- the value of Specific Character Set (0008,0005),
                    which may be a single value,
                    a multiple value (code extension), or
                    may also be '' or None.
                    If blank or None, ISO_IR 6 is used.

    The element is modified in place (data_element.value is replaced by
    the decoded value); nothing is returned.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: the name `unicode` does not exist

    if not dicom_character_set:
        dicom_character_set = ['ISO_IR 6']
    encodings = convert_encodings(dicom_character_set)

    # decode the string value to unicode
    # PN is special case as may have 3 components with different chr sets,
    # so it is handed the whole list of encodings.
    if data_element.VR == "PN":
        if in_py3:
            # On Python 3 the value decodes itself via its own decode().
            if data_element.VM == 1:
                data_element.value = data_element.value.decode(encodings)
            else:
                data_element.value = [val.decode(encodings)
                                      for val in data_element.value]
        else:
            if data_element.VM == 1:
                data_element.value = PersonNameUnicode(data_element.value,
                                                       encodings)
            else:
                data_element.value = [PersonNameUnicode(value, encodings)
                                      for value in data_element.value]

    if data_element.VR in text_VRs:
        # Remove the first encoding if this is a multi-byte encoding;
        # text values are then decoded with what is left in encodings[0].
        if len(encodings) > 1:
            del encodings[0]

        # You can't re-decode unicode (string literals in py3), so values
        # that are already text are kept exactly as they are.
        if data_element.VM == 1:
            if isinstance(data_element.value, text_type):
                return
            data_element.value = clean_escseq(
                data_element.value.decode(encodings[0]), encodings)
        else:
            data_element.value = [
                value if isinstance(value, text_type)
                else clean_escseq(value.decode(encodings[0]), encodings)
                for value in data_element.value
            ]
|