/usr/share/pyshared/translate/lang/common.py is in translate-toolkit 1.10.0-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2007-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
"""This module contains all the common features for languages.

Supported features:

- language code (km, af)
- language name (Khmer, Afrikaans)
- Plurals

  - Number of plurals (nplurals)
  - Plural equation

- pofilter tests to ignore

Segmentation:

- characters
- words
- sentences

Punctuation:

- End of sentence
- Start of sentence
- Middle of sentence
- Quotes

  - single
  - double

- Valid characters
- Accelerator characters
- Special characters
- Direction (rtl or ltr)

TODOs and Ideas for possible features:

- Language-Team information
- Segmentation

  - phrases
"""
import re
from translate.lang import data
class Common(object):
    """This class is the common parent class for all language classes.

    Instances are cached per language code (see :meth:`__new__`), so asking
    for the same code twice returns the same object.  Most of the text
    helpers are classmethods and only read class-level attributes, which
    language subclasses are expected to override where needed.
    """

    code = ""
    """The ISO 639 language code, possibly with a country specifier or other
    modifier.

    Examples::

        km
        pt_BR
        sr_YU@Latn
    """

    fullname = ""
    """The full (English) name of this language.

    Dialect codes should have the form of:

    - Khmer
    - Portuguese (Brazil)
    - TODO: sr_YU@Latn?
    """

    nplurals = 0
    """The number of plural forms of this language.

    0 is not a valid value - it must be overridden.
    Any positive integer is valid (it should probably be between 1 and 6)

    .. seealso:: :mod:`translate.lang.data`
    """

    pluralequation = "0"
    """The plural equation for selection of plural forms.

    This is used for PO files to fill into the header.

    .. seealso::

       `Gettext manual <http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html#Plural-forms>`_, :mod:`translate.lang.data`
    """
    # Don't change these defaults of nplurals or pluralequation willy-nilly:
    # some code probably depends on these for unrecognised languages

    listseperator = u", "
    """This string is used to separate lists of textual elements. Most
    languages probably can stick with the default comma, but Arabic and some
    Asian languages might want to override this."""

    commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
    """These punctuation marks are common in English and most languages that
    use latin script."""

    quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
    """These are different quotation marks used by various languages."""

    invertedpunc = u"¿¡"
    """Inverted punctuation sometimes used at the beginning of sentences in
    Spanish, Asturian, Galician, and Catalan."""

    rtlpunc = u"،؟؛÷"
    """These punctuation marks are used by Arabic and Persian, for example."""

    CJKpunc = u"。、,;!?「」『』【】"
    """These punctuation marks are used in certain circumstances with CJK
    languages."""

    indicpunc = u"।॥॰"
    """These punctuation marks are used by several Indic languages."""

    ethiopicpunc = u"።፤፣"
    """These punctuation marks are used by several Ethiopic languages."""

    miscpunc = u"…±°¹²³·©®×£¥€"
    """The middle dot (·) is used by Greek and Georgian."""

    punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,
                            indicpunc, ethiopicpunc, miscpunc])
    """We include many types of punctuation here, simply since this is only
    meant to determine if something is punctuation. Hopefully we catch some
    languages which might not be represented with modules. Most languages won't
    need to override this."""

    sentenceend = u".!?…։؟।。!?።\u06d4"
    """These marks can indicate a sentence end. Once again we try to account
    for many languages. Most languages won't need to override this."""

    # The following tries to account for a lot of things. For the best idea of
    # what works, see test_common.py. We try to ignore abbreviations, for
    # example, by checking that the following sentence doesn't start with lower
    # case or numbers.
    #
    # The pattern is a plain (non-raw) unicode literal with escaped
    # backslashes because the ``ur`` prefix is a syntax error on Python 3;
    # re.DOTALL replaces the former inline ``(?s)`` so that ``.`` also matches
    # newlines.
    sentencere = re.compile(
        u"""
        .*?                 # anything, but match non-greedy
        [%s]                # the punctuation for sentence ending
        \\s+                # the spacing after the punctuation
        (?=[^a-zа-џ\\d])    # lookahead that next part starts with caps
        """ % sentenceend,
        re.VERBOSE | re.UNICODE | re.DOTALL
    )

    puncdict = {}
    """A dictionary of punctuation transformation rules that can be used by
    punctranslate()."""

    ignoretests = []
    """List of pofilter tests for this language that must be ignored."""

    checker = None
    """A language specific checker (see filters.checks).

    This doesn't need to be supplied, but will be used if it exists."""

    # Cache of already constructed language objects, keyed by language code.
    _languages = {}

    validaccel = None
    """Characters that can be used as accelerators (access keys) i.e. Alt+X
    where X is the accelerator. These can include combining diacritics as
    long as they are accessible from the users keyboard in a single keystroke,
    but normally they would be at least precomposed characters. All characters,
    lower and upper, are included in the list."""

    validdoublewords = []
    """Some languages allow double words in certain cases. This is a list
    of such words."""

    def __new__(cls, code):
        """This returns the language class for the given code, following a
        singleton like approach (only one object per language).

        :param code: language code; a false value is treated as ``""``.
        :return: the cached or newly created language object.
        """
        code = code or ""
        # First see if a language object for this code already exists
        if code in cls._languages:
            return cls._languages[code]
        # No existing language. Let's build a new one and keep a copy
        language = cls._languages[code] = object.__new__(cls)

        language.code = code
        # Try the full code first and then progressively simpler codes until
        # the data module knows one of them.  If none is known the object
        # simply keeps the class defaults.
        while code:
            langdata = data.languages.get(code, None)
            if langdata:
                language.fullname, language.nplurals, \
                    language.pluralequation = langdata
                break
            code = data.simplercode(code)
        return language

    def __deepcopy__(self, memo=None):
        """Language objects are singletons: a deep copy is the object itself.

        :param memo: the memo dictionary supplied by :func:`copy.deepcopy`;
            when given, this object is registered in it under its id.
        """
        # No mutable default argument: a shared default dict would be
        # modified by every call made without an explicit memo.
        if memo is not None:
            memo[id(self)] = self
        return self

    def __repr__(self):
        """Give a simple string representation without address information to
        be able to store it in text for comparison later."""
        detail = ""
        if self.code:
            detail = "(%s)" % self.code
        return "<class 'translate.lang.common.Common%s'>" % detail

    @classmethod
    def punctranslate(cls, text):
        """Converts the punctuation in a string according to the rules of the
        language.

        :param text: the string to convert; false values are returned as is.
        :return: text with every key of ``puncdict`` replaced by its value.
        """
        # TODO: look at po::escapeforpo() for performance idea
        if not text:
            return text
        # A trailing ellipsis is set aside so that rules for single full
        # stops do not mangle it; it is put back (translated) afterwards.
        ellipses_end = text.endswith(u"...")
        if ellipses_end:
            text = text[:-3]
        for source, target in cls.puncdict.items():
            text = text.replace(source, target)
        if ellipses_end:
            text += cls.puncdict.get(u"...", u"...")
        # The replacements may have removed everything; there is then no
        # final character left to inspect.
        if not text:
            return text
        # Let's account for cases where a punctuation symbol plus a space is
        # replaced, but the space won't exist at the end of the source message.
        # As a simple improvement for messages ending in ellipses (...), we
        # test that the last character is different from the second last
        # This is only relevant if the string has two characters or more
        last_with_space = text[-1] + u" "
        if (last_with_space in cls.puncdict and
                (len(text) < 2 or text[-2] != text[-1])):
            text = text[:-1] + cls.puncdict[last_with_space].rstrip()
        return text

    @classmethod
    def length_difference(cls, length):
        """Returns an estimate to a likely change in length relative to an
        English string of length length.

        :param length: length of the English source string.
        :return: the estimated number of additional characters.
        """
        # This is just a rudimentary heuristic guessing that most translations
        # will be somewhat longer than the source language
        expansion_factor = 0
        code = cls.code
        while code:
            # Look up the progressively simplified code, not cls.code, so
            # that e.g. a regional variant can fall back to its base language.
            expansion_factor = data.expansion_factors.get(code, 0)
            if expansion_factor:
                break
            code = data.simplercode(code)
        else:
            # No code (or no simplification of it) has a known factor.
            expansion_factor = 0.1  # default
        constant = max(5, int(40 * expansion_factor))
        # The default: return 5 + length/10
        return constant + int(expansion_factor * length)

    @classmethod
    def alter_length(cls, text):
        """Converts the given string by adding or removing characters as an
        estimation of translation length (with English assumed as source
        language).

        Paragraphs (separated by a blank line) are altered independently and
        paragraphs of nine characters or fewer are left untouched.
        """

        def alter_it(paragraph):
            size = len(paragraph)
            if size > 9:
                extra = cls.length_difference(size)
                if extra > 0:
                    # Lengthen by repeating the start of the paragraph.
                    paragraph = (paragraph[:extra].replace(u'\n', u'') +
                                 paragraph)
                else:
                    # Shorten by dropping characters from the start.
                    paragraph = paragraph[-extra:]
            return paragraph

        return u"\n\n".join([alter_it(subtext)
                             for subtext in text.split(u"\n\n")])

    @classmethod
    def character_iter(cls, text):
        """Returns an iterator over the characters in text.

        Punctuation is skipped and a run of whitespace yields only its first
        character.
        """
        # We don't return more than one consecutive whitespace character
        prev = 'A'
        for c in text:
            if c.isspace() and prev.isspace():
                continue
            prev = c
            if c not in cls.punctuation:
                yield c

    @classmethod
    def characters(cls, text):
        """Returns a list of characters in text."""
        return list(cls.character_iter(text))

    @classmethod
    def word_iter(cls, text):
        """Returns an iterator over the words in text.

        Words are whitespace separated and have leading and trailing
        punctuation removed; tokens that are only punctuation are dropped.
        """
        # TODO: Consider replacing punctuation with space before split()
        for w in text.split():
            word = w.strip(cls.punctuation)
            if word:
                yield word

    @classmethod
    def words(cls, text):
        """Returns a list of words in text."""
        return list(cls.word_iter(text))

    @classmethod
    def sentence_iter(cls, text, strip=True):
        """Returns an iterator over the sentences in text.

        :param text: the text to segment; false values yield nothing.
        :param strip: whether to strip surrounding whitespace from each
            sentence (sentences that become empty are not yielded).
        """
        lastmatch = 0
        text = text or ""
        for item in cls.sentencere.finditer(text):
            lastmatch = item.end()
            sentence = item.group()
            if strip:
                sentence = sentence.strip()
            if sentence:
                yield sentence
        # Whatever follows the last recognised sentence end is a sentence too.
        remainder = text[lastmatch:]
        if strip:
            remainder = remainder.strip()
        if remainder:
            yield remainder

    @classmethod
    def sentences(cls, text, strip=True):
        """Returns a list of sentences in text."""
        return list(cls.sentence_iter(text, strip=strip))

    @classmethod
    def capsstart(cls, text):
        """Determines whether the text starts with a capital letter.

        Leading whitespace and punctuation are ignored.  The result is only
        meant to be used for its truth value: if nothing is left after
        stripping, the (false) empty string is returned.
        """
        stripped = text.lstrip().lstrip(cls.punctuation)
        return stripped and stripped[0].isupper()

    @classmethod
    def numstart(cls, text):
        """Determines whether the text starts with a numeric value.

        Leading whitespace and punctuation are ignored.  The result is only
        meant to be used for its truth value: if nothing is left after
        stripping, the (false) empty string is returned.
        """
        stripped = text.lstrip().lstrip(cls.punctuation)
        return stripped and stripped[0].isnumeric()
|