This file is indexed.

/usr/share/pyshared/translate/lang/common.py is in translate-toolkit 1.10.0-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2007-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""This module contains all the common features for languages.

Supported features:

- language code (km, af)
- language name (Khmer, Afrikaans)
- Plurals

  - Number of plurals (nplurals)
  - Plural equation

- pofilter tests to ignore

Segmentation:

- characters
- words
- sentences

Punctuation:

- End of sentence
- Start of sentence
- Middle of sentence
- Quotes

  - single
  - double

- Valid characters
- Accelerator characters
- Special characters
- Direction (rtl or ltr)

TODOs and Ideas for possible features:

- Language-Team information
- Segmentation

  - phrases
"""

import re

from translate.lang import data


class Common(object):
    """This class is the common parent class for all language classes.

    Instances are created through :meth:`__new__`, which keeps one object per
    language code in :attr:`_languages`. All text helpers are classmethods and
    therefore read the class level attributes (``punctuation``, ``puncdict``,
    ``sentencere``, ...), which subclasses can override.
    """

    code = ""
    """The ISO 639 language code, possibly with a country specifier or other
    modifier.

    Examples::

        km
        pt_BR
        sr_YU@Latn
    """

    fullname = ""
    """The full (English) name of this language.

    Dialect codes should have the form of:

    - Khmer
    - Portuguese (Brazil)
    - TODO: sr_YU@Latn?
    """

    nplurals = 0
    """The number of plural forms of this language.

    0 is not a valid value - it must be overridden.
    Any positive integer is valid (it should probably be between 1 and 6)

    .. seealso:: :mod:`translate.lang.data`
    """

    pluralequation = "0"
    """The plural equation for selection of plural forms.

    This is used for PO files to fill into the header.

    .. seealso::

       `Gettext manual <http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html#Plural-forms>`_, :mod:`translate.lang.data`
    """
    # Don't change these defaults of nplurals or pluralequation willy-nilly:
    # some code probably depends on these for unrecognised languages

    listseperator = u", "
    """This string is used to separate lists of textual elements. Most
    languages probably can stick with the default comma, but Arabic and some
    Asian languages might want to override this."""

    commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
    """These punctuation marks are common in English and most languages that
    use Latin script."""

    quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
    """These are different quotation marks used by various languages."""

    invertedpunc = u"¿¡"
    """Inverted punctuation sometimes used at the beginning of sentences in
    Spanish, Asturian, Galician, and Catalan."""

    rtlpunc = u"،؟؛÷"
    """These punctuation marks are used by Arabic and Persian, for example."""

    CJKpunc = u"。、,;!?「」『』【】"
    """These punctuation marks are used in certain circumstances with CJK
    languages."""

    indicpunc = u"।॥॰"
    """These punctuation marks are used by several Indic languages."""

    ethiopicpunc = u"።፤፣"
    """These punctuation marks are used by several Ethiopic languages."""

    miscpunc = u"…±°¹²³·©®×£¥€"
    """The middle dot (·) is used by Greek and Georgian."""

    # NOTE: this must stay a plain list display. A comprehension or generator
    # would get its own scope and could not see the class level names above.
    punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,
                            indicpunc, ethiopicpunc, miscpunc])
    """We include many types of punctuation here, simply since this is only
    meant to determine if something is punctuation. Hopefully we catch some
    languages which might not be represented with modules. Most languages won't
    need to override this."""

    sentenceend = u".!?…։؟।。!?።\u06d4"
    """These marks can indicate a sentence end. Once again we try to account
    for many languages. Most languages won't need to override this."""

    # The following tries to account for a lot of things. For the best idea of
    # what works, see test_common.py. We try to ignore abbreviations, for
    # example, by checking that the following sentence doesn't start with lower
    # case or numbers.
    #
    # The pattern is a normal unicode literal with escaped backslashes rather
    # than a ``ur"..."`` literal, because the ``ur`` prefix is a syntax error
    # on Python 3. re.DOTALL makes "." also match newlines; it is passed as a
    # flag instead of an inline "(?s)" so that nothing depends on where an
    # inline global flag is allowed to appear.
    sentencere = re.compile(
        u"""
        .*?         # anything, but match non-greedy
        [%s]        # the puntuation for sentence ending
        \\s+         # the spacing after the puntuation
        (?=[^a-zа-џ\\d])  # lookahead that next part starts with caps
        """ % sentenceend,
        re.VERBOSE | re.UNICODE | re.DOTALL
    )

    puncdict = {}
    """A dictionary of punctuation transformation rules that can be used by
    punctranslate()."""

    ignoretests = []
    """List of pofilter tests for this language that must be ignored."""

    checker = None
    """A language specific checker (see filters.checks).

    This doesn't need to be supplied, but will be used if it exists."""

    _languages = {}

    validaccel = None
    """Characters that can be used as accelerators (access keys) i.e. Alt+X
    where X is the accelerator.  These can include combining diacritics as
    long as they are accessible from the users keyboard in a single keystroke,
    but normally they would be at least precomposed characters. All characters,
    lower and upper, are included in the list."""

    validdoublewords = []
    """Some languages allow double words in certain cases.  This is a list
    of such words."""

    def __new__(cls, code):
        """Return the language object for the given code, following a
        singleton like approach (only one object per language code).

        :param code: the language code; a false value is treated as ``""``.
        :return: the object cached in ``cls._languages`` for this code, or a
                 newly created (and now cached) one.

        For a new object, ``fullname``, ``nplurals`` and ``pluralequation``
        are filled in from ``data.languages``. If the exact code is unknown,
        progressively simpler codes obtained from ``data.simplercode()`` are
        tried; if none is known the class defaults stay in place.
        """
        code = code or ""
        # First see if a language object for this code already exists
        if code in cls._languages:
            return cls._languages[code]
        # No existing language. Let's build a new one and keep a copy
        language = cls._languages[code] = object.__new__(cls)

        # The object always keeps the code it was asked for, even when the
        # information below comes from a simpler variant of that code.
        language.code = code
        # NOTE(review): relies on data.simplercode() eventually returning a
        # false value - confirm against translate.lang.data.
        while code:
            langdata = data.languages.get(code, None)
            if langdata:
                language.fullname, language.nplurals, \
                    language.pluralequation = langdata
                break
            code = data.simplercode(code)
        return language

    def __deepcopy__(self, memo=None):
        """Return ``self`` instead of a copy, so that the one object per
        language code created by :meth:`__new__` is never duplicated.

        :param memo: optional memo dictionary (as passed by
                     :func:`copy.deepcopy`); when given, this object is
                     recorded in it under ``id(self)``.
        """
        # The default is None rather than {}: a dictionary default would be
        # shared between calls and written to on every one of them.
        if memo is not None:
            memo[id(self)] = self
        return self

    def __repr__(self):
        """Give a simple string representation without address information to
        be able to store it in text for comparison later."""
        detail = ""
        if self.code:
            detail = "(%s)" % self.code
        return "<class 'translate.lang.common.Common%s'>" % detail

    @classmethod
    def punctranslate(cls, text):
        """Converts the punctuation in a string according to the rules of the
        language.

        :param text: the string to convert; a false value is returned as is.
        :return: the text with every key of ``cls.puncdict`` replaced by its
                 value.
        """
#        TODO: look at po::escapeforpo() for performance idea
        if not text:
            return text
        # A trailing ellipsis is set aside so that rules for a single full
        # stop don't touch it; it is put back (translated if there is a rule
        # for u"...") after the replacements.
        ellipses_end = text.endswith(u"...")
        if ellipses_end:
            text = text[:-3]
        for source, target in cls.puncdict.items():
            text = text.replace(source, target)
        if ellipses_end:
            text += cls.puncdict.get(u"...", u"...")
        # The rules may have removed everything (punctuation mapped to an
        # empty string), in which case there is no last character to inspect.
        if not text:
            return text
        # Let's account for cases where a punctuation symbol plus a space is
        # replaced, but the space won't exist at the end of the source message.
        # As a simple improvement for messages ending in ellipses (...), we
        # test that the last character is different from the second last
        # This is only relevant if the string has two characters or more
        last = text[-1]
        if ((last + u" " in cls.puncdict) and
            (len(text) < 2 or text[-2] != last)):
            text = text[:-1] + cls.puncdict[last + u" "].rstrip()
        return text

    @classmethod
    def length_difference(cls, length):
        """Returns an estimate to a likely change in length relative to an
        English string of length length.

        :param length: the length of the English string.
        :return: an integer: a constant part plus a part proportional to
                 ``length``.
        """
        # This is just a rudimentary heuristic guessing that most translations
        # will be somewhat longer than the source language
        expansion_factor = 0
        code = cls.code
        while code:
            # Look up the progressively simplified code, not cls.code:
            # otherwise the same key is tried on every pass and a factor
            # registered for a simpler code is never found.
            expansion_factor = data.expansion_factors.get(code, 0)
            if expansion_factor:
                break
            code = data.simplercode(code)
        else:
            # Only reached when no code (or none at all) had a factor
            expansion_factor = 0.1  # default
        constant = max(5, int(40 * expansion_factor))
        # The default: return 5 + length/10
        return constant + int(expansion_factor * length)

    @classmethod
    def alter_length(cls, text):
        """Converts the given string by adding or removing characters as an
        estimation of translation length (with English assumed as source
        language).

        Parts separated by a blank line (``\\n\\n``) are altered one by one;
        parts of 9 characters or fewer are left untouched.
        """

        def alter_it(text):
            length = len(text)
            if length > 9:
                extra = cls.length_difference(length)
                if extra > 0:
                    # Grow: repeat the start of the text (without newlines)
                    text = text[:extra].replace(u'\n', u'') + text
                else:
                    # Shrink: drop characters from the start
                    text = text[-extra:]
            return text

        parts = [alter_it(subtext) for subtext in text.split(u"\n\n")]
        return u"\n\n".join(parts)

    @classmethod
    def character_iter(cls, text):
        """Returns an iterator over the characters in text.

        Characters found in ``cls.punctuation`` are skipped, and a run of
        whitespace yields only its first character.
        """
        # We don't return more than one consecutive whitespace character
        prev = 'A'
        for c in text:
            if c.isspace() and prev.isspace():
                continue
            prev = c
            if c not in cls.punctuation:
                yield c

    @classmethod
    def characters(cls, text):
        """Returns a list of characters in text."""
        return list(cls.character_iter(text))

    @classmethod
    def word_iter(cls, text):
        """Returns an iterator over the words in text.

        The text is split on whitespace and ``cls.punctuation`` is stripped
        from both ends of each piece; pieces that end up empty are skipped.
        """
        #TODO: Consider replacing puctuation with space before split()
        for w in text.split():
            word = w.strip(cls.punctuation)
            if word:
                yield word

    @classmethod
    def words(cls, text):
        """Returns a list of words in text."""
        return list(cls.word_iter(text))

    @classmethod
    def sentence_iter(cls, text, strip=True):
        """Returns an iterator over the sentences in text.

        :param text: the text to segment; a false value is treated as ``""``.
        :param strip: whether surrounding whitespace is stripped from each
                      sentence.

        Sentences are the successive matches of ``cls.sentencere``; whatever
        follows the last match is yielded as the final sentence. Empty
        sentences are not yielded.
        """
        lastmatch = 0
        text = text or ""
        for item in cls.sentencere.finditer(text):
            lastmatch = item.end()
            sentence = item.group()
            if strip:
                sentence = sentence.strip()
            if sentence:
                yield sentence
        # The regex requires whitespace after the closing punctuation, so the
        # last sentence of a text never matches and is picked up here.
        remainder = text[lastmatch:]
        if strip:
            remainder = remainder.strip()
        if remainder:
            yield remainder

    @classmethod
    def sentences(cls, text, strip=True):
        """Returns a list of sentences in text."""
        return list(cls.sentence_iter(text, strip=strip))

    @classmethod
    def capsstart(cls, text):
        """Determines whether the text starts with a capital letter.

        Leading whitespace and then leading punctuation are ignored. If
        nothing remains, the (false) empty string is returned rather than
        ``False``.
        """
        stripped = text.lstrip().lstrip(cls.punctuation)
        return stripped and stripped[0].isupper()

    @classmethod
    def numstart(cls, text):
        """Determines whether the text starts with a numeric value.

        Leading whitespace and then leading punctuation are ignored. If
        nothing remains, the (false) empty string is returned rather than
        ``False``.
        """
        stripped = text.lstrip().lstrip(cls.punctuation)
        return stripped and stripped[0].isnumeric()