This file is indexed.

/usr/share/pyshared/enchant/tokenize/en.py is in python-enchant 1.6.5-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two.  You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers.  If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so.  If you do not wish to
# do so, delete this exception statement from your version.
#
"""

    enchant.tokenize.en:    Tokenizer for the English language
    
    This module implements a PyEnchant text tokenizer for the English
    language, based on very simple rules.

"""

import unicodedata

import enchant.tokenize
from enchant.utils import unicode


class tokenize(enchant.tokenize.tokenize):
    """Iterator splitting text into words, reporting position.
    
    This iterator takes a text string as input, and yields tuples
    representing each distinct word found in the text.  The tuples
    take the form:
        
        (<word>,<pos>)
        
    Where <word> is the word string found and <pos> is the position
    of the start of the word within the text.
    
    The optional argument <valid_chars> may be used to specify a
    list of additional characters that can form part of a word.
    By default, this list contains only the apostrophe ('). Note that
    these characters cannot appear at the start or end of a word.
    """

    _DOC_ERRORS = ["pos","pos"]
    
    def __init__(self,text,valid_chars=("'",)):
        self._valid_chars = valid_chars
        self._text = text
        self.offset = 0
        # Select proper implementation of self._consume_alpha.
        # 'text' isn't necessarily a string (it could be e.g. a mutable array)
        # so we can't use isinstance(text,unicode) to detect unicode.
        # Instead we typetest the first character of the text.
        # If there are no characters then it doesn't matter which
        # implementation we use, since it won't be called anyway.
        try:
            char1 = text[0]
        except IndexError:
            self._consume_alpha = self._consume_alpha_b
        else:
            if isinstance(char1,unicode):
                self._consume_alpha = self._consume_alpha_u
            else:
                self._consume_alpha = self._consume_alpha_b
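        # Illustrative note (editor's addition, not in the original file):
        # tokenize(u"hello") will select _consume_alpha_u, while a Python 2
        # bytestring such as tokenize("hello"), or a mutable buffer like
        # array.array('c', "hello"), selects _consume_alpha_b.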
    
    def _consume_alpha_b(self,text,offset):
        """Consume an alphabetic character from the given bytestring.

        Given a bytestring and the current offset, this method returns
        the number of bytes occupied by the next alphabetic character
        in the string.  Non-ASCII bytes are interpreted as utf-8 and can
        result in multiple bytes being consumed.
        """
        assert offset < len(text)
        if text[offset].isalpha():
            return 1
        elif text[offset] >= "\x80":
            return self._consume_alpha_utf8(text,offset)
        return 0

    def _consume_alpha_utf8(self,text,offset):
        """Consume a sequence of utf8 bytes forming an alphabetic character."""
        incr = 2
        u = ""
        while not u and incr <= 4:
            try:
                try:
                    #  In the common case this will be a string
                    u = text[offset:offset+incr].decode("utf8")
                except AttributeError:
                    #  Looks like it was e.g. a mutable char array.
                    try:
                        s = text[offset:offset+incr].tostring()
                    except AttributeError:
                        s = "".join([c for c in text[offset:offset+incr]])
                    u = s.decode("utf8")
            except UnicodeDecodeError:
                incr += 1
        if not u:
            return 0
        if u.isalpha():
            return incr
        if unicodedata.category(u)[0] == "M":
            return incr
        return 0
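        # Worked example (editor's addition, not in the original file): for
        # the utf-8 bytestring "caf\xc3\xa9" at offset 3, the incr=2 slice
        # "\xc3\xa9" decodes to u"\xe9" ("e" with acute), which is alphabetic,
        # so 2 is returned.  For a three-byte character such as "\xe4\xb8\xad"
        # (u"\u4e2d"), the incr=2 decode raises UnicodeDecodeError and the
        # incr=3 attempt succeeds, returning 3.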

    def _consume_alpha_u(self,text,offset):
        """Consume an alphabetic character from the given unicode string.

        Given a unicode string and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Trailing combining characters are consumed as a
        single letter.
        """
        assert offset < len(text)
        incr = 0
        if text[offset].isalpha():
            incr = 1
            while offset + incr < len(text):
                if unicodedata.category(text[offset+incr])[0] != "M":
                    break
                incr += 1
        return incr
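        # Worked example (editor's addition, not in the original file): for
        # u"e\u0301" (an "e" followed by a combining acute accent, category
        # "Mn"), calling this with offset 0 consumes both characters and
        # returns 2, treating the letter and its combining mark as one letter.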

    def next(self):
        text = self._text
        offset = self.offset
        while offset < len(text):
            # Find start of next word (must be alpha)
            while offset < len(text):
                incr = self._consume_alpha(text,offset)
                if incr:
                    break
                offset += 1
            curPos = offset
            # Find end of word, allowing valid_chars
            while offset < len(text):
                incr = self._consume_alpha(text,offset)
                if not incr:
                    if text[offset] in self._valid_chars:
                        incr = 1
                    else:
                        break
                offset += incr
            # Return the word if it isn't empty
            if curPos != offset:
                # Make sure word doesn't end with a valid_char
                while text[offset-1] in self._valid_chars:
                    offset = offset - 1
                self.offset = offset
                return (text[curPos:offset],curPos)
        self.offset = offset
        raise StopIteration()
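
A minimal usage sketch (an editor's addition, not part of the file above). It
assumes the iteration protocol supplied by the enchant.tokenize base class,
and the sample string is illustrative:

from enchant.tokenize.en import tokenize

for word, pos in tokenize("this is sam's test text"):
    print((word, pos))

# Expected output:
#   ('this', 0)
#   ('is', 5)
#   ("sam's", 8)
#   ('test', 14)
#   ('text', 19)

Note that "sam's" survives as a single word because the apostrophe is in the
default valid_chars and does not fall at the start or end of the word.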