/usr/share/pyshared/enchant/tokenize/en.py is in python-enchant 1.6.5-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.tokenize.en: Tokenizer for the English language
This module implements a PyEnchant text tokenizer for the English
language, based on very simple rules.
"""
import unicodedata
import enchant.tokenize
from enchant.utils import unicode
class tokenize(enchant.tokenize.tokenize):
    """Iterator splitting text into words, reporting position.

    This iterator takes a text string as input, and yields tuples
    representing each distinct word found in the text.  The tuples
    take the form:

        (<word>,<pos>)

    Where <word> is the word string found and <pos> is the position
    of the start of the word within the text.

    The optional argument <valid_chars> may be used to specify a
    list of additional characters that can form part of a word.
    By default, this list contains only the apostrophe (').  Note that
    these characters cannot appear at the start or end of a word.
    """

    _DOC_ERRORS = ["pos","pos"]

    def __init__(self,text,valid_chars=("'",)):
        self._valid_chars = valid_chars
        self._text = text
        self.offset = 0
        # Select the proper implementation of self._consume_alpha.
        # 'text' isn't necessarily a string (it could be e.g. a mutable array)
        # so we can't use isinstance(text,unicode) to detect unicode.
        # Instead we typetest the first character of the text.
        # If there are no characters then it doesn't matter which
        # implementation we use, since it won't be called anyway.
        try:
            char1 = text[0]
        except IndexError:
            self._consume_alpha = self._consume_alpha_b
        else:
            if isinstance(char1,unicode):
                self._consume_alpha = self._consume_alpha_u
            else:
                self._consume_alpha = self._consume_alpha_b

    def _consume_alpha_b(self,text,offset):
        """Consume an alphabetic character from the given bytestring.

        Given a bytestring and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Non-ASCII bytes are interpreted as utf-8 and can
        result in multiple characters being consumed.
        """
        assert offset < len(text)
        if text[offset].isalpha():
            return 1
        elif text[offset] >= "\x80":
            return self._consume_alpha_utf8(text,offset)
        return 0
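
    # For instance (an illustrative sketch, assuming a Python 2 byte
    # string): in "caf\xc3\xa9", offset 0 ("c") yields 1, while offset 3
    # ("\xc3") compares >= "\x80", so the utf-8 helper below decodes the
    # two bytes "\xc3\xa9" as a single letter and 2 is returned.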

    def _consume_alpha_utf8(self,text,offset):
        """Consume a sequence of utf8 bytes forming an alphabetic character."""
        incr = 2
        u = ""
        while not u and incr <= 4:
            try:
                try:
                    # In the common case this will be a string
                    u = text[offset:offset+incr].decode("utf8")
                except AttributeError:
                    # Looks like it was e.g. a mutable char array.
                    try:
                        s = text[offset:offset+incr].tostring()
                    except AttributeError:
                        s = "".join([c for c in text[offset:offset+incr]])
                    u = s.decode("utf8")
            except UnicodeDecodeError:
                incr += 1
        if not u:
            return 0
        if u.isalpha():
            return incr
        if unicodedata.category(u)[0] == "M":
            return incr
        return 0
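
    # Illustrative sketch of the widening loop above: for a three-byte
    # sequence such as "\xe4\xb8\xad" (a CJK ideograph), decoding the
    # first two bytes raises UnicodeDecodeError, so incr widens to 3 and
    # the full alphabetic character is consumed.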

    def _consume_alpha_u(self,text,offset):
        """Consume an alphabetic character from the given unicode string.

        Given a unicode string and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Trailing combining characters are consumed as a
        single letter.
        """
        assert offset < len(text)
        incr = 0
        if text[offset].isalpha():
            incr = 1
            while offset + incr < len(text):
                if unicodedata.category(text[offset+incr])[0] != "M":
                    break
                incr += 1
        return incr
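
    # Illustrative sketch: in u"cafe\u0301" (a decomposed acute accent),
    # calling this with offset 3 ("e") returns 2, consuming the base
    # letter together with its trailing combining mark as one letter.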

    def next(self):
        text = self._text
        offset = self.offset
        while offset < len(text):
            # Find start of next word (must be alpha)
            while offset < len(text):
                incr = self._consume_alpha(text,offset)
                if incr:
                    break
                offset += 1
            curPos = offset
            # Find end of word, allowing valid_chars
            while offset < len(text):
                incr = self._consume_alpha(text,offset)
                if not incr:
                    if text[offset] in self._valid_chars:
                        incr = 1
                    else:
                        break
                offset += incr
            # Return the word if it isn't empty
            if curPos != offset:
                # Make sure word doesn't end with a valid_char
                while text[offset-1] in self._valid_chars:
                    offset = offset - 1
                self.offset = offset
                return (text[curPos:offset],curPos)
        self.offset = offset
        raise StopIteration()
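
# A quick sketch of the valid_chars hook (illustrative, not shipped in
# the package): passing extra characters lets hyphenated words through.
#
#   >>> list(tokenize("re-do it", valid_chars=("'","-")))
#   [('re-do', 0), ('it', 6)]
#
# With the default valid_chars the hyphen splits the word instead:
#
#   >>> list(tokenize("re-do it"))
#   [('re', 0), ('do', 3), ('it', 6)]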