/usr/share/pyshared/enchant/tokenize/en.py

# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two.  You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers.  If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so.  If you do not wish to
# do so, delete this exception statement from your version.
#
"""

    enchant.tokenize.en:    Tokenizer for the English language
    
    This module implements a PyEnchant text tokenizer for the English
    language, based on very simple rules.

"""

import unicodedata

import enchant.tokenize
from enchant.utils import unicode


class tokenize(enchant.tokenize.tokenize):
    """Iterator splitting text into words, reporting position.
    
    This iterator takes a text string as input, and yields tuples
    representing each distinct word found in the text.  The tuples
    take the form:
        
        (<word>,<pos>)
        
    Where <word> is the word string found and <pos> is the position
    of the start of the word within the text.
    
    The optional argument <valid_chars> may be used to specify a
    list of additional characters that can form part of a word.
    By default, this list contains only the apostrophe ('). Note that
    these characters cannot appear at the start or end of a word.
    """

    _DOC_ERRORS = ["pos","pos"]
    
    def __init__(self,text,valid_chars=("'",)):
        self._valid_chars = valid_chars
        self._text = text
        self.offset = 0
        # Select proper implementation of self._consume_alpha.
        # 'text' isn't necessarily a string (it could be e.g. a mutable array)
        # so we can't use isinstance(text,unicode) to detect unicode.
        # Instead we typetest the first character of the text.
        # If there's no characters then it doesn't matter what implementation
        # we use since it won't be called anyway. 
        try:
            char1 = text[0]
        except IndexError:
            self._consume_alpha = self._consume_alpha_b
        else:
            if isinstance(char1,unicode):
                self._consume_alpha = self._consume_alpha_u
            else:
                self._consume_alpha = self._consume_alpha_b
    
    def _consume_alpha_b(self,text,offset):
        """Consume an alphabetic character from the given bytestring.

        Given a bytestring and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Non-ASCII bytes are interpreted as utf-8 and can
        result in multiple characters being consumed.
        """
        assert offset < len(text)
        if text[offset].isalpha():
            return 1
        elif text[offset] >= "\x80":
            return self._consume_alpha_utf8(text,offset)
        return 0

    def _consume_alpha_utf8(self,text,offset):
        """Consume a sequence of utf8 bytes forming an alphabetic character."""
        incr = 2
        u = ""
        while not u and incr <= 4:
            try:
                try:
                    #  In the common case this will be a string
                    u = text[offset:offset+incr].decode("utf8")
                except AttributeError:
                    #  Looks like it was e.g. a mutable char array.
                    try:
                        s = text[offset:offset+incr].tostring()
                    except AttributeError:
                        s = "".join([c for c in text[offset:offset+incr]])
                    u = s.decode("utf8")
            except UnicodeDecodeError:
                incr += 1
        if not u:
            return 0
        if u.isalpha():
            return incr
        if unicodedata.category(u)[0] == "M":
            return incr
        return 0

    def _consume_alpha_u(self,text,offset):
        """Consume an alphabetic character from the given unicode string.

        Given a unicode string and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Trailing combining characters are consumed as a
        single letter.
        """
        assert offset < len(text)
        incr = 0
        if text[offset].isalpha():
            incr = 1
            while offset + incr < len(text):
                if unicodedata.category(text[offset+incr])[0] != "M":
                    break
                incr += 1
        return incr

    def next(self):
        text = self._text
        offset = self.offset
        while offset < len(text):
            # Find start of next word (must be alpha)
            while offset < len(text):
                incr = self._consume_alpha(text,offset)
                if incr:
                    break
                offset += 1
            curPos = offset
            # Find end of word using, allowing valid_chars
            while offset < len(text):
                incr = self._consume_alpha(text,offset)
                if not incr:
                    if text[offset] in self._valid_chars:
                        incr = 1
                    else:
                        break
                offset += incr
            # Return if word isnt empty
            if(curPos != offset):
                # Make sure word doesn't end with a valid_char
                while text[offset-1] in self._valid_chars:
                    offset = offset - 1
                self.offset = offset
                return (text[curPos:offset],curPos)
        self.offset = offset
        raise StopIteration()
python-enchant 1.6.5-2build1 / usr / share / pyshared / enchant / tokenize / en.py