/usr/lib/python2.7/dist-packages/whoosh/reading.py

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""This module contains classes that allow reading from an index.
"""

from math import log
from bisect import bisect_right
from heapq import heapify, heapreplace, heappop, nlargest

from whoosh import columns
from whoosh.compat import abstractmethod
from whoosh.compat import xrange, zip_, next, iteritems
from whoosh.filedb.filestore import OverlayStorage
from whoosh.matching import MultiMatcher
from whoosh.support.levenshtein import distance
from whoosh.system import emptybytes


# Exceptions

class ReaderClosed(Exception):
    """Exception raised when you try to do some operation on a closed searcher
    (or a Results object derived from a searcher that has since been closed).
    """

    message = "Operation on a closed reader"


class TermNotFound(Exception):
    pass


# Term Info base class

class TermInfo(object):
    """Represents a set of statistics about a term. This object is returned by
    :meth:`IndexReader.term_info`. These statistics may be useful for
    optimizations and scoring algorithms.
    """

    def __init__(self, weight=0, df=0, minlength=None,
                 maxlength=0, maxweight=0, minid=None, maxid=0):
        self._weight = weight
        self._df = df
        self._minlength = minlength
        self._maxlength = maxlength
        self._maxweight = maxweight
        self._minid = minid
        self._maxid = maxid

    def add_posting(self, docnum, weight, length=None):
        if self._minid is None:
            self._minid = docnum
        self._maxid = docnum
        self._weight += weight
        self._df += 1
        self._maxweight = max(self._maxweight, weight)

        if length is not None:
            if self._minlength is None:
                self._minlength = length
            else:
                self._minlength = min(self._minlength, length)
            self._maxlength = max(self._maxlength, length)

    def weight(self):
        """Returns the total frequency of the term across all documents.
        """

        return self._weight

    def doc_frequency(self):
        """Returns the number of documents the term appears in.
        """

        return self._df

    def min_length(self):
        """Returns the length of the shortest field value the term appears
        in.
        """

        return self._minlength

    def max_length(self):
        """Returns the length of the longest field value the term appears
        in.
        """

        return self._maxlength

    def max_weight(self):
        """Returns the number of times the term appears in the document in
        which it appears the most.
        """

        return self._maxweight

    def min_id(self):
        """Returns the lowest document ID this term appears in.
        """

        return self._minid

    def max_id(self):
        """Returns the highest document ID this term appears in.
        """

        return self._maxid


# Reader base class

class IndexReader(object):
    """Do not instantiate this object directly. Instead use Index.reader().
    """

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @abstractmethod
    def __contains__(self, term):
        """Returns True if the given term tuple (fieldname, text) is
        in this reader.
        """
        raise NotImplementedError

    def codec(self):
        """Returns the :class:`whoosh.codec.base.Codec` object used to read
        this reader's segment. If this reader is not atomic
        (``reader.is_atomic() == True``), returns None.
        """

        return None

    def segment(self):
        """Returns the :class:`whoosh.index.Segment` object used by this reader.
        If this reader is not atomic (``reader.is_atomic() == True``), returns
        None.
        """

        return None

    def storage(self):
        """Returns the :class:`whoosh.filedb.filestore.Storage` object used by
        this reader to read its files. If the reader is not atomic,
        (``reader.is_atomic() == True``), returns None.
        """

        return None

    def is_atomic(self):
        return True

    def _text_to_bytes(self, fieldname, text):
        if fieldname not in self.schema:
            raise TermNotFound((fieldname, text))
        return self.schema[fieldname].to_bytes(text)

    def close(self):
        """Closes the open files associated with this reader.
        """

        pass

    def generation(self):
        """Returns the generation of the index being read, or -1 if the backend
        is not versioned.
        """

        return None

    @abstractmethod
    def indexed_field_names(self):
        """Returns an iterable of strings representing the names of the indexed
        fields. This may include additional names not explicitly listed in the
        Schema if you use "glob" fields.
        """

        raise NotImplementedError

    @abstractmethod
    def all_terms(self):
        """Yields (fieldname, text) tuples for every term in the index.
        """

        raise NotImplementedError

    def terms_from(self, fieldname, prefix):
        """Yields (fieldname, text) tuples for every term in the index starting
        at the given prefix.
        """

        # The default implementation just scans the whole list of terms
        for fname, text in self.all_terms():
            if fname < fieldname or text < prefix:
                continue
            yield (fname, text)

    @abstractmethod
    def term_info(self, fieldname, text):
        """Returns a :class:`TermInfo` object allowing access to various
        statistics about the given term.
        """

        raise NotImplementedError

    def expand_prefix(self, fieldname, prefix):
        """Yields terms in the given field that start with the given prefix.
        """

        for fn, text in self.terms_from(fieldname, prefix):
            if fn != fieldname or not text.startswith(prefix):
                return
            yield text

    def lexicon(self, fieldname):
        """Yields all bytestrings in the given field.
        """

        for fn, btext in self.terms_from(fieldname, emptybytes):
            if fn != fieldname:
                return
            yield btext

    def field_terms(self, fieldname):
        """Yields all term values (converted from on-disk bytes) in the given
        field.
        """

        from_bytes = self.schema[fieldname].from_bytes
        for btext in self.lexicon(fieldname):
            yield from_bytes(btext)

    def __iter__(self):
        """Yields ((fieldname, text), terminfo) tuples for each term in the
        reader, in lexical order.
        """

        term_info = self.term_info
        for term in self.all_terms():
            yield (term, term_info(*term))

    def iter_from(self, fieldname, text):
        """Yields ((fieldname, text), terminfo) tuples for all terms in the
        reader, starting at the given term.
        """

        term_info = self.term_info
        text = self._text_to_bytes(fieldname, text)
        for term in self.terms_from(fieldname, text):
            yield (term, term_info(*term))

    def iter_field(self, fieldname, prefix=''):
        """Yields (text, terminfo) tuples for all terms in the given field.
        """

        prefix = self._text_to_bytes(fieldname, prefix)
        for (fn, text), terminfo in self.iter_from(fieldname, prefix):
            if fn != fieldname:
                return
            yield text, terminfo

    def iter_prefix(self, fieldname, prefix):
        """Yields (text, terminfo) tuples for all terms in the given field with
        a certain prefix.
        """

        prefix = self._text_to_bytes(fieldname, prefix)
        for text, terminfo in self.iter_field(fieldname, prefix):
            if not text.startswith(prefix):
                return
            yield (text, terminfo)

    @abstractmethod
    def has_deletions(self):
        """Returns True if the underlying index/segment has deleted
        documents.
        """

        raise NotImplementedError

    def all_doc_ids(self):
        """Returns an iterator of all (undeleted) document IDs in the reader.
        """

        is_deleted = self.is_deleted
        return (docnum for docnum in xrange(self.doc_count_all())
                if not is_deleted(docnum))

    def iter_docs(self):
        """Yields a series of ``(docnum, stored_fields_dict)``
        tuples for the undeleted documents in the reader.
        """

        for docnum in self.all_doc_ids():
            yield docnum, self.stored_fields(docnum)

    @abstractmethod
    def is_deleted(self, docnum):
        """Returns True if the given document number is marked deleted.
        """

        raise NotImplementedError

    @abstractmethod
    def stored_fields(self, docnum):
        """Returns the stored fields for the given document number.

        :param numerickeys: use field numbers as the dictionary keys instead of
            field names.
        """

        raise NotImplementedError

    def all_stored_fields(self):
        """Yields the stored fields for all non-deleted documents.
        """

        is_deleted = self.is_deleted
        for docnum in xrange(self.doc_count_all()):
            if not is_deleted(docnum):
                yield self.stored_fields(docnum)

    @abstractmethod
    def doc_count_all(self):
        """Returns the total number of documents, DELETED OR UNDELETED,
        in this reader.
        """

        raise NotImplementedError

    @abstractmethod
    def doc_count(self):
        """Returns the total number of UNDELETED documents in this reader.
        """

        return self.doc_count_all() - self.deleted_count()

    @abstractmethod
    def frequency(self, fieldname, text):
        """Returns the total number of instances of the given term in the
        collection.
        """
        raise NotImplementedError

    @abstractmethod
    def doc_frequency(self, fieldname, text):
        """Returns how many documents the given term appears in.
        """
        raise NotImplementedError

    @abstractmethod
    def field_length(self, fieldname):
        """Returns the total number of terms in the given field. This is used
        by some scoring algorithms.
        """
        raise NotImplementedError

    @abstractmethod
    def min_field_length(self, fieldname):
        """Returns the minimum length of the field across all documents. This
        is used by some scoring algorithms.
        """
        raise NotImplementedError

    @abstractmethod
    def max_field_length(self, fieldname):
        """Returns the minimum length of the field across all documents. This
        is used by some scoring algorithms.
        """
        raise NotImplementedError

    @abstractmethod
    def doc_field_length(self, docnum, fieldname, default=0):
        """Returns the number of terms in the given field in the given
        document. This is used by some scoring algorithms.
        """
        raise NotImplementedError

    def first_id(self, fieldname, text):
        """Returns the first ID in the posting list for the given term. This
        may be optimized in certain backends.
        """

        text = self._text_to_bytes(fieldname, text)
        p = self.postings(fieldname, text)
        if p.is_active():
            return p.id()
        raise TermNotFound((fieldname, text))

    def iter_postings(self):
        """Low-level method, yields all postings in the reader as
        ``(fieldname, text, docnum, weight, valuestring)`` tuples.
        """

        for fieldname, btext in self.all_terms():
            m = self.postings(fieldname, btext)
            while m.is_active():
                yield (fieldname, btext, m.id(), m.weight(), m.value())
                m.next()

    @abstractmethod
    def postings(self, fieldname, text):
        """Returns a :class:`~whoosh.matching.Matcher` for the postings of the
        given term.

        >>> pr = reader.postings("content", "render")
        >>> pr.skip_to(10)
        >>> pr.id
        12

        :param fieldname: the field name or field number of the term.
        :param text: the text of the term.
        :rtype: :class:`whoosh.matching.Matcher`
        """

        raise NotImplementedError

    @abstractmethod
    def has_vector(self, docnum, fieldname):
        """Returns True if the given document has a term vector for the given
        field.
        """
        raise NotImplementedError

    @abstractmethod
    def vector(self, docnum, fieldname, format_=None):
        """Returns a :class:`~whoosh.matching.Matcher` object for the
        given term vector.

        >>> docnum = searcher.document_number(path=u'/a/b/c')
        >>> v = searcher.vector(docnum, "content")
        >>> v.all_as("frequency")
        [(u"apple", 3), (u"bear", 2), (u"cab", 2)]

        :param docnum: the document number of the document for which you want
            the term vector.
        :param fieldname: the field name or field number of the field for which
            you want the term vector.
        :rtype: :class:`whoosh.matching.Matcher`
        """
        raise NotImplementedError

    def vector_as(self, astype, docnum, fieldname):
        """Returns an iterator of (termtext, value) pairs for the terms in the
        given term vector. This is a convenient shortcut to calling vector()
        and using the Matcher object when all you want are the terms and/or
        values.

        >>> docnum = searcher.document_number(path=u'/a/b/c')
        >>> searcher.vector_as("frequency", docnum, "content")
        [(u"apple", 3), (u"bear", 2), (u"cab", 2)]

        :param docnum: the document number of the document for which you want
            the term vector.
        :param fieldname: the field name or field number of the field for which
            you want the term vector.
        :param astype: a string containing the name of the format you want the
            term vector's data in, for example "weights".
        """

        vec = self.vector(docnum, fieldname)
        if astype == "weight":
            while vec.is_active():
                yield (vec.id(), vec.weight())
                vec.next()
        else:
            format_ = self.schema[fieldname].format
            decoder = format_.decoder(astype)
            while vec.is_active():
                yield (vec.id(), decoder(vec.value()))
                vec.next()

    def corrector(self, fieldname):
        """Returns a :class:`whoosh.spelling.Corrector` object that suggests
        corrections based on the terms in the given field.
        """

        from whoosh.spelling import ReaderCorrector

        fieldobj = self.schema[fieldname]
        return ReaderCorrector(self, fieldname, fieldobj)

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        """
        Returns a generator of words in the given field within ``maxdist``
        Damerau-Levenshtein edit distance of the given text.

        Important: the terms are returned in **no particular order**. The only
        criterion is that they are within ``maxdist`` edits of ``text``. You
        may want to run this method multiple times with increasing ``maxdist``
        values to ensure you get the closest matches first. You may also have
        additional information (such as term frequency or an acoustic matching
        algorithm) you can use to rank terms with the same edit distance.

        :param maxdist: the maximum edit distance.
        :param prefix: require suggestions to share a prefix of this length
            with the given word. This is often justifiable since most
            misspellings do not involve the first letter of the word.
            Using a prefix dramatically decreases the time it takes to generate
            the list of words.
        :param seen: an optional set object. Words that appear in the set will
            not be yielded.
        """

        fieldobj = self.schema[fieldname]
        for btext in self.expand_prefix(fieldname, text[:prefix]):
            word = fieldobj.from_bytes(btext)
            k = distance(word, text, limit=maxdist)
            if k <= maxdist:
                yield word

    def most_frequent_terms(self, fieldname, number=5, prefix=''):
        """Returns the top 'number' most frequent terms in the given field as a
        list of (frequency, text) tuples.
        """

        gen = ((terminfo.weight(), text) for text, terminfo
               in self.iter_prefix(fieldname, prefix))
        return nlargest(number, gen)

    def most_distinctive_terms(self, fieldname, number=5, prefix=''):
        """Returns the top 'number' terms with the highest `tf*idf` scores as
        a list of (score, text) tuples.
        """

        N = float(self.doc_count())
        gen = ((terminfo.weight() * log(N / terminfo.doc_frequency()), text)
               for text, terminfo in self.iter_prefix(fieldname, prefix))
        return nlargest(number, gen)

    def leaf_readers(self):
        """Returns a list of (IndexReader, docbase) pairs for the child readers
        of this reader if it is a composite reader. If this is not a composite
        reader, it returns `[(self, 0)]`.
        """

        return [(self, 0)]

    def supports_caches(self):
        return False

    def has_column(self, fieldname):
        return False

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=False):
        """

        :param fieldname: the name of the field for which to get a reader.
        :param column: if passed, use this Column object instead of the one
            associated with the field in the Schema.
        :param reverse: if passed, reverses the order of keys returned by the
            reader's ``sort_key()`` method. If the column type is not
            reversible, this will raise a ``NotImplementedError``.
        :param translate: if True, wrap the reader to call the field's
            ``from_bytes()`` method on the returned values.
        :return: a :class:`whoosh.columns.ColumnReader` object.
        """

        raise NotImplementedError


# Segment-based reader

class SegmentReader(IndexReader):
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False

        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self.files is a storage object from which to load the segment files.
        # This is different from the general storage (which will be used for
        # caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage, in rare
            # circumstances a segment file may be added after the segment is
            # written
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)

    def codec(self):
        return self._codec

    def segment(self):
        return self._segment

    def storage(self):
        return self._storage

    def has_deletions(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_deletions()

    def doc_count(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count()

    def doc_count_all(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count_all()

    def is_deleted(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.is_deleted(docnum)

    def generation(self):
        return self._gen

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self._storage,
                               self._segment)

    def __contains__(self, term):
        if self.is_closed:
            raise ReaderClosed
        fieldname, text = term
        if fieldname not in self.schema:
            return False
        text = self._text_to_bytes(fieldname, text)
        return (fieldname, text) in self._terms

    def close(self):
        if self.is_closed:
            raise ReaderClosed("Reader already closed")
        self._terms.close()
        self._perdoc.close()

        # It's possible some weird codec that doesn't use storage might have
        # passed None instead of a storage object
        if self._storage:
            self._storage.close()

        self.is_closed = True

    def stored_fields(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        assert docnum >= 0
        schema = self.schema
        sfs = self._perdoc.stored_fields(docnum)
        # Double-check with schema to filter out removed fields
        return dict(item for item in iteritems(sfs) if item[0] in schema)

    # Delegate doc methods to the per-doc reader

    def all_doc_ids(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_doc_ids()

    def iter_docs(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.iter_docs()

    def all_stored_fields(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_stored_fields()

    def field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.field_length(fieldname)

    def min_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_field_length(docnum, fieldname, default)

    def has_vector(self, docnum, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_vector(docnum, fieldname)

    #

    def _test_field(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)

    def indexed_field_names(self):
        return self._terms.indexed_field_names()

    def all_terms(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self._terms.terms()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        schema = self.schema
        return ((fname, text) for fname, text
                in self._terms.terms_from(fieldname, prefix)
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.term_info(fieldname, text)
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((term, terminfo) for term, terminfo in self._terms.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        self._test_field(fieldname)
        schema = self.schema
        text = self._text_to_bytes(fieldname, text)
        for term, terminfo in self._terms.items_from(fieldname, text):
            if term[0] not in schema:
                continue
            yield (term, terminfo)

    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.frequency(fieldname, text)
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.doc_frequency(fieldname, text)
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        from whoosh.matching.wrappers import FilterMatcher

        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        text = self._text_to_bytes(fieldname, text)
        format_ = self.schema[fieldname].format
        matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer)
        deleted = frozenset(self._perdoc.deleted_docs())
        if deleted:
            matcher = FilterMatcher(matcher, deleted, exclude=True)
        return matcher

    def vector(self, docnum, fieldname, format_=None):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No  field %r" % fieldname)
        vformat = format_ or self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)
        return self._perdoc.vector(docnum, fieldname, vformat)

    def cursor(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        fieldobj = self.schema[fieldname]
        return self._terms.cursor(fieldname, fieldobj)

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        # Replaces the horribly inefficient base implementation with one based
        # on skipping through the word list efficiently using a DFA

        fieldobj = self.schema[fieldname]
        spellfield = fieldobj.spelling_fieldname(fieldname)
        auto = self._codec.automata(self._storage, self._segment)
        fieldcur = self.cursor(spellfield)
        return auto.terms_within(fieldcur, text, maxdist, prefix)

    # Column methods

    def has_column(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        coltype = self.schema[fieldname].column_type
        return coltype and self._perdoc.has_column(fieldname)

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=True):
        if self.is_closed:
            raise ReaderClosed

        fieldobj = self.schema[fieldname]
        column = column or fieldobj.column_type
        if not column:
            raise Exception("No column for field %r in %r"
                            % (fieldname, self))

        if self._perdoc.has_column(fieldname):
            creader = self._perdoc.column_reader(fieldname, column)
            if reverse:
                creader.set_reverse()
        else:
            # This segment doesn't have a column file for this field, so create
            # a fake column reader that always returns the default value.
            default = column.default_value(reverse)
            creader = columns.EmptyColumnReader(default, self.doc_count_all())

        if translate:
            # Wrap the column in a Translator to give the caller
            # nice values instead of sortable representations
            fcv = fieldobj.from_column_value
            creader = columns.TranslatingColumnReader(creader, fcv)

        return creader


# Fake IndexReader class for empty indexes

class EmptyReader(IndexReader):
    def __init__(self, schema):
        self.schema = schema

    def __contains__(self, term):
        return False

    def __iter__(self):
        return iter([])

    def cursor(self, fieldname):
        from whoosh.codec.base import EmptyCursor

        return EmptyCursor()

    def indexed_field_names(self):
        return []

    def all_terms(self):
        return iter([])

    def term_info(self, fieldname, text):
        raise TermNotFound((fieldname, text))

    def iter_from(self, fieldname, text):
        return iter([])

    def iter_field(self, fieldname, prefix=''):
        return iter([])

    def iter_prefix(self, fieldname, prefix=''):
        return iter([])

    def lexicon(self, fieldname):
        return iter([])

    def has_deletions(self):
        return False

    def is_deleted(self, docnum):
        return False

    def stored_fields(self, docnum):
        raise KeyError("No document number %s" % docnum)

    def all_stored_fields(self):
        return iter([])

    def doc_count_all(self):
        return 0

    def doc_count(self):
        return 0

    def frequency(self, fieldname, text):
        return 0

    def doc_frequency(self, fieldname, text):
        return 0

    def field_length(self, fieldname):
        return 0

    def min_field_length(self, fieldname):
        return 0

    def max_field_length(self, fieldname):
        return 0

    def doc_field_length(self, docnum, fieldname, default=0):
        return default

    def postings(self, fieldname, text, scorer=None):
        raise TermNotFound("%s:%r" % (fieldname, text))

    def has_vector(self, docnum, fieldname):
        return False

    def vector(self, docnum, fieldname, format_=None):
        raise KeyError("No document number %s" % docnum)

    def most_frequent_terms(self, fieldname, number=5, prefix=''):
        return iter([])

    def most_distinctive_terms(self, fieldname, number=5, prefix=None):
        return iter([])


# Multisegment reader class

class MultiReader(IndexReader):
    """Do not instantiate this object directly. Instead use Index.reader().
    """

    def __init__(self, readers, generation=None):
        self.readers = readers
        self._gen = generation
        self.schema = None
        if readers:
            self.schema = readers[0].schema

        self.doc_offsets = []
        self.base = 0
        for r in self.readers:
            self.doc_offsets.append(self.base)
            self.base += r.doc_count_all()

        self.is_closed = False

    def _document_segment(self, docnum):
        return max(0, bisect_right(self.doc_offsets, docnum) - 1)

    def _segment_and_docnum(self, docnum):
        segmentnum = self._document_segment(docnum)
        offset = self.doc_offsets[segmentnum]
        return segmentnum, docnum - offset

    def cursor(self, fieldname):
        return MultiCursor([r.cursor(fieldname) for r in self.readers])

    def is_atomic(self):
        return False

    def leaf_readers(self):
        return zip_(self.readers, self.doc_offsets)

    def add_reader(self, reader):
        self.readers.append(reader)
        self.doc_offsets.append(self.base)
        self.base += reader.doc_count_all()

    def close(self):
        for d in self.readers:
            d.close()
        self.is_closed = True

    def generation(self):
        return self._gen

    def format(self, fieldname):
        for r in self.readers:
            fmt = r.format(fieldname)
            if fmt is not None:
                return fmt

    def vector_format(self, fieldname):
        for r in self.readers:
            vfmt = r.vector_format(fieldname)
            if vfmt is not None:
                return vfmt

    # Term methods

    def __contains__(self, term):
        return any(r.__contains__(term) for r in self.readers)

    def _merge_terms(self, iterlist):
        # Merge-sorts terms coming from a list of term iterators.

        # Create a map so we can look up each iterator by its id() value
        itermap = {}
        for it in iterlist:
            itermap[id(it)] = it

        # Fill in the list with the head term from each iterator.

        current = []
        for it in iterlist:
            try:
                term = next(it)
            except StopIteration:
                continue
            current.append((term, id(it)))
        # Number of active iterators
        active = len(current)

        # If only one iterator is active, just yield from it and return
        if active == 1:
            term, itid = current[0]
            it = itermap[itid]
            yield term
            for term in it:
                yield term
            return

        # Otherwise, do a streaming heap sort of the terms from the iterators
        heapify(current)
        while active:
            # Peek at the first term in the sorted list
            term = current[0][0]

            # Re-iterate on all items in the list that have that term
            while active and current[0][0] == term:
                it = itermap[current[0][1]]
                try:
                    nextterm = next(it)
                    heapreplace(current, (nextterm, id(it)))
                except StopIteration:
                    heappop(current)
                    active -= 1

            # Yield the term
            yield term

    def indexed_field_names(self):
        names = set()
        for r in self.readers:
            names.update(r.indexed_field_names())
        return iter(names)

    def all_terms(self):
        return self._merge_terms([r.all_terms() for r in self.readers])

    def terms_from(self, fieldname, prefix):
        return self._merge_terms([r.terms_from(fieldname, prefix)
                                  for r in self.readers])

    def term_info(self, fieldname, text):
        term = (fieldname, text)

        # Get the term infos for the sub-readers containing the term
        tis = [(r.term_info(fieldname, text), offset) for r, offset
               in zip_(self.readers, self.doc_offsets) if term in r]

        # If only one reader had the term, return its terminfo with the offset
        # added
        if not tis:
            raise TermNotFound(term)

        return combine_terminfos(tis)

    def frequency(self, fieldname, text):
        return sum(r.frequency(fieldname, text) for r in self.readers)

    def doc_frequency(self, fieldname, text):
        return sum(r.doc_frequency(fieldname, text) for r in self.readers)

    def postings(self, fieldname, text):
        # This method does not add a scorer; for that, use Searcher.postings()

        postreaders = []
        docoffsets = []
        term = (fieldname, text)

        for i, r in enumerate(self.readers):
            if term in r:
                offset = self.doc_offsets[i]
                pr = r.postings(fieldname, text)
                postreaders.append(pr)
                docoffsets.append(offset)

        if not postreaders:
            raise TermNotFound(fieldname, text)

        return MultiMatcher(postreaders, docoffsets)

    def first_id(self, fieldname, text):
        for i, r in enumerate(self.readers):
            try:
                id = r.first_id(fieldname, text)
            except (KeyError, TermNotFound):
                pass
            else:
                if id is None:
                    raise TermNotFound((fieldname, text))
                else:
                    return self.doc_offsets[i] + id

        raise TermNotFound((fieldname, text))

    # Deletion methods

    def has_deletions(self):
        return any(r.has_deletions() for r in self.readers)

    def is_deleted(self, docnum):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].is_deleted(segmentdoc)

    def stored_fields(self, docnum):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].stored_fields(segmentdoc)

    # Columns

    def has_column(self, fieldname):
        return any(r.has_column(fieldname) for r in self.readers)

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=True):
        crs = []
        doc_offsets = []
        for i, r in enumerate(self.readers):
            if r.has_column(fieldname):
                cr = r.column_reader(fieldname, column=column, reverse=reverse,
                                     translate=translate)
                crs.append(cr)
                doc_offsets.append(self.doc_offsets[i])
        return columns.MultiColumnReader(crs, doc_offsets)

    # Per doc methods

    def all_stored_fields(self):
        for reader in self.readers:
            for result in reader.all_stored_fields():
                yield result

    def doc_count_all(self):
        return sum(dr.doc_count_all() for dr in self.readers)

    def doc_count(self):
        return sum(dr.doc_count() for dr in self.readers)

    def field_length(self, fieldname):
        return sum(dr.field_length(fieldname) for dr in self.readers)

    def min_field_length(self, fieldname):
        return min(r.min_field_length(fieldname) for r in self.readers)

    def max_field_length(self, fieldname):
        return max(r.max_field_length(fieldname) for r in self.readers)

    def doc_field_length(self, docnum, fieldname, default=0):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        reader = self.readers[segmentnum]
        return reader.doc_field_length(segmentdoc, fieldname, default=default)

    def has_vector(self, docnum, fieldname):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].has_vector(segmentdoc, fieldname)

    def vector(self, docnum, fieldname, format_=None):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].vector(segmentdoc, fieldname)

    def vector_as(self, astype, docnum, fieldname):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].vector_as(astype, segmentdoc,
                                                  fieldname)


def combine_terminfos(tis):
    if len(tis) == 1:
        ti, offset = tis[0]
        ti._minid += offset
        ti._maxid += offset
        return ti

    # Combine the various statistics
    w = sum(ti.weight() for ti, _ in tis)
    df = sum(ti.doc_frequency() for ti, _ in tis)
    ml = min(ti.min_length() for ti, _ in tis)
    xl = max(ti.max_length() for ti, _ in tis)
    xw = max(ti.max_weight() for ti, _ in tis)

    # For min and max ID, we need to add the doc offsets
    mid = min(ti.min_id() + offset for ti, offset in tis)
    xid = max(ti.max_id() + offset for ti, offset in tis)

    return TermInfo(w, df, ml, xl, xw, mid, xid)


class MultiCursor(object):
    def __init__(self, cursors):
        self._cursors = [c for c in cursors if c.is_valid()]
        self._low = []
        self._text = None
        self.next()

    def _find_low(self):
        low = []
        lowterm = None

        for c in self._cursors:
            if c.is_valid():
                cterm = c.term()
                if low and cterm == lowterm:
                    low.append(c)
                elif low and cterm < lowterm:
                    low = [c]
                    lowterm = cterm

        self._low = low
        self._text = lowterm
        return lowterm

    def first(self):
        for c in self._cursors:
            c.first()
        return self._find_low()

    def find(self, term):
        for c in self._cursors:
            c.find(term)
        return self._find_low()

    def next(self):
        for c in self._cursors:
            c.next()
        return self._find_low()

    def term_info(self):
        tis = [c.term_info() for c in self._low]
        return combine_terminfos(tis) if tis else None

    def is_valid(self):
        return any(c.is_valid() for c in self._cursors)
python-whoosh 2.7.0-2 / usr / lib / python2.7 / dist-packages / whoosh / reading.py