/usr/share/pyshared/patsy/state.py is in python-patsy 0.2.1-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file COPYING for license information.

# Stateful transform protocol:
#   def __init__(self):
#       pass
#   def memorize_chunk(self, input_data):
#       return None
#   def memorize_finish(self):
#       return None
#   def transform(self, input_data):
#       return output_data
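
# For illustration only, here is a minimal hypothetical transform following
# this protocol (not part of patsy); it rescales by the overall maximum:
#   class Max(object):
#       def __init__(self):
#           self._max = None
#       def memorize_chunk(self, x):
#           chunk_max = np.max(x)
#           if self._max is None or chunk_max > self._max:
#               self._max = chunk_max
#       def memorize_finish(self):
#           pass
#       def transform(self, x):
#           return np.asarray(x) / self._max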

# BETTER WAY: always run the first row of data through the builder alone, and
# check that it gives the same output row as when running the whole block of
# data through at once. This gives us the same information, but it's robust
# against people writing their own centering functions.
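#
# A sketch of that check (hypothetical names; `transform` stands in for
# whatever callable the builder would run):
#   first_row_out = transform(data[:1])
#   block_out = transform(data)
#   assert np.allclose(first_row_out, block_out[:1])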

# QUESTION: right now we refuse to even fit a model that contains a
# my_transform(x)-style function. Maybe we should allow it to be fit (with a
# warning), and only disallow making predictions with it? Need to revisit this
# question once it's clearer what exactly our public API will look like,
# because right now I'm not sure how to tell whether we are being called for
# fitting versus being called for prediction.

import numpy as np
from patsy.util import (atleast_2d_column_default,
                        asarray_or_pandas, pandas_friendly_reshape,
                        wide_dtype_for)
from patsy.compat import wraps

# These are made available in the patsy.* namespace
__all__ = ["stateful_transform",
           "center", "standardize", "scale",
           ]

def stateful_transform(class_):
    """Create a stateful transform callable object from a class that fulfills
    the :ref:`stateful transform protocol <stateful-transform-protocol>`.
    """
    @wraps(class_)
    def stateful_transform_wrapper(*args, **kwargs):
        transform = class_()
        transform.memorize_chunk(*args, **kwargs)
        transform.memorize_finish()
        return transform.transform(*args, **kwargs)
    stateful_transform_wrapper.__patsy_stateful_transform__ = class_
    return stateful_transform_wrapper
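
# The `__patsy_stateful_transform__` attribute is what lets patsy's formula
# machinery recognize the wrapper and drive the memorize/transform steps
# itself (memorizing over the training data, then transforming new data at
# prediction time). Called directly, as above, the wrapper does both steps
# in one pass, e.g. with the `center` transform defined below:
#   center([1.0, 2.0, 3.0])   # -> array([-1., 0., 1.])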

# class NonIncrementalStatefulTransform(object):
#     def __init__(self):
#         self._data = []
#
#     def memorize_chunk(self, input_data, *args, **kwargs):
#         self._data.append(input_data)
#         self._args = args
#         self._kwargs = kwargs
#
#     def memorize_finish(self):
#         all_data = np.row_stack(self._data)
#         args = self._args
#         kwargs = self._kwargs
#         del self._data
#         del self._args
#         del self._kwargs
#         self.memorize_all(all_data, *args, **kwargs)
#
#     def memorize_all(self, input_data, *args, **kwargs):
#         raise NotImplementedError
#
#     def transform(self, input_data, *args, **kwargs):
#         raise NotImplementedError
#
# class QuantileEstimatingTransform(NonIncrementalStatefulTransform):
#     def memorize_all(self, input_data, *args, **kwargs):

class Center(object):
    """center(x)

    A stateful transform that centers input data, i.e., subtracts the mean.

    If input has multiple columns, centers each column separately.

    Equivalent to ``standardize(x, rescale=False)``.
    """
    def __init__(self):
        self._sum = None
        self._count = 0

    def memorize_chunk(self, x):
        x = atleast_2d_column_default(x)
        self._count += x.shape[0]
        this_total = np.sum(x, 0, dtype=wide_dtype_for(x))
        # This is to handle potentially multi-column x's:
        if self._sum is None:
            self._sum = this_total
        else:
            self._sum += this_total

    def memorize_finish(self):
        pass

    def transform(self, x):
        x = asarray_or_pandas(x)
        # This doesn't copy data unless our input is a DataFrame that has
        # heterogeneous types. And in that case we're going to be munging the
        # types anyway, so copying isn't a big deal.
        x_arr = np.asarray(x)
        if np.issubdtype(x_arr.dtype, np.integer):
            dt = float
        else:
            dt = x_arr.dtype
        mean_val = np.asarray(self._sum / self._count, dtype=dt)
        centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val
        return pandas_friendly_reshape(centered, x.shape)

center = stateful_transform(Center)
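
# Used incrementally (the case the protocol exists for), memorization can be
# split across chunks before transforming, e.g.:
#   c = Center()
#   c.memorize_chunk([1.0, 2.0])
#   c.memorize_chunk([3.0, 4.0])
#   c.memorize_finish()
#   c.transform([5.0])   # -> array([2.5])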

# See:
#   http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
# or page 232 of Knuth vol. 3 (3rd ed.).
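# In brief, for each new observation x_n the recurrences are:
#   delta  = x_n - mean_{n-1}
#   mean_n = mean_{n-1} + delta / n
#   M2_n   = M2_{n-1} + delta * (x_n - mean_n)
# so that after n observations, variance = M2_n / (n - ddof).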
class Standardize(object):
    """standardize(x, center=True, rescale=True, ddof=0)

    A stateful transform that standardizes input data, i.e., it subtracts the
    mean and divides by the sample standard deviation.

    Centering, rescaling, or both can be disabled via keyword arguments. The
    `ddof` argument controls the delta degrees of freedom when computing the
    standard deviation (cf. :func:`numpy.std`). The default of ``ddof=0``
    produces the maximum likelihood estimate; use ``ddof=1`` if you prefer
    the square root of the unbiased estimate of the variance.

    If input has multiple columns, standardizes each column separately.

    .. note:: This function computes the mean and standard deviation using a
       memory-efficient online algorithm, making it suitable for use with
       large, incrementally processed data sets.
    """
    def __init__(self):
        self.current_n = 0
        self.current_mean = None
        self.current_M2 = None

    def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
        x = atleast_2d_column_default(x)
        if self.current_mean is None:
            self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
            self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
        # XX this can surely be vectorized but I am feeling lazy:
        for i in range(x.shape[0]):
            self.current_n += 1
            delta = x[i, :] - self.current_mean
            self.current_mean += delta / self.current_n
            self.current_M2 += delta * (x[i, :] - self.current_mean)
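        # A vectorized alternative (untested sketch, using Chan et al.'s
        # pairwise combination formulas) would merge per-chunk statistics:
        #   n_b = x.shape[0]
        #   mean_b = x.mean(axis=0)
        #   M2_b = ((x - mean_b) ** 2).sum(axis=0)
        #   delta = mean_b - self.current_mean
        #   n = self.current_n + n_b
        #   self.current_mean = self.current_mean + delta * n_b / n
        #   self.current_M2 = (self.current_M2 + M2_b
        #                      + delta ** 2 * self.current_n * n_b / n)
        #   self.current_n = n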

    def memorize_finish(self):
        pass

    def transform(self, x, center=True, rescale=True, ddof=0):
        # XX: this forces all inputs to double-precision real, even if the
        # input is single- or extended-precision or complex. But I got all
        # tangled up in knots trying to do that without breaking something
        # else (e.g. by requiring an extra copy).
        x = asarray_or_pandas(x, copy=True, dtype=float)
        x_2d = atleast_2d_column_default(x, preserve_pandas=True)
        if center:
            x_2d -= self.current_mean
        if rescale:
            x_2d /= np.sqrt(self.current_M2 / (self.current_n - ddof))
        return pandas_friendly_reshape(x_2d, x.shape)

standardize = stateful_transform(Standardize)
# R compatibility:
scale = standardize
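
# A quick check of the behavior (for these two points the mean is 2 and the
# ML standard deviation is 1, so the ddof=0 output is just the deviations):
#   standardize([1.0, 3.0])           # -> array([-1., 1.])
#   standardize([1.0, 3.0], ddof=1)   # -> array([-0.70710678, 0.70710678])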