/usr/lib/python2.7/dist-packages/cogent/cluster/goodness_of_fit.py is in python-cogent 1.9-9.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | #!usr/bin/env python
"""Goodness of fit of Multidimensional Scaling
Implements several functions that measure the degree of correspondence
between an MDS and its coordinates and the original input distances.
See for example: * Johnson & Wichern (2002): Applied Multivariate
Statistical Analysis
"""
import numpy
__author__ = "Andreas Wilm"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Andreas Wilm"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Andreas Wilm"
__email__ = "andreas.wilm@ucd.ie"
__status__ = "Production"
class Stress(object):
"""Degree of correspondence between input distances and an MDS
Stress measures the goodness of fit or degree of correspondence
between distances implied by an MDS mapping and the original
distances. There are many different variants of Stress; Kruskal's
Stress or Stress-1 probably being the most popular one.
"""
def __init__(self, orig_distmat, mds_coords, apply_scaling=True):
"""Setup class by storing pointers to orginal matrix and
calculating distances implied by MDS
By default, implied distances (disparities) are scaled to
match the range of the original distances.
The calculation of the implied distances is the only really
time intensive step, so __init__ will take some time for big
matrices.
Arguments:
* orig_distmat (numpy array or numpy matrix)
original distance matrix
* mds_coords (numpy array)
mds coordinates
* apply_scaling (boolean)
scale distances implied by an MDS mapping to match
those of original distance matrix
"""
# don't use coercion here (orig_distmat to matrix, mds_coords
# to array),to avoid conversion/copy overhead as these
# arrays/matrices can potentially be quite big
#
assert isinstance(orig_distmat, numpy.ndarray), \
"orig_distmat is not a numpy.ndarray instance"
assert isinstance(mds_coords, numpy.ndarray), \
"mds_coords is not a numpy.ndarray instance"
# check array/matrix shapes and sizes
assert len(orig_distmat.shape) == 2, \
"orig_distmat is not a 2D array/matrix."
assert len(mds_coords.shape) == 2, \
"mds_coords is not a 2D array."
assert orig_distmat.shape[0] == mds_coords.shape[0], \
"orig_distmat and mds_coords do not have the same" \
" number of rows/objects."
assert orig_distmat.shape[1] > mds_coords.shape[1], \
"orig_distmat shape bigger than mds_coords shape." \
" Possible argument mixup"
self._orig_distmat = orig_distmat
# compute distances implied by MDS and scale if requested (and
# necessary) by the ratio of maxima of both matrices
#
self._reproduced_distmat = self._calc_pwdist(mds_coords)
if apply_scaling:
max_orig = self._orig_distmat.max()
max_derived = self._reproduced_distmat.max()
scale = max_derived / max_orig
if scale != 1.0:
self._reproduced_distmat = self._reproduced_distmat / scale
def calcKruskalStress(self):
"""Calculate Kruskal's Stress AKA Stress-1
Kruskal's Stress or Stress-1 is defined as:
sqrt( SUM_ij (d'(i,j) - d(i,j))^2 / SUM_ij d(i,j)^2 for i<j
where d(i,j) is the distance between i and j in the original distance
matrix (here: self._orig_distmat), and d'(i,j) the distance implied by an
mds mapping (here: self._reproduced_distmat)
According to Johnson & Wichern (2002): 'Applied Multivariate
Statistical Analysis'' p701 (citing Kruskal (1964)) the
informal interpretation of stress1 is:
Stress [%] Goodness of fit
20 Poor
10 Fair
5 Good
2.5 Excellent
0 Perfect
AFAIK there is no upper bound for stress1.
Remember that the stress of an MDS mapping also depends on the
dimensionality of the MDS, i.e. you will get better Stress
values for higher dimensions.
Arguments:
* None
Returns:
* Kruskal Stress as float
"""
# as we have a symmetrical matrix we can restrict ourselves to
# i<j, but the element-wise computation (looping over i<j) takes
# ages and gives the same result.
# use power here for element-wise exponentiation
# **-operator might fail if we use array instead of matrix
numerator = numpy.sum(
numpy.power(self._reproduced_distmat - self._orig_distmat, 2))
denominator = numpy.sum(numpy.power(self._orig_distmat, 2))
result = numpy.sqrt(numerator / denominator)
return result
def calcSstress(self):
"""Calculate SStress
SStress (Takane, 1977) is defined as
sqrt( SUM_ij (d'(i,j)^2 - d(i,j)^2)^2 / SUM_ij d(i,j)^4 for i<j
where d(i,j) is the distance between i and j in the original
matrix (self._orig_distmat), and d'(i,j) the distance implied by an
mds mapping (self._reproduced_distmat)
Preferred criterion according to Johnson & Wichern (2002)
'Applied Multivariate Statistical Analysis'' (p701)
Value of SStress is always between 0 and 1. Values less then
0.1 mean good representation.
Squaring causes S-Stress to emphasize larger dissimliarities more
than smaller ones (See p204 in 'Modern multidimensional scaling' by
Borg & Groenen, 1997)
Arguments:
* None
Returns:
* Sstress as float
"""
# as we have a symmetrical matrix we can restrict ourselves to
# i<j, but the element-wise computation (looping over i<j) takes
# ages and gives the same result.
# use power here for element-wise exponentiation
# **-operator might fail if we use array instead of matrix
numerator = numpy.power(
self._reproduced_distmat, 2) - numpy.power(self._orig_distmat, 2)
numerator = numpy.sum(numpy.power(numerator, 2))
denominator = numpy.sum(numpy.power(self._orig_distmat, 4))
result = numpy.sqrt(numerator / denominator)
return result
@staticmethod
def _calc_rowdist(row1, row2):
"""Calculate the euclidean distance between two row vectors.
NOTE: This function will be called quite often. To safe some
time, no test will be made if arguments are actually vectors!
This function will happily work away on full matrices and
arrays as well!
Arguments:
* row1 (numpy array or numpy matrix)
row vector, i.e. shape has to be [1],n or n,[1]
* row2 (numpy array or numpy matrix)
row vector, i.e. shape has to be [1],n or n,[1]
Returns:
* distance as float
"""
row_diffsq = numpy.power(row1 - row2, 2)
result = numpy.sqrt(row_diffsq.sum())
return result
def _calc_pwdist(self, mds_coords):
"""Compute a full symmetric distance matrix from MDS coordinates
Calculate pairwise distances between m original distances
(rows) in n-dimensional space (columns)
Note: this is the same as
import scipy.spatial.distance as ssd
ssd.squareform(ssd.pdist(x))
Arguments:
* mds_coords (numpy.ndarray)
mds coordinates, i.e. a recangular 2D array with shape m,n m>n)
Returns
* distance as float
"""
assert isinstance(mds_coords, numpy.ndarray), \
"mds_coords is not a numpy ndarray"
result = numpy.zeros((mds_coords.shape[0], mds_coords.shape[0]))
for i in range(mds_coords.shape[0]):
row_i = mds_coords[i, :]
for j in range(i+1, mds_coords.shape[0]):
result[i, j] = self._calc_rowdist(row_i, mds_coords[j, :])
result[j, i] = result[i, j]
return result
|