/usr/share/octave/packages/statistics-1.3.0/normalise_distribution.m is in octave-statistics 1.3.0-4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 | ## Copyright (C) 2011 Alexander Klein <alexander.klein@math.uni-giessen.de>
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.
## -*- texinfo -*-
## @deftypefn{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA})
## @deftypefnx{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA}, @var{DISTRIBUTION})
## @deftypefnx{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA}, @var{DISTRIBUTION}, @var{DIMENSION})
##
## Transform a set of data so as to be N(0,1) distributed according to an idea
## by van Albada and Robinson.
## This is achieved by first passing it through its own cumulative distribution
## function (CDF) in order to get a uniform distribution, and then mapping
## the uniform to a normal distribution.
## The data must be passed as a vector or matrix in @var{DATA}.
## If the CDF is unknown, then [] can be passed in @var{DISTRIBUTION}, and in
## this case the empirical CDF will be used.
## Otherwise, if the CDFs for all data are known, they can be passed in
## @var{DISTRIBUTION},
## either in the form of a single function name as a string,
## or a single function handle,
## or a cell array consisting of either all function names as strings,
## or all function handles.
## In the latter case, the number of CDFs passed must match the number
## of rows, or columns respectively, to normalise.
## If the data are passed as a matrix, then the transformation will
## operate either along the first non-singleton dimension,
## or along @var{DIMENSION} if present.
##
## Notes:
## The empirical CDF will map any two sets of data
## having the same size and their ties in the same places after sorting
## to some permutation of the same normalised data:
## @example
## @code{normalise_distribution([1 2 2 3 4])}
## @result{} -1.28 0.00 0.00 0.52 1.28
##
## @code{normalise_distribution([1 10 100 10 1000])}
## @result{} -1.28 0.00 0.52 0.00 1.28
## @end example
##
## Original source:
## S.J. van Albada, P.A. Robinson
## "Transformation of arbitrary distributions to the
## normal distribution with application to EEG
## test-retest reliability"
## Journal of Neuroscience Methods, Volume 161, Issue 2,
## 15 April 2007, Pages 205-211
## ISSN 0165-0270, 10.1016/j.jneumeth.2006.11.004.
## (http://www.sciencedirect.com/science/article/pii/S0165027006005668)
## @end deftypefn
function [ normalised ] = normalise_distribution ( data, distribution, dimension )
if ( nargin < 1 || nargin > 3 )
print_usage;
elseif ( !ismatrix ( data ) || length ( size ( data ) ) > 2 )
error ( "First argument must be a vector or matrix" );
end
if ( nargin >= 2 )
if ( !isempty ( distribution ) )
#Wrap a single handle in a cell array.
if ( strcmp ( typeinfo ( distribution ), typeinfo ( @(x)(x) ) ) )
distribution = { distribution };
#Do we have a string argument instead?
elseif ( ischar ( distribution ) )
##Is it a single string?
if ( rows ( distribution ) == 1 )
distribution = { str2func( distribution ) };
else
error ( ["Second argument cannot contain more than one string" ...
" unless in a cell array"] );
end
##Do we have a cell array of distributions instead?
elseif ( iscell ( distribution ) )
##Does it consist of strings only?
if ( all ( cellfun ( @ischar, distribution ) ) )
distribution = cellfun ( @str2func, distribution, "UniformOutput", false );
end
##Does it eventually consist of function handles only
if ( !all ( cellfun ( @ ( h ) ( strcmp ( typeinfo ( h ), typeinfo ( @(x)(x) ) ) ), distribution ) ) )
error ( ["Second argument must contain either" ...
" a single function name or handle or " ...
" a cell array of either all function names or handles!"] );
end
else
error ( "Illegal second argument: ", typeinfo ( distribution ) );
end
end
else
distribution = [];
end
if ( nargin == 3 )
if ( !isscalar ( dimension ) || ( dimension != 1 && dimension != 2 ) )
error ( "Third argument must be either 1 or 2" );
end
else
if ( isvector ( data ) && rows ( data ) == 1 )
dimension = 2;
else
dimension = 1;
end
end
trp = ( dimension == 2 );
if ( trp )
data = data';
end
r = rows ( data );
c = columns ( data );
normalised = NA ( r, c );
##Do we know the distribution of the sample?
if ( isempty ( distribution ) )
precomputed_normalisation = [];
for k = 1 : columns ( data )
##Note that this line is in accordance with equation (16) in the
##original text. The author's original program, however, produces
##different values in the presence of ties, namely those you'd
##get replacing "last" by "first".
[ uniq, indices ] = unique ( sort ( data ( :, k ) ), "last" );
##Does the sample have ties?
if ( rows ( uniq ) != r )
##Transform to uniform, then normal distribution.
uniform = ( indices - 1/2 ) / r;
normal = norminv ( uniform );
else
## Without ties everything is pretty much straightforward as
## stated in the text.
if ( isempty ( precomputed_normalisation ) )
precomputed_normalisation = norminv ( 1 / (2*r) : 1/r : 1 - 1 / (2*r) );
end
normal = precomputed_normalisation;
end
#Find the original indices in the unsorted sample.
#This somewhat quirky way of doing it is still faster than
#using a for-loop.
[ ignore, ignore, target_indices ] = unique ( data (:, k ) );
#Put normalised values in the places where they belong.
f_remap = @( k ) ( normal ( k ) );
normalised ( :, k ) = arrayfun ( f_remap, target_indices );
end
else
##With known distributions, everything boils down to a few lines of code
##The same distribution for all data?
if ( all ( size ( distribution ) == 1 ) )
normalised = norminv ( distribution {1,1} ( data ) );
elseif ( length ( vec ( distribution ) ) == c )
for k = 1 : c
normalised ( :, k ) = norminv ( distribution { k } ( data ) ( :, k ) );
end
else
error ( "Number of distributions does not match data size! ")
end
end
if ( trp )
normalised = normalised';
end
endfunction
%!test
%! v = normalise_distribution ( [ 1 2 3 ], [], 1 );
%! assert ( v, [ 0 0 0 ] )
%!test
%! v = normalise_distribution ( [ 1 2 3 ], [], 2 );
%! assert ( v, norminv ( [ 1 3 5 ] / 6 ), 3 * eps )
%!test
%! v = normalise_distribution ( [ 1 2 3 ]', [], 2 );
%! assert ( v, [ 0 0 0 ]' )
%!test
%! v = normalise_distribution ( [ 1 2 3 ]' , [], 1 );
%! assert ( v, norminv ( [ 1 3 5 ]' / 6 ), 3 * eps )
%!test
%! v = normalise_distribution ( [ 1 1 2 2 3 3 ], [], 2 );
%! assert ( v, norminv ( [ 3 3 7 7 11 11 ] / 12 ), 3 * eps )
%!test
%! v = normalise_distribution ( [ 1 1 2 2 3 3 ]', [], 1 );
%! assert ( v, norminv ( [ 3 3 7 7 11 11 ]' / 12 ), 3 * eps )
%!test
%! A = randn ( 10 );
%! N = normalise_distribution ( A, @normcdf );
%! assert ( A, N, 1000 * eps )
%!xtest
%! A = exprnd ( 1, 100 );
%! N = normalise_distribution ( A, @ ( x ) ( expcdf ( x, 1 ) ) );
%! assert ( mean ( vec ( N ) ), 0, 0.1 )
%! assert ( std ( vec ( N ) ), 1, 0.1 )
%!xtest
%! A = rand (1000,1);
%! N = normalise_distribution ( A, "unifcdf" );
%! assert ( mean ( vec ( N ) ), 0, 0.1 )
%! assert ( std ( vec ( N ) ), 1, 0.1 )
%!xtest
%! A = [rand(1000,1), randn( 1000, 1)];
%! N = normalise_distribution ( A, { "unifcdf", "normcdf" } );
%! assert ( mean ( N ), [ 0, 0 ], 0.1 )
%! assert ( std ( N ), [ 1, 1 ], 0.1 )
%!xtest
%! A = [rand(1000,1), randn( 1000, 1), exprnd( 1, 1000, 1 )]';
%! N = normalise_distribution ( A, { @unifcdf; @normcdf; @( x )( expcdf ( x, 1 ) ) }, 2 );
%! assert ( mean ( N, 2 ), [ 0, 0, 0 ]', 0.1 )
%! assert ( std ( N, [], 2 ), [ 1, 1, 1 ]', 0.1 )
%!xtest
%! A = exprnd ( 1, 1000, 9 ); A ( 300 : 500, 4:6 ) = 17;
%! N = normalise_distribution ( A );
%! assert ( mean ( N ), [ 0 0 0 0.38 0.38 0.38 0 0 0 ], 0.1 );
%! assert ( var ( N ), [ 1 1 1 2.59 2.59 2.59 1 1 1 ], 0.1 );
%!test
%! fail ("normalise_distribution( zeros ( 3, 4 ), { @unifcdf; @normcdf; @( x )( expcdf ( x, 1 ) ) } )", ...
%! "Number of distributions does not match data size!");
|