This file is indexed.

/usr/include/shark/ObjectiveFunctions/KernelTargetAlignment.h is in libshark-dev 3.1.4+ds1-1ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
/*!
 * 
 *
 * \brief       Kernel Target Alignment - a measure of alignment of a kernel Gram matrix with labels.
 * 
 * 
 *
 * \author      T. Glasmachers, O.Krause
 * \date        2010-2012
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 * 
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 * 
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published 
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef SHARK_OBJECTIVEFUNCTIONS_KERNELTARGETALIGNMENT_H
#define SHARK_OBJECTIVEFUNCTIONS_KERNELTARGETALIGNMENT_H

#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Data/Dataset.h>
#include <shark/Data/Statistics.h>
#include <shark/Models/Kernels/AbstractKernelFunction.h>


namespace shark{

/*!
 *  \brief Kernel Target Alignment - a measure of alignment of a kernel Gram matrix with labels.
 *
 *  \par
 *  The Kernel Target Alignment (KTA) was originally proposed in the paper:<br/>
 *  <i>On Kernel-Target Alignment</i>. N. Cristianini, J. Shawe-Taylor,
 *  A. Elisseeff, J. Kandola. Innovations in Machine Learning, 2006.<br/>
 *  Here we provide a version with centering of the features as proposed
 *  in the paper:<br/>
 *  <i>Two-Stage Learning Kernel Algorithms</i>. C. Cortes, M. Mohri,
 *  A. Rostamizadeh. ICML 2010.<br/>
 *
 *  \par
 *  The kernel target alignment is defined as
 *  \f[ \hat A = \frac{\langle K, y y^T \rangle}{\sqrt{\langle K, K \rangle \cdot \langle y y^T, y y^T \rangle}} \f]
 *  where K is the kernel Gram matrix of the data and y is the vector of
 *  +1/-1 valued labels. The outer product \f$ y y^T \f$ corresponds to
 *  an &quot;ideal&quot; Gram matrix corresponding to a kernel that maps
 *  the two classes each to a single point, thus minimizing within-class
 *  distance for fixed inter-class distance. The inner products denote the
 *  Frobenius product of matrices:
 *  http://en.wikipedia.org/wiki/Matrix_multiplication#Frobenius_product
 *
 *  \par
 *  In kernel-based learning, the kernel Gram matrix K is of the form
 *  \f[ K_{i,j} = k(x_i, x_j) = \langle \phi(x_i), \phi(x_j) \rangle \f]
 *  for a Mercer kernel function k and inputs \f$ x_i, x_j \f$. In this
 *  version of the KTA we use centered feature vectors. Let
 *  \f[ \psi(x_i) = \phi(x_i) - \frac{1}{\ell} \sum_{j=1}^{\ell} \phi(x_j) \f]
 *  denote the centered feature vectors, then the centered Gram matrix
 *  \f$ K^c \f$ is given by
 *  \f[ K^c_{i,j} = \langle \psi(x_i), \psi(x_j) \rangle = K_{i,j} - \frac{1}{\ell} \sum_{n=1}^\ell K_{i,n} + K_{j,n} + \frac{1}{\ell^2} \sum_{m,n=1}^\ell K_{n,m} \f]
 *  The alignment measure computed by this class is the exact same formula
 *  for \f$ \hat A \f$, but with \f$ K^c \f$ plugged in in place of $\f$ K \f$.
 *
 *  \par
 *  KTA measures the Frobenius inner product between a kernel Gram matrix
 *  and this ideal matrix. The interpretation is that KTA measures how
 *  well a given kernel fits a classification problem. The actual measure
 *  is invariant under kernel rescaling.
 *  In Shark, objective functions are minimized by convention. Therefore
 *  the negative alignment \f$ - \hat A \f$ is implemented. The measure is
 *  extended for multi-class problems by using prototype vectors instead
 *  of scalar labels.
 *
 *  \par
 *  The following properties of KTA are important from a model selection
 *  point of view: it is relatively fast and easy to compute, it is
 *  differentiable w.r.t. the kernel function, and it is independent of
 *  the actual classifier.
 *
 *  \par
 *  The following notation is used in several of the methods of the class.
 *  \f$ K^c \f$ denotes the centered Gram matrix, y is the vector of labels,
 *  Y is the outer product of this vector with itself, k is the row
 *  (or column) wise average of the uncentered Gram matrix K, my is the
 *  label average, and u is the vector of all ones, and \f$ \ell \f$ is the
 *  number of data points, and thus the size of the Gram matrix.
 */
template<class InputType = RealVector,class LabelType = unsigned int>
class KernelTargetAlignment : public SingleObjectiveFunction
{
private:
	typedef typename Batch<LabelType>::type BatchLabelType;
public:
	/// \brief Construction of the Kernel Target Alignment (KTA) from a kernel object.
	///
	/// Don't forget to provide a data set with the setDataset method
	/// before using the object.
	KernelTargetAlignment(
		LabeledData<InputType, LabelType> const& dataset, 
		AbstractKernelFunction<InputType>* kernel
	){
		SHARK_CHECK(kernel != NULL, "[KernelTargetAlignment] kernel must not be NULL");
		
		mep_kernel = kernel;
		
		m_features|=HAS_VALUE;
		m_features|=CAN_PROPOSE_STARTING_POINT;
		
		if(mep_kernel -> hasFirstParameterDerivative())
			m_features|=HAS_FIRST_DERIVATIVE;
		
		m_data = dataset;
		m_elements = dataset.numberOfElements();
		
		
		setupY(dataset.labels());
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "KernelTargetAlignment"; }

	/// Return the current kernel parameters as a starting point for an optimization run.
	SearchPointType proposeStartingPoint() const {
		return  mep_kernel -> parameterVector();
	}
	
	
	std::size_t numberOfVariables()const{
		return mep_kernel->numberOfParameters();
	}

	/// \brief Evaluate the (centered, negative) Kernel Target Alignment (KTA).
	///
	/// See the class description for more details on this computation.
	double eval(RealVector const& input) const{
		mep_kernel->setParameterVector(input);

		return -evaluateKernelMatrix().error;
	}

	/// \brief Compute the derivative of the KTA as a function of the kernel parameters.
	///
	/// It holds:
	/// \f[ \langle K^c, K^c \rangle = \langle K,K \rangle  -2 \ell \langle k,k \rangle  + mk^2 \ell^2 \\
	///     (\langle  K^c, K^c  \rangle )'  = 2 \langle K,K' \rangle  -4\ell \langle k, \frac{1}{\ell} \sum_j K'_ij \rangle  +2 \ell^2 mk \sum_ij 1/(\ell^2) K'_ij \\
	///   = 2 \langle K,K' \rangle  -4 \langle k, \sum_j K'_ij \rangle + 2 mk \sum_ij K_ij \\
	///   = 2 \langle K,K' \rangle  -4 \langle k u^T, K' \rangle + 2 mk \langle  u u^T, K' \rangle \\
	///   = 2\langle K -2 k u^T + mk u u^T, K' \rangle ) \\
	///     \langle Y, K^c \rangle  = \langle Y, K \rangle  - 2 n \langle y, k \rangle  + n^2 my mk \\
	///     (\langle  Y  , K^c  \rangle )' =   \langle Y -2 y u^T + my u u^T, K'  \rangle \f]
	/// now the derivative is computed from this values in a second sweep over the data:
	/// we get:
	/// \f[ \hat A' = 1/\langle K^c,K^c \rangle ^{3/2} (\langle K^c,K^c \rangle  (\langle Y,K^c \rangle )' - 0.5*\langle Y, K^c \rangle  (\langle  K^c , K^c \rangle )') \\
	///    = 1/\langle K^c,K^c \rangle ^{3/2} \langle  \langle K^c,K^c \rangle  (Y -2 y u^T + my u u^T)- \langle Y, K^c \rangle (K -2 k u^T+ mk u u^T),K'  \rangle \\
	///    = 1/\langle K^c,K^c \rangle ^{3/2} \langle W,K' \rangle \f]
	///reordering rsults in
	/// \f[ W= \langle K^c,K^c \rangle  Y - \langle Y, K^c \rangle K \\
	///     - 2 (\langle K^c,K^c \rangle y - \langle Y, K^c \rangle k) u^T \\
	///     +   (\langle K^c,K^c \rangle my - \langle Y, K^c \rangle mk) u u^T \f]
	/// where \f$ K' \f$ is the derivative of K with respct of the kernel parameters.
	ResultType evalDerivative( const SearchPointType & input, FirstOrderDerivative & derivative ) const {
		mep_kernel->setParameterVector(input);
		// the drivative is calculated in two sweeps of the data. first the required statistics
		// \langle K^c,K^c \rangle , mk and k are evaluated exactly as in eval

		KernelMatrixResults results = evaluateKernelMatrix();
				
		std::size_t parameters = mep_kernel->numberOfParameters();
		derivative.resize(parameters);
		derivative.clear();
		SHARK_PARALLEL_FOR(int i = 0; i < (int)m_data.numberOfBatches(); ++i){
			std::size_t startX = 0;
			for(int j = 0; j != i; ++j){
				startX+= size(m_data.batch(j));
			}
			RealVector threadDerivative(parameters,0.0);
			RealVector blockDerivative;
			boost::shared_ptr<State> state = mep_kernel->createState();
			RealMatrix blockK;//block of the KernelMatrix
			RealMatrix blockW;//block of the WeightMatrix
			std::size_t startY = 0;
			for(int j = 0; j <= i; ++j){
				mep_kernel->eval(m_data.batch(i).input,m_data.batch(j).input,blockK,*state);
				mep_kernel->weightedParameterDerivative(
					m_data.batch(i).input,m_data.batch(j).input,
					generateDerivativeWeightBlock(i,j,startX,startY,blockK,results),//takes symmetry into account
					*state,
					blockDerivative
				);
				noalias(threadDerivative) += blockDerivative;
				startY += size(m_data.batch(j));
			}
			SHARK_CRITICAL_REGION{
				noalias(derivative) += threadDerivative;
			}
		}
		derivative *= -1;
		return -results.error;
	}

private:
	AbstractKernelFunction<InputType>* mep_kernel;     ///< kernel function
	LabeledData<InputType,LabelType> m_data;      ///< data points
	RealVector m_columnMeanY;                        ///< mean label vector
	double m_meanY;                                  ///< mean label element
	std::size_t m_numberOfClasses;                  ///< number of classes
	std::size_t m_elements;                          ///< number of data points

	struct KernelMatrixResults{
		RealVector k;
		double KcKc;
		double YcKc;
		double error;
		double meanK;
	};
	
	void setupY(Data<unsigned int>const& labels){
		//preprocess Y so calculate column means and overall mean
		//the most efficient way to do this is via the class counts
		std::vector<std::size_t> classCount = classSizes(labels);
		m_numberOfClasses = classCount.size();
		RealVector classMean(m_numberOfClasses);
		double dm1 = m_numberOfClasses-1.0;
		for(std::size_t i = 0; i != m_numberOfClasses; ++i){
			classMean(i) = classCount[i]-(m_elements-classCount[i])/dm1;
			classMean /= m_elements;
		}
		
		m_columnMeanY.resize(m_elements);
		for(std::size_t i = 0; i != m_elements; ++i){
			m_columnMeanY(i) = classMean(labels.element(i)); 
		}
		m_meanY=sum(m_columnMeanY)/m_elements;
	}
	
	void setupY(Data<RealVector>const& labels){
		RealVector meanLabel = mean(labels);
		m_columnMeanY.resize(m_elements);
		for(std::size_t i = 0; i != m_elements; ++i){
			m_columnMeanY(i) = inner_prod(labels.element(i),meanLabel); 
		}
		m_meanY=sum(m_columnMeanY)/m_elements;
	}

	/// Update a sub-block of the matrix \f$ \langle Y, K^x \rangle \f$.
	double updateYKc(UIntVector const& labelsi,UIntVector const& labelsj, RealMatrix const& block)const{
		std::size_t blockSize1 = labelsi.size();
		std::size_t blockSize2 = labelsj.size();
		//todo optimize the i=j case
		double result = 0;
		double dm1 = m_numberOfClasses-1.0;
		for(std::size_t k = 0; k != blockSize1; ++k){
			for(std::size_t l = 0; l != blockSize2; ++l){
				if(labelsi(k) == labelsj(l))
					result += block(k,l);
				else
					result -= block(k,l)/dm1;
			}
		}
		return result;
	}
	
	/// Update a sub-block of the matrix \f$ \langle Y, K^x \rangle \f$.
	double updateYKc(RealMatrix const& labelsi,RealMatrix const& labelsj, RealMatrix const& block)const{
		std::size_t blockSize1 = labelsi.size1();
		std::size_t blockSize2 = labelsj.size1();
		//todo optimize the i=j case
		double result = 0;
		for(std::size_t k = 0; k != blockSize1; ++k){
			for(std::size_t l = 0; l != blockSize2; ++l){
				double y_kl = inner_prod(row(labelsi,k),row(labelsj,l));
				result += y_kl*block(k,l);
			}
		}
		return result;
	}
	
	void computeBlockY(UIntVector const& labelsi,UIntVector const& labelsj, RealMatrix& blockY)const{
		std::size_t blockSize1 = labelsi.size();
		std::size_t blockSize2 = labelsj.size();
		double dm1 = m_numberOfClasses-1.0;
		for(std::size_t k = 0; k != blockSize1; ++k){
			for(std::size_t l = 0; l != blockSize2; ++l){
				if( labelsi(k) ==  labelsj(l))
					blockY(k,l) = 1;
				else
					blockY(k,l) = -1.0/dm1;
			}
		}
	}
	
	void computeBlockY(RealMatrix const& labelsi,RealMatrix const& labelsj, RealMatrix& blockY)const{
		std::size_t blockSize1 = labelsi.size1();
		std::size_t blockSize2 = labelsj.size1();
		for(std::size_t k = 0; k != blockSize1; ++k){
			for(std::size_t l = 0; l != blockSize2; ++l){
				blockY(k,l) = inner_prod(row(labelsi,k),row(labelsj,l));
			}
		}
	}

	/// Compute a sub-block of the matrix
	/// \f[ W = \langle K^c, K^c \rangle Y - \langle Y, K^c \rangle K -2 (\langle K^c, K^c \rangle y - \langle Y, K^c \rangle k) u^T + (\langle K^c, K^c \rangle my - \langle Y, K^c \rangle mk) u u^T \f]
	RealMatrix generateDerivativeWeightBlock(
		std::size_t i, std::size_t j, 
		std::size_t start1, std::size_t start2, 
		RealMatrix const& blockK, 
		KernelMatrixResults const& matrixStatistics
	)const{
		std::size_t blockSize1 = size(m_data.batch(i));
		std::size_t blockSize2 = size(m_data.batch(j));
		//double n = m_elements;
		double KcKc = matrixStatistics.KcKc;
		double YcKc = matrixStatistics.YcKc;
		double meanK = matrixStatistics.meanK;
		RealMatrix blockW(blockSize1,blockSize2);
		
		//first calculate \langle Kc,Kc \rangle  Y.
		computeBlockY(m_data.batch(i).label,m_data.batch(j).label,blockW);
		blockW *= KcKc;
		//- \langle Y,K^c \rangle K
		blockW-=YcKc*blockK;
		//  -2(\langle Kc,Kc \rangle y -\langle Y, K^c \rangle  k) u^T
		// implmented as: -(\langle K^c,K^c \rangle y -2\langle Y, K^c \rangle  k) u^T -u^T(\langle K^c,K^c \rangle y -2\langle Y, K^c \rangle  k)^T
		//todo find out why this is correct and the calculation above is not.
		blockW-=repeat(subrange(KcKc*m_columnMeanY - YcKc*matrixStatistics.k,start2,start2+blockSize2),blockSize1);
		blockW-=trans(repeat(subrange(KcKc*m_columnMeanY - YcKc*matrixStatistics.k,start1,start1+blockSize1),blockSize2));
		// + (\langle Kc,Kc \rangle  my-2\langle Y, Kc \rangle mk) u u^T
		blockW+= KcKc*m_meanY-YcKc*meanK;
		blockW /= KcKc*std::sqrt(KcKc);
		//std::cout<<blockW<<std::endl;
		//symmetry
		if(i != j)
			blockW *= 2.0;
		return blockW;
	}

	/// \brief Evaluate the centered kernel Gram matrix.
	///
	/// The computation is as follows. By means of a
	/// number of identities it holds
	/// \f[ \langle K^c, K^c \rangle = \\
	///     \langle K^c, K^c \rangle  = \langle K,K \rangle  -2n\langle k,k \rangle  +mk^2n^2 \\
	///     \langle K^c, Y \rangle  = \langle K, Y \rangle  - 2 n \langle k, y \rangle  + n^2 mk my \f]
	/// where k is the row mean over K and y the row mean over y, mk, my the total means of K and Y 
	/// and n the number of elements
	KernelMatrixResults evaluateKernelMatrix()const{
		//it holds
		// \langle K^c,K^c \rangle  = \langle K,K \rangle  -2n\langle k,k \rangle  +mk^2n^2
		// \langle K^c,Y \rangle  = \langle K, Y \rangle  - 2 n \langle k, y \rangle  + n^2 mk my
		// where k is the row mean over K and y the row mean over y, mk, my the total means of K and Y 
		// and n the number of elements
		
		double KK = 0; //stores \langle K,K \rangle 
		double YKc = 0; //stores \langle Y,K^c \rangle 
		RealVector k(m_elements,0.0);//stores the row/column means of K
		SHARK_PARALLEL_FOR(int i = 0; i < (int)m_data.numberOfBatches(); ++i){
			std::size_t startRow = 0;
			for(int j = 0; j != i; ++j){
				startRow+= size(m_data.batch(j));
			}
			std::size_t rowSize = size(m_data.batch(i));
			double threadKK = 0;
			double threadYKc = 0;
			RealVector threadk(m_elements,0.0);
			std::size_t startColumn = 0; //starting column of the current block
			for(int j = 0; j <= i; ++j){
				std::size_t columnSize = size(m_data.batch(j));
				RealMatrix blockK = (*mep_kernel)(m_data.batch(i).input,m_data.batch(j).input);
				if(i == j){
					threadKK += frobenius_prod(blockK,blockK);
					subrange(threadk,startColumn,startColumn+columnSize)+=sum_rows(blockK);//update sum_rows(K)
					threadYKc += updateYKc(m_data.batch(i).label,m_data.batch(j).label,blockK);
				}
				else{//use symmetry ok K
					threadKK += 2.0 * frobenius_prod(blockK,blockK);
					subrange(threadk,startColumn,startColumn+columnSize)+=sum_rows(blockK);
					subrange(threadk,startRow,startRow+rowSize)+=sum_columns(blockK);//symmetry: block(j,i)
					threadYKc += 2.0 * updateYKc(m_data.batch(i).label,m_data.batch(j).label,blockK);
				}
				startColumn+=columnSize;
			}
			SHARK_CRITICAL_REGION{
				KK += threadKK;
				YKc +=threadYKc;
				noalias(k) +=threadk;
			}
		}
		//calculate the error
		double n = m_elements;
		k /= n;//means
		double meanK = sum(k)/n;
		double n2 = sqr(n);
		double YcKc = YKc-2.0*n*inner_prod(k,m_columnMeanY)+n2*m_meanY*meanK;
		double KcKc = KK - 2.0*n*inner_prod(k,k)+n2*sqr(meanK);

		KernelMatrixResults results;
		results.k=k;
		results.YcKc = YcKc;
		results.KcKc = KcKc;
		results.meanK = meanK;
		results.error = YcKc/std::sqrt(KcKc);
		return results;
	}
};


}
#endif