This file is indexed.

/usr/share/psychtoolbox-3/PsychTests/ConvolutionKernelTest.m is in psychtoolbox-3-common 3.0.12.20160126.dfsg1-1ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
function [passed difference speedup] = ConvolutionKernelTest(win, nrinchannels, nroutchannels, kernel1, kernel2, imgsize, shadertype, debug)
% [passed difference speedup] = ConvolutionKernelTest(win, nrinchannels, nroutchannels, kernel1, kernel2, imgsize, shadertype, debug)
%
% Test Psychtoolbox imaging pipeline's 2D convolution shaders for
% correctness and accuracy, perform speed benchmark, return fastest setup,
% when using a specific (pair) of kernel(s) and parameters.
%
% This routine builds and tests a set of convolution shaders from the given
% convolution kernel (or pair of kernels for separable dual-pass
% convolution). Each shader is compared against the results of
% Matlabs/Octaves conv2 function, applied to a random noise luminance image
% matrix. The shader is tagged as working correctly if the conv2 result and
% PTB's result do not disagree by more than 1 unit at any location in the
% convolved output images. Accuracy (maximum difference) is reported. All
% correctly working shaders are then benchmarked for speed during a test
% period of 10 seconds and the speedup of the GPU vs. Matlab (CPU) is
% determined and reported. At the end, the best configuration (wrt.
% correctness, accuracy and speed) is reported/recommended for use with the
% given kernel.
%
% The routine takes at least 10 seconds per tested shader, so a full test
% run will take at least 4*10 = 40 seconds, probably a bit more for setup
% and shutdown. Status messages will tell you about progress of the
% operation. You shouldn't use your machine and don't run any other
% applications during benchmarking, otherwise the measured speedup numbers
% may be wrong due to GPU or CPU overload.
%
% Optional parameters and their defaults:
%
% 'win' Window handle of the onscreen window to test on. If none provided,
% will open a suitable one by itself on screen 0.
%
% 'nrinchannels' number of image color channels in test image: Default is 1
% for pure luminance convolution. This script always only tests the first
% channel (red/luminance) for correctness/accuracy, even on multi-channel
% images, but choice of channels will affect overall correctness and speed.
%
% 'nroutchannels' number of image output channels from convolution: By
% default 1 == convolve, replicate result to RGB channels, pass alpha
% through. 3 == Convolve RGB separately, pass through alpha. 4 == Convolve
% RGBA separately.
%
% 'kernel1' == 2D kernel or first 1D kernel in separable mode.
% 'kernel2' == 1D kernel for 2nd pass in separable convolution test.
%
% 'imgsize' == Either size of the random noise test image (default =
% 512x512), or a Matlab image matrix to test on.
%
% 'shadertype' == Vector of mode ids: Tests all modes in the vector. By
% default all shadertypes are tested ie shadertype = [0 1 2 3]. PTB
% provides different implementations of convolution (0,1,2,3) which may
% have different accuracy and performance for a given hardware and kernel.
% The shaders provided in this vector will be tested against each other.
%
% 'debug'Defaults to zero (no output): Amount of debug output to write to
% Matlab window.
%
% THIS SCRIPT IS NOT COMPLETELY FINISHED YET.

% Some Benchmark results for convolution:
%
% MacOS/X 10.4.10 + Matlab 7.1 + PowerPC G5 1.6 Ghz single core vs. NVidia
% GeforceFX-5200 Ultra:
%
% Gaussian kernel, 2D single pass, 1->1 channels, 512 x 512 image:
% Kernel of size 3x3: Speedup 2.5x
%
% Gaussian kernel, 1D separable dual pass, 1->1 channels, 512 x 512 image:
% Kernels of size 9: Speedup 3.5x
%
% -> Kernels greater than 3x3 2D or 9 1D are unsupported on GF5200, can
% cause a system crash!
%
% WindowsXP + Matlab 7.4 + PentiumIV, 3.0 Ghz DualCore (CPU) vs. NVidia
% Geforce7800-GTX:
%
% Case 1: Gaussian kernel, 2D single-pass, 1->1 channels, 512 x 512 image:
% 
% Kernel of size 3x3: Speedup   21.0x
% Kernel of size 5x5: Speedup   12.3x
% Kernel of size 7x7: Speedup   10.5x
% Kernel of size 9x9: Speedup    3.4x
% Kernel of size 11x11: Speedup  6.9x
% Kernel of size 13x13: Speedup  6.8x
% Kernel of size 15x15: Speedup  7.0x
% Kernel of size 17x17: Speedup  6.8x
% Kernel of size 19x19: Speedup  6.9x
%
% 3x3 Sobel, Prewitt, Laplace: Speedup 70x
%
% Case 2: Gaussian kernel, 1D dual-pass, 1->1 channels, 512 x 512 image:
% size   5: 0.65ms vs. 21.7ms: Speedup 33.0x
% size  15: 1.62ms vs. 55.5ms: Speedup 34.3x
% size  33: 3.78ms vs. 115ms:  Speedup 30.6x
% size  65: 7.57ms vs. 242ms:  Speedup 32.0x
% size 129:14.85ms vs. 603ms:  Speedup 40.6x
%
% --> 4-channels vs. 1 channel: CPU x4, GPU const. ---> Speedup times 4!
%
% History:
% 10/13/2007 Written (MK).
% 06/13/2009 Remove Octave special case handling.

global GL;

% Prepare test for this configuration:
% ------------------------------------

% Parse inputs, assign defaults:
if nargin < 2 || isempty(nrinchannels)
    nrinchannels = 1;
end

if nargin < 3 || isempty(nroutchannels)
    nroutchannels = 1;
end

if nargin < 4 || isempty(kernel1)
    error('You must provide at least kernel1 !!');
end

if nargin < 5 || isempty(kernel2)
    kernel2 = [];
end

noiseimg = [];
if nargin < 6 || isempty(imgsize)
    imgsize = 512;
else
    if length(imgsize)>1
        % imgsize is an image matrix:
        noiseimg = imgsize;
        imgsize = length(imgsize);
    end
end

if nargin < 7 || isempty(shadertype)
    shadertype = [0 1 2 3];
end

if nargin < 8 || isempty(debug)
    debug = 0;
end

if nargin < 1 || isempty(win) || win < 10
    % No window provided: Open a suitable one:
    if nargin >=1 && ~isempty(win) && win < 10
        screenid = win;
        win = 0;
    else
        screenid = max(Screen('Screens'));
    end
    win = Screen('OpenWindow', screenid, 0, [], [], [], [], [], kPsychNeedFastBackingStore);
    doclose = 1;
else
    doclose = 0;
end

try

    Screen('TextSize', win, 24);
    Screen('TextColor', win, 255);
    DrawFormattedText(win, 'Preparing test\nThis can take multiple seconds\nPlease standby...', 'center', 'center');
    Screen('Flip', win);

    % Create operator's:
    for i=1:length(shadertype)
        % Store shader type:
        gloperatorid(i) = shadertype(i);

        % Create operator of maximum precision:
        gloperator(i) = CreateGLOperator(win, kPsychNeed32BPCFloat);

        try
            % Separable or non-separable?
            if isempty(kernel2)
                % Non-separable: Full blown single-pass 2D convolution:
                Add2DConvolutionToGLOperator(gloperator(i), kernel1, [], nrinchannels, nroutchannels, debug, shadertype(i));
            else
                % Separable: Two 1D convolutions:
                Add2DSeparableConvolutionToGLOperator(gloperator(i), kernel1, kernel2, [], nrinchannels, nroutchannels, debug, shadertype(i));
            end
        catch
            % Shader creation failed, e.g., GLSL compile/link failure due
            % to shader which would overload hardware ressources. Mark this
            % mode as invalid:
            gloperator(i)=-1;
            le = psychlasterror;
            fprintf('Failed to create shader: %s\n', le.message);
        end
    end
    
    % Test correctness and accuracy:

    % Build test-image: Random noise with mean 128 and stddev. +/- 50:
    if isempty(noiseimg)
        noiseimg=(50*randn(imgsize, imgsize) + 128);
    end

    % Cast to uint8 to make sure both CPU and GPU get the same uint8 input
    % data:
    noiseimg=uint8(noiseimg);

    % Convert it to a texture 'tex':
    tex=Screen('MakeTexture', win, noiseimg);

    % Show it onscreen:
    Screen('DrawTexture', win, tex, [], Screen('Rect', tex));
    Screen('Flip', win);

    % Cast back to single precision for CPU:
    noiseimg=single(noiseimg(:,:,1));
    kernel1= single(kernel1);
    kernel2= single(kernel2);

    kernel = kernel1;
    
    for i=1:length(shadertype)
        if gloperator(i)==-1
            maxdiff(i)=inf;
            fprintf('Shadertype %i: Excluded - Beyond hardware limits.\n', shadertype(i));
            continue;
        end
        
        % On GPU: Apply filter to texture:
        xtex = Screen('TransformTexture', tex, gloperator(i));

        % On CPU: Do the same with conv2 routine of Matlab/Octave:
        if isempty(kernel2)
            % Non-separable convolution with single-precision kernel:
            ref = conv2(noiseimg, kernel1, 'same');
        else
            % Separable dual-pass convolution with single-precision kernels:
            ref = conv2(kernel1, kernel2, noiseimg, 'same');
        end

        % Readback result from GPU with highest precision (ie. float):
        clear gpu;
        gpu = Screen('GetImage', xtex, [], [], 1, 1) * 255;

        % Compute difference matrix:
        difference = gpu(:,:,1) - ref;

        % Only look at inner region, ie. ignore borders the size of the kernel
        % to make sure we don't get confused by boundary artefacts. Compute
        % absolute difference ie. discard sign:
        kernel = kernel1;
        difference = abs(difference(length(kernel):end-length(kernel), length(kernel):end-length(kernel)));

        % Compute maximum difference:
        maxdiff(i) = max(max(difference));

        % Show original and xformed tex onscreen:
        Screen('DrawTexture', win, tex, [], Screen('Rect', tex));
        Screen('DrawTexture', win, xtex, [], AdjoinRect(Screen('Rect', xtex), Screen('Rect', tex), RectRight));
        DrawFormattedText(win, sprintf('Result of mode %i.\nTesting...\n', shadertype(i)), 'center', 'center');
        Screen('Flip', win);

        % Discard old 'xtex':
        Screen('Close', xtex);

        fprintf('Shadertype %i: Maximum difference: %f units. --> ', shadertype(i), maxdiff(i));
        if maxdiff(i) < 1
            fprintf('PASSED!\n');
        else
            fprintf('FAILED!\n');
        end

        % Next shadertype...
    end

    % For those tests that passed, measure performance vs. Matlab/Octave:
    for i=1:length(shadertype)
        if maxdiff(i) < 1
            % Perform 5 seconds of testing on GPU:

            % On GPU: Apply filter to texture once to preheat:
            xtex = Screen('TransformTexture', tex, gloperator(i));

            Screen('DrawTexture', win, tex, [], Screen('Rect', tex));
            Screen('DrawTexture', win, xtex, [], AdjoinRect(Screen('Rect', xtex), Screen('Rect', tex), RectRight),0,0);
            DrawFormattedText(win, sprintf('Benchmarking mode %i.\nCan take more than 10 seconds - Please wait...\n', shadertype(i)), 'center', 'center');
            Screen('Flip', win);

            % Prepare test:
            glFinish;

            count = 0;
            tstart=GetSecs;
            % Do as many xforms as possible in 5 seconds...
            while GetSecs - tstart < 5
                % We recycle the cached xtex target texture as we would do in a
                % real experiment script:
                xtex = Screen('TransformTexture', tex, gloperator(i), [], xtex);
                % Increment counter:
                count = count + 1;
            end

            % Done. Sync the pipeline, take elapsed time:
            glFinish;
            telapsed = GetSecs - tstart;

            avggpu(i) = telapsed / count;

            % Retain last texture for double-checking:
            clear gpu;
            %err = glGetError;
            %fprintf('GL-ERROR1: %i %s\n', err, gluErrorString(err));
            gpu = Screen('GetImage', xtex, [], [], 1, 1) * 255;
            %err = glGetError;
            %fprintf('GL-ERROR2: %i %s\n', err, gluErrorString(err));

            % Compute difference matrix:
            difference = gpu(:,:,1) - ref;

            % Only look at inner region, ie. ignore borders the size of the kernel
            % to make sure we don't get confused by boundary artefacts. Compute
            % absolute difference ie. discard sign:
            kernel = kernel1;
            difference = abs(difference(length(kernel):end-length(kernel), length(kernel):end-length(kernel)));

            % Compute maximum difference:
            maxdiff2(i) = max(max(difference));
            
            if maxdiff2(i) > 1
                fprintf('Shadertype %i FAILED revalidation with error %f!\n', shadertype(i), maxdiff2(i));
            else
                fprintf('Shadertype %i revalidated with error %f!\n', shadertype(i), maxdiff2(i));                
            end
            
            % Close texture:
            Screen('Close', xtex);
            xtex = 0;

            % Now the same game on the CPU:

            % On CPU: Do the same with conv2 routine of Matlab/Octave:
            if isempty(kernel2)
                % Non-separable convolution with single-precision kernel:
                ref = conv2(noiseimg, kernel1, 'same');
            else
                % Separable dual-pass convolution with single-precision kernels:
                ref = conv2(kernel1, kernel2, noiseimg, 'same');
            end

            count = 0;
            tstart=GetSecs;

            % Do as many xforms as possible in 5 seconds...
            while GetSecs - tstart < 5
                % On CPU: Do the same with conv2 routine of Matlab/Octave:
                if isempty(kernel2)
                    % Non-separable convolution with single-precision kernel:
                    ref = conv2(noiseimg, kernel, 'same');
                else
                    % Separable dual-pass convolution with single-precision kernels:
                    ref = conv2(kernel1, kernel2, noiseimg, 'same');
                end

                % Increment counter:
                count = count + 1;
            end

            % Done. Sync the pipeline, take elapsed time:
            telapsed = GetSecs - tstart;

            avgcpu(i) = telapsed / count;


            fprintf('Shadertype %i: Msecs per xform:  GPU = %f   :  CPU = %f  --> ', shadertype(i), 1000 * avggpu(i), 1000 * avgcpu(i));
            fprintf('Speedup is %f times vs. CPU.\n', avgcpu(i) / avggpu(i));
            speedup(i) = avgcpu(i) / avggpu(i);
        else
            % This one failed the test:
            avgcpu(i)=inf;
            avggpu(i)=inf;
            speedup(i)=0;
        end
        % Next shadertype...
    end

    % Done. Destroy operators and textures:
    if doclose
        % Close onscreen window as well:
        Screen('CloseAll');
    else
        % Do not close onscreen window:
        Screen('Close');
    end

    if exist('maxdiff2', 'var')
        for i=1:length(maxdiff2)
            if maxdiff2(i)>1
                speedup(i)=0;
            end
        end

        [maxspeedup bestid] = max(speedup);
        bestid = gloperatorid(bestid);

        fprintf('Finished: Optimum configuration is mode %i with a speedup of %f x.\n', bestid, maxspeedup);
    end
    
    return;
catch
    Screen('CloseAll');
    psychrethrow(psychlasterror);
end