This file is indexed.

/usr/include/madness/tensor/systolic.h is in libmadness-dev 0.10-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

#ifndef MADNESS_SYSTOLIC_H
#define MADNESS_SYSTOLIC_H

/*
  This file is part of MADNESS.

  Copyright (C) 2007,2010 Oak Ridge National Laboratory

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

  For more information please contact:

  Robert J. Harrison
  Oak Ridge National Laboratory
  One Bethel Valley Road
  P.O. Box 2008, MS-6367

  email: harrisonrj@ornl.gov
  tel:   865-241-3937
  fax:   865-572-0680
*/

#include <madness/world/MADworld.h>
#include <utility>
#include <madness/tensor/tensor.h>
#include <madness/tensor/distributed_matrix.h>

namespace madness {

    /// Base class for parallel algorithms that employ a systolic loop to generate all row pairs in parallel
    template <typename T>
    class SystolicMatrixAlgorithm : public TaskInterface {
    private:
        DistributedMatrix<T>& A;
        const int64_t nproc;            ///< No. of processes with rows of the matrix (not size of world)
        const int64_t coldim;           ///< A(coldim,rowdim)
        const int64_t rowdim;           ///< A(coldim,rowdim)
        const int64_t nlocal;           ///< No. of local pairs
        const ProcessID rank;           ///< Rank of current process
        const int tag;                  ///< MPI tag to be used for messages
        std::vector<T*> iptr, jptr;     ///< Indirection for implementing cyclic buffer !! SHOULD BE VOLATILE ?????
        std::vector<int64_t> map;       ///< Used to keep track of actual row indices

#ifdef HAVE_INTEL_TBB
        void iteration(const int nthread) {

            // Parallel initialization hook
            tbb::parallel_for(0, nthread, [=](const int id) {
                this->start_iteration_hook(TaskThreadEnv(nthread, id));
            });

            if (nlocal > 0) {
//                int64_t ilo, ihi;
//                A.local_colrange(ilo, ihi);

                const int neven = coldim + (coldim&0x1);
                const int pairlo = rank*A.coltile()/2;

                for (int loop=0; loop<(neven-1); ++loop) {

                    // This loop is parallelized over threads
                    tbb::parallel_for(0, nthread,
                        [this,neven,pairlo,nthread,loop](const int id) {
                            for (int pair=id; pair<nlocal; pair+=nthread) {

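                                // Round-robin (circle method) schedule: pair slot rp determines
                                // the two logical row positions iii and jjj for this sweep; the
                                // position neven-1 stays fixed while the others rotate.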
                                int rp = neven/2-1-(pair+pairlo);
                                int iii = (rp+loop)%(neven-1);
                                int jjj = (2*neven-2-rp+loop)%(neven-1);
                                if (rp == 0) jjj = neven-1;

                                iii = map[iii];
                                jjj = map[jjj];

                                if (jptr[pair]) {
                                    this->kernel(iii, jjj, iptr[pair], jptr[pair]);
                                }
                            }
                        });

                    cycle();

                }
            }

            // Parallel finalization hook
            tbb::parallel_for(0, nthread, [=](const int id) {
                this->end_iteration_hook(TaskThreadEnv(nthread, id));
            });

        }
#else
        void iteration(const TaskThreadEnv& env) {

            env.barrier();
            start_iteration_hook(env);
            env.barrier();

            if (nlocal > 0) {
                int64_t ilo, ihi;
                A.local_colrange(ilo, ihi);

                int neven = coldim + (coldim&0x1);

                int pairlo = rank*A.coltile()/2;

                int threadid = env.id();
                int nthread = env.nthread();

                for (int loop=0; loop<(neven-1); ++loop) {

                    // This loop is parallelized over threads
                    for (int pair=env.id(); pair<nlocal; pair+=nthread) {

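                        // Round-robin (circle method) schedule: pair slot rp determines the
                        // two logical row positions iii and jjj for this sweep; the position
                        // neven-1 stays fixed while the others rotate.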
                        int rp = neven/2-1-(pair+pairlo);
                        int iii = (rp+loop)%(neven-1);
                        int jjj = (2*neven-2-rp+loop)%(neven-1);
                        if (rp == 0) jjj = neven-1;

                        iii = map[iii];
                        jjj = map[jjj];

                        if (jptr[pair]) {
                            kernel(iii, jjj, iptr[pair], jptr[pair]);
                        }
                    }
                    env.barrier();

                    if (threadid == 0) cycle();

                    env.barrier();
                }
            }

            end_iteration_hook(env);

            env.barrier();
        }
#endif // HAVE_INTEL_TBB

        /// Call this after iterating to restore the correct order of rows in the original matrix

        /// At the end of each iteration the matrix rows are logically back in
        /// their correct order.  However, due to indirection to reduce data motion,
        /// if the local column dimension is not a factor of the number of cycles,
        /// the underlying data may be in a different order.  This restores sanity.
        ///
        /// Only one thread should invoke this routine
        void unshuffle() {
            if (nlocal <= 0) return;
            Tensor<T>& t = A.data();
            Tensor<T> tmp(2L, t.dims(), false);
            T* tp = tmp.ptr();
            for (int64_t i=0; i<nlocal; ++i) {
                memcpy(tp+i*rowdim, iptr[i], rowdim*sizeof(T));
                if (jptr[i]) {
                    memcpy(tp+(i+nlocal)*rowdim, jptr[i], rowdim*sizeof(T));
                }
                iptr[i] = &t(i,0);
                jptr[i] = &t(i+nlocal,0);
            }
            memcpy(t.ptr(), tmp.ptr(), t.size()*sizeof(T));

            if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
        }

        /// Cycles data around the loop ... only one thread should invoke this
        void cycle() {
            if (coldim <= 2) return; // No cycling necessary
            if (nlocal <= 0) {       // Nothing local
                MADNESS_ASSERT(rank >= nproc);
                return;
            }

            // Check assumption that tiling put incomplete tile at the end
            MADNESS_ASSERT(A.local_coldim() == A.coltile()  ||  rank == (nproc-1));

            const ProcessID left = rank-1; //Invalid values are not used
            const ProcessID right = rank+1;

            /*
              Consider matrix (10,*) distributed with coltile=4 over
              three processors.

              .   0 1 2 3      4 5 6 7      8 9

              This is divided up as follows into this initial
              configuration for the loop

              .            P=0          P=1         P=2
              .                  msg          msg
              .   i    -->0-->1  -->   4-->5  -->    8  -->
              .       ^                                   |  msg
              .       |                         <---------
              .   j    <--2<--3  <--   6<--7  <--|   9
              .                  msg          msg

              The first and last processes in the loop have to wrap ... others
              just pass left and right.  Note that 9 stays put.

              Note that the algorithm is assuming distribution puts equal
              amount of data on all nodes except the last.

              The i data is considered as flowing to the right.
              The j data is considered as flowing to the left.


              Hence, we should explore the pairs in this order
              (n-1 sets of n/2 pairs)

              .          P=0         P=1        P=2
              .          0  1        4  5       8
              .          2  3        6  7       9

              .          2  0        1  4       5
              .          3  6        7  8       9

              .          3  2        0  1       4
              .          6  7        8  5       9

              .          6  3        2  0       1
              .          7  8        5  4       9

              .          7  6        3  2       0
              .          8  5        4  1       9

              .          8  7        6  3       2
              .          5  4        1  0       9

              .          5  8        7  6       3
              .          4  1        0  2       9

              .          4  5        8  7       6
              .          1  0        2  3       9

              .          1  4        5  8       7
              .          0  2        3  6       9
            */

            // Copy end elements before they are overwritten
            T* ilast  = iptr[nlocal-1];
            T* jfirst = jptr[0];

            // Cycle local pointers
            for (int64_t i=0; i<nlocal-1; ++i) {
                iptr[nlocal-i-1] = iptr[nlocal-i-2];
                jptr[i] = jptr[i+1];
            }

            World& world = A.get_world();

            if (nproc == 1) {
                iptr[0] = jfirst;
                jptr[nlocal-2] = ilast;
            }
            else if (rank == 0) {
                iptr[0] = jfirst;
                world.mpi.Send(ilast, rowdim, right, tag);
                jptr[nlocal-1] = ilast;
                world.mpi.Recv(ilast, rowdim, right, tag);
            }
            else if (rank == (nproc-1)) {
                if (nlocal > 1) {
                    iptr[0] = jfirst;
                    jptr[nlocal-2] = ilast;
                }
                std::vector<T> buf(rowdim);
                SafeMPI::Request req = world.mpi.Irecv(&buf[0], rowdim, left, tag);
                world.mpi.Send(iptr[0], rowdim, left, tag);
                world.await(req,false);
                std::memcpy(iptr[0], &buf[0], rowdim*sizeof(T));
            }
            else {
                std::vector<T> buf1(rowdim);
                std::vector<T> buf2(rowdim);
                SafeMPI::Request req1 = world.mpi.Irecv(&buf1[0], rowdim, left, tag);
                SafeMPI::Request req2 = world.mpi.Irecv(&buf2[0], rowdim, right, tag);
                world.mpi.Send( ilast, rowdim, right, tag);
                world.mpi.Send(jfirst, rowdim,  left, tag);
                world.await(req1,false);
                world.await(req2,false);
                std::memcpy(ilast, &buf2[0], rowdim*sizeof(T)); //world.mpi.Recv( ilast, rowdim, right, tag);
                std::memcpy(jfirst, &buf1[0], rowdim*sizeof(T)); //world.mpi.Recv(jfirst, rowdim,  left, tag);

                iptr[0] = jfirst;
                jptr[nlocal-1] = ilast;
            }
        }

        /// Get the task id

        /// \param id The id to set for this task
        virtual void get_id(std::pair<void*,unsigned short>& id) const {
            PoolTaskInterface::make_id(id, *this);
        }

    public:
        /// A must be a column distributed matrix with an even column tile >= 2

        /// It is assumed that the main thread invokes this.
        /// @param[in,out] A The matrix on which the algorithm is performed and modified in-place
        /// @param[in] tag The MPI tag used for communication (obtain from \c world.mpi.comm().unique_tag() )
        /// @param[in] nthread The number of local threads to use (default is the main thread plus all threads in the pool)
        SystolicMatrixAlgorithm(DistributedMatrix<T>& A, int tag, int nthread=ThreadPool::size()+1)
            : A(A)
            , nproc(A.process_coldim()*A.process_rowdim())
            , coldim(A.coldim())
            , rowdim(A.rowdim())
            , nlocal((A.local_coldim()+1)/2)
            , rank(A.get_world().rank())
            , tag(tag)
            , iptr(nlocal)
            , jptr(nlocal)
            , map(coldim+(coldim&0x1))
        {
            TaskInterface::set_nthread(nthread);

            MADNESS_ASSERT(A.is_column_distributed() && (nproc==1 || (A.coltile()&0x1)==0));

            // Initialize vectors of pointers to matrix rows
            Tensor<T>& t = A.data();

            //madness::print(nproc, coldim, rowdim, nlocal, rank, tag);

            for (int64_t i=0; i<nlocal; ++i) {
                iptr[i] = &t(i,0);
                jptr[i] = &t(i+nlocal,0);
            }

            // If no. of rows is odd, last process should have an empty last row
            if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;

            // Initialize map from logical index order to actual index order

            int neven = (coldim+1)/2;
            int ii=0;
            for (ProcessID p=0; p<nproc; ++p) {
                int64_t lo, hi;
                A.get_colrange(p, lo, hi);
                int p_nlocal = (hi - lo + 2)/2;
                //print("I think process",p,"has",lo,hi,p_nlocal);
                for (int i=0; i<p_nlocal; ++i) {
                    map[ii+i] = lo+i;
                    //map[coldim-ii-nlocal+i] = lo+i+nlocal;
                    map[ii+i+neven] = lo+i+p_nlocal;
                }
                ii += p_nlocal;
            }

            std::reverse(map.begin(),map.begin()+neven);

            //print("MAP", map);
        }

        virtual ~SystolicMatrixAlgorithm() {}

        /// Threadsafe routine to apply the operation to rows i and j of the matrix

        /// @param[in] i First row index in the matrix
        /// @param[in] j Second row index in the matrix
        /// @param[in,out] rowi Pointer to row \c i of the matrix (modified by the kernel in-place)
        /// @param[in,out] rowj Pointer to row \c j of the matrix (modified by the kernel in-place)
        virtual void kernel(int i, int j, T* rowi, T* rowj) = 0;


        /// Invoked simultaneously by all threads after each sweep to test for convergence

        /// There is a thread barrier before and after the invocation of this routine
        /// @param[in] env The madness thread environment in case synchronization between threads is needed during computation of the convergence condition.
        virtual bool converged(const TaskThreadEnv& env) const = 0;


        /// Invoked by all threads at the start of each iteration

        /// There is a thread barrier before and after the invocation of this routine
        /// @param[in] env The madness thread environment in case synchronization between threads is needed during startup.
        virtual void start_iteration_hook(const TaskThreadEnv& env) {}


        /// Invoked by all threads at the end of each iteration before convergence test

        /// There is a thread barrier before and after the invocation of this routine.
        /// Note that the \c converged() method is \c const whereas this can modify the class.
        /// @param[in] env The madness thread environment in case synchronization between threads is needed while finishing the iteration.
        virtual void end_iteration_hook(const TaskThreadEnv& env) {}


#ifdef HAVE_INTEL_TBB
        /// Invoked by the task queue to run the algorithm with multiple threads

        /// This is a collective call ... all processes in world should submit this task
        void run(World& world, const TaskThreadEnv& env) {
            const int nthread = env.nthread();
            bool done = false;
            do {
                iteration(nthread);
                done = tbb::parallel_reduce(tbb::blocked_range<int>(0,nthread), true,
                    [=] (const tbb::blocked_range<int>& range, bool init) -> bool {
                        for(int id = range.begin(); id < range.end(); ++id)
                            init = init &&
                                this->converged(TaskThreadEnv(nthread, id));
                        return init;
                    },
                    [] (const bool l, const bool r) { return l && r; });

            } while (!done);

            unshuffle();
        }
#else
        /// Invoked by the task queue to run the algorithm with multiple threads

        /// This is a collective call ... all processes in world should submit this task
        void run(World& world, const TaskThreadEnv& env) {
            do {
                iteration(env);
            } while (!converged(env));

            if (env.id() == 0) unshuffle();

            env.barrier();
        }
#endif // HAVE_INTEL_TBB

        /// Invoked by the user to run the algorithm with one thread, mostly for debugging

        /// This is a collective call ... all processes in world should call this routine.
        void solve_sequential() {
            run(A.get_world(), TaskThreadEnv(1,0,0));
        }

        /// Returns length of row
        int64_t get_rowdim() const {return rowdim;}


        /// Returns length of column
        int64_t get_coldim() const {return coldim;}

        /// Returns a reference to the world
        World& get_world() const {
            return A.get_world();
        }

        /// Returns rank of this process in the world
        ProcessID get_rank() const {
            return rank;
        }
    };
}

#endif
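
For orientation, the sketch below shows how a derived class might plug into the interface declared above: it supplies kernel() and converged(), and every process runs the algorithm collectively. This example is not part of the packaged header; the class name PairSum, its single-sweep stopping rule, and the column_distributed_matrix() factory mentioned in the usage comment are illustrative assumptions, while the virtual methods and accessors it overrides are taken from systolic.h itself.

#include <madness/tensor/systolic.h>

// Toy subclass (illustration only): adds row j into row i for every pair in a
// single sweep, then stops.  Its purpose is just to show where the per-pair
// kernel, the end-of-iteration hook, and the convergence test plug in.
class PairSum : public madness::SystolicMatrixAlgorithm<double> {
    bool done;
public:
    PairSum(madness::DistributedMatrix<double>& A, int tag)
        : madness::SystolicMatrixAlgorithm<double>(A, tag), done(false) {}

    // Invoked (thread-safely) for each generated row pair during a sweep
    void kernel(int i, int j, double* rowi, double* rowj) {
        for (int64_t k=0; k<get_rowdim(); ++k) rowi[k] += rowj[k];
    }

    // One sweep suffices here; thread 0 records completion.  A thread barrier
    // precedes and follows this hook, so converged() sees the update.
    void end_iteration_hook(const madness::TaskThreadEnv& env) {
        if (env.id() == 0) done = true;
    }

    // Tested by all threads after each sweep
    bool converged(const madness::TaskThreadEnv& env) const { return done; }
};

// Possible driver, assuming an initialized World named `world` and the
// column_distributed_matrix() factory from distributed_matrix.h:
//
//   madness::DistributedMatrix<double> A =
//       madness::column_distributed_matrix<double>(world, n, m);
//   PairSum algo(A, world.mpi.comm().unique_tag());
//   algo.solve_sequential();  // collective: every process must participate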