/usr/include/NTL/sp_arith.h, as shipped in libntl-dev 10.5.0-2:

#ifndef NTL_sp_arith__H
#define NTL_sp_arith__H


/****************************************************************

    Single-precision modular arithmetic

*****************************************************************/


/*
These routines implement single-precision modular arithmetic.
If n is the modulus, all inputs should be in the range 0..n-1.
The number n itself should be in the range 1..2^{NTL_SP_NBITS}-1.
*/
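
// Illustrative usage (a sketch; client code would normally include
// <NTL/ZZ.h> rather than this file directly):
#if 0
   long n = 17;                    // modulus, 1 <= n < 2^{NTL_SP_NBITS}
   mulmod_t ninv = PrepMulMod(n);  // precompute inverse data for modulus n
   long a = 5, b = 13;             // inputs in the range 0..n-1
   long s = AddMod(a, b, n);       // (5 + 13) % 17 == 1
   long p = MulMod(a, b, n, ninv); // (5 * 13) % 17 == 14
#endif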


// DIRT: undocumented feature: in all of these MulMod routines,
// the first argument, a, need only be in the range
// 0..2^{NTL_SP_NBITS}-1.  This assumption is used internally
// in some NTL routines...I've tried to mark all such uses with a
// DIRT comment.  I may decide to make this feature part
// of the documented interface at some point in the future.

// NOTE: this header file is for internal use only, via the ZZ.h header.
// It is also used in the LIP implementation files c_lip_impl.h and g_lip_impl.h.


#include <NTL/lip.h>
#include <NTL/tools.h>


NTL_OPEN_NNS


#define NTL_HAVE_MULMOD_T

#if (defined(NTL_SPMM_ULL) && defined(NTL_HAVE_LL_TYPE))
// we only honor SPMM_ULL if we have LL
#define NTL_SPMM_ULL_VIABLE

#elif (defined(NTL_SPMM_ULL) && defined(NTL_WIZARD_HACK))
// raise an error when running the wizard and we cannot honor SPMM_ULL
#error "cannot honor NTL_SPMM_ULL"

#endif



#if 0
// the following code can be used to adapt new-style clients to older
// versions of NTL


#ifndef NTL_HAVE_MULMOD_T

NTL_OPEN_NNS

typedef double mulmod_t;
typedef double muldivrem_t;


inline double PrepMulMod(long n)
{
   return double(1L)/double(n);
}

inline double PrepMulDivRem(long b, long n, double ninv)
{
   return double(b)*ninv;
}

inline double PrepMulDivRem(long b, long n)
{
   return double(b)/double(n);
}


inline double PrepMulModPrecon(long b, long n)
{
   return PrepMulModPrecon(b, n, PrepMulMod(n));
}

NTL_CLOSE_NNS



#endif



#endif







/*********************************************************


HELPER ROUTINES:

long sp_SignMask(long a) 
long sp_SignMask(unsigned long a)
// if (long(a) < 0) then -1 else 0

bool sp_Negative(unsigned long a)
// long(a) < 0

long sp_CorrectDeficit(long a, long n)
long sp_CorrectDeficit(unsigned long a, long n)
// if (long(a) >= 0) then a else a+n

// it is assumed that n is in (0..B), where B = 2^(NTL_BITS_PER_LONG-1),
// and that long(a) >= -n

long sp_CorrectExcess(long a, long n)
long sp_CorrectExcess(unsigned long a, long n)
// if (a < n) then a else a-n

// For the signed version, it is assumed that a >= 0.
// In either version, it is assumed that
//   n is in (0..B) and a-n is in (-B..B).


These are designed to respect the flags NTL_CLEAN_INT,
NTL_ARITH_RIGHT_SHIFT, and NTL_AVOID_BRANCHING.


*********************************************************/
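
// For example (intended semantics):
//    sp_SignMask(-1L) == -1,  sp_SignMask(1L) == 0
//    sp_CorrectDeficit(-3L, 17L) == 14   (adds n when long(a) < 0)
//    sp_CorrectExcess(20L, 17L)  ==  3   (subtracts n when a >= n)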


#if (NTL_ARITH_RIGHT_SHIFT && !defined(NTL_CLEAN_INT))
// DIRT: IMPL-DEF: arithmetic right shift and cast unsigned to signed

inline 
long sp_SignMask(long a)
{
   return a >> (NTL_BITS_PER_LONG-1);
}

inline 
long sp_SignMask(unsigned long a)
{
   return cast_signed(a) >> (NTL_BITS_PER_LONG-1);
}
#else
inline 
long sp_SignMask(long a)
{
   return -long(cast_unsigned(a) >> (NTL_BITS_PER_LONG-1));
}

inline 
long sp_SignMask(unsigned long a)
{
   return -long(a >> (NTL_BITS_PER_LONG-1));
}
#endif

inline
bool sp_Negative(unsigned long a)
{
   return clean_cast_signed(a) < 0;
}



#if (!defined(NTL_AVOID_BRANCHING))

// The C++ code is written using branching, but
// on machines with large branch penalties, this code
// should compile down to "predicated instructions" (i.e., on x86,
// conditional moves).  The "branching" version of sp_CorrectExcess
// is written in a particular way to get optimal machine code:
// subtract, cmove (tested on clang, gcc, icc).

inline 
long sp_CorrectDeficit(long a, long n)
{
   return a >= 0 ? a : a+n;
}

template<class T> 
long sp_CorrectDeficitQuo(T& q, long a, long n, long amt=1)
{
   return a >= 0 ? a : (q -= amt, a+n);
}



inline 
long sp_CorrectDeficit(unsigned long a, long n)
{
   return !sp_Negative(a) ? a : a+n;
}

template<class T> 
long sp_CorrectDeficitQuo(T& q, unsigned long a, long n, long amt=1)
{
   return !sp_Negative(a) ? a : (q -= amt, a+n);
}



inline 
long sp_CorrectExcess(long a, long n)
{
   return a-n >= 0 ? a-n : a;
}

template<class T> 
long sp_CorrectExcessQuo(T& q, long a, long n, long amt=1)
{
   return a-n >= 0 ? (q += amt, a-n) : a;
}



inline 
long sp_CorrectExcess(unsigned long a, long n)
{
   return !sp_Negative(a-n) ? a-n : a;
}

template<class T> 
long sp_CorrectExcessQuo(T& q, unsigned long a, long n, long amt=1)
{
   return !sp_Negative(a-n) ? (q += amt, a-n) : a;
}

#else


// This C++ code uses traditional masking and adding
// to avoid branching. 

inline 
long sp_CorrectDeficit(long a, long n)
{
   return a + (sp_SignMask(a) & n);
}

template<class T> 
long sp_CorrectDeficitQuo(T& q, long a, long n, long amt=1)
{
   q += sp_SignMask(a)*amt;
   return a + (sp_SignMask(a) & n);
}





inline 
long sp_CorrectDeficit(unsigned long a, long n)
{
   return a + (sp_SignMask(a) & n);
}

template<class T> 
long sp_CorrectDeficitQuo(T& q, unsigned long a, long n, long amt=1)
{
   q += sp_SignMask(a)*amt;
   return a + (sp_SignMask(a) & n);
}




inline 
long sp_CorrectExcess(long a, long n)
{
   return (a-n) + (sp_SignMask(a-n) & n);
}

template<class T> 
long sp_CorrectExcessQuo(T& q, long a, long n, long amt=1)
{
   q += (1L + sp_SignMask(a-n))*amt;
   return (a-n) + (sp_SignMask(a-n) & n);
}




inline 
long sp_CorrectExcess(unsigned long a, long n)
{
   return (a-n) + (sp_SignMask(a-n) & n);
}

template<class T> 
long sp_CorrectExcessQuo(T& q, unsigned long a, long n, long amt=1)
{
   q += (1L + sp_SignMask(a-n))*amt;
   return (a-n) + (sp_SignMask(a-n) & n);
}

#endif


// **********************************************************************





#ifdef NTL_HAVE_BUILTIN_CLZL
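
// NOTE: __builtin_clzl has undefined behavior at x == 0; the callers in
// this header only pass nonzero moduli, so this is safe.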

inline long 
sp_CountLeadingZeros(unsigned long x)
{
   return __builtin_clzl(x);
}

#else
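
// portable fallback; assumes x != 0 and x < 2^{NTL_SP_NBITS}, which holds
// for the moduli passed in by PrepMulMod and make_sp_ll_reduce_struct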

inline long 
sp_CountLeadingZeros(unsigned long x)
{
   long res = NTL_BITS_PER_LONG-NTL_SP_NBITS;
   x = x << (NTL_BITS_PER_LONG-NTL_SP_NBITS);
   while (x < (1UL << (NTL_BITS_PER_LONG-1))) {
      x <<= 1;
      res++;
   }

   return res;
}


#endif




inline 
long AddMod(long a, long b, long n)
{
   long r = a+b;
   return sp_CorrectExcess(r, n);
}

inline 
long SubMod(long a, long b, long n)
{
   long r = a-b;
   return sp_CorrectDeficit(r, n);
}

inline 
long NegateMod(long a, long n)
{
   return SubMod(0, a, n);
}





#if (!defined(NTL_LONGLONG_SP_MULMOD))



#ifdef NTL_LEGACY_SP_MULMOD

#define NTL_WIDE_DOUBLE_PRECISION NTL_DOUBLE_PRECISION
#define NTL_WIDE_FDOUBLE_PRECISION NTL_WIDE_DOUBLE_DP
typedef double wide_double;


#else


#ifdef NTL_LONGDOUBLE_SP_MULMOD


#define NTL_WIDE_DOUBLE_PRECISION NTL_LONGDOUBLE_PRECISION
#define NTL_WIDE_FDOUBLE_PRECISION NTL_WIDE_DOUBLE_LDP
typedef long double wide_double_impl_t;

#else

#define NTL_WIDE_DOUBLE_PRECISION NTL_DOUBLE_PRECISION
#define NTL_WIDE_FDOUBLE_PRECISION NTL_WIDE_DOUBLE_DP
typedef double wide_double_impl_t;

#endif




class wide_double {
public:
   wide_double_impl_t data;

   wide_double() { }

   wide_double(const wide_double& x) : data(x.data) { }

   template<class T>
   explicit wide_double(const T& x) : data(x) { }

   operator wide_double_impl_t() const { return data; }

};

inline wide_double operator+(wide_double x, wide_double y)
{
   return wide_double(x.data + y.data); 
}

inline wide_double operator-(wide_double x, wide_double y)
{
   return wide_double(x.data - y.data); 
}



inline wide_double operator*(wide_double x, wide_double y)
{
   return wide_double(x.data * y.data); 
}

inline wide_double operator/(wide_double x, wide_double y)
{
   return wide_double(x.data / y.data); 
}

inline wide_double floor(wide_double x)
{
   return wide_double(std::floor(x.data));
}

inline wide_double& operator+=(wide_double& x, wide_double y)
{
   return x = x + y;
}

inline wide_double& operator-=(wide_double& x, wide_double y)
{
   return x = x - y;
}

inline wide_double& operator*=(wide_double& x, wide_double y)
{
   return x = x * y;
}


inline wide_double& operator/=(wide_double& x, wide_double y)
{
   return x = x / y;
}

#endif



// old-style MulMod code using floating point arithmetic

typedef wide_double mulmod_t;
typedef wide_double muldivrem_t;

inline wide_double PrepMulMod(long n)
{
   return wide_double(1L)/wide_double(n);
}

inline wide_double PrepMulDivRem(long b, long n, wide_double ninv)
{
   return wide_double(b)*ninv;
}
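

// MulMod below computes (a*b) mod n: q approximates floor(a*b/n) via
// floating point (off by at most one when the precision assumptions hold),
// the remainder is formed exactly using wrap-around unsigned arithmetic,
// and the two correction steps repair a quotient that is off by one in
// either direction.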


inline 
long MulMod(long a, long b, long n, wide_double ninv)
{
   long q  = (long) ((((wide_double) a) * ((wide_double) b)) * ninv); 
   unsigned long rr = cast_unsigned(a)*cast_unsigned(b) - 
                      cast_unsigned(q)*cast_unsigned(n);
   long r = sp_CorrectDeficit(rr, n);
   return sp_CorrectExcess(r, n);
}

inline 
long NormalizedMulMod(long a, long b, long n, wide_double ninv)
{
   return MulMod(a, b, n, ninv);
}

inline bool NormalizedModulus(wide_double ninv) { return true; }



inline 
long MulModWithQuo(long& qres, long a, long b, long n, wide_double ninv)
{
   long q  = (long) ((((wide_double) a) * ((wide_double) b)) * ninv); 
   unsigned long rr = cast_unsigned(a)*cast_unsigned(b) - 
                      cast_unsigned(q)*cast_unsigned(n);

   long r = sp_CorrectDeficitQuo(q, rr, n);
   r = sp_CorrectExcessQuo(q, r, n);
   qres = q;
   return r;
}


inline 
long MulMod2_legacy(long a, long b, long n, wide_double bninv)
{
   long q  = (long) (((wide_double) a) * bninv);
   unsigned long rr = cast_unsigned(a)*cast_unsigned(b) - 
                      cast_unsigned(q)*cast_unsigned(n);
   long r = sp_CorrectDeficit(rr, n);
   r = sp_CorrectExcess(r, n);
   return r;
}

inline 
long MulDivRem(long& qres, long a, long b, long n, wide_double bninv)
{
   long q  = (long) (((wide_double) a) * bninv);
   unsigned long rr = cast_unsigned(a)*cast_unsigned(b) - 
                      cast_unsigned(q)*cast_unsigned(n);

   long r = sp_CorrectDeficitQuo(q, rr, n);
   r = sp_CorrectExcessQuo(q, r, n);
   qres = q;
   return r;
}

#else

// new-style MulMod code using ULL arithmetic



struct sp_inverse {
   unsigned long inv;
   long shamt;

   sp_inverse() NTL_DEFAULT
   sp_inverse(unsigned long _inv, long _shamt) : inv(_inv), shamt(_shamt) { }
};

typedef sp_inverse mulmod_t;
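
// sp_inverse holds a scaled approximation of 1/n for the *normalized*
// modulus n << shamt (which has exactly NTL_SP_NBITS bits), together with
// the normalizing shift amount; see the implementation notes at the end
// of this file.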



#if (NTL_BITS_PER_LONG >= NTL_SP_NBITS+4)

#define NTL_PRE_SHIFT1 (NTL_BITS_PER_LONG-NTL_SP_NBITS-4)
#define NTL_POST_SHIFT (0)

#define NTL_PRE_SHIFT2 (2*NTL_SP_NBITS+2)

#else

#define NTL_PRE_SHIFT1 (0)
#define NTL_POST_SHIFT (1)

#define NTL_PRE_SHIFT2 (2*NTL_SP_NBITS+1)

#endif
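
// In the notation of the implementation notes at the end of this file,
// NTL_PRE_SHIFT2 plays the role of v: the precomputed inverse is
// X = floor((2^NTL_PRE_SHIFT2 - 1)/n), positioned via NTL_PRE_SHIFT1 and
// NTL_POST_SHIFT so that Q = floor(H*X/2^t) can be read off the high word
// of a single word-by-word product.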




#if (NTL_SP_NBITS <= 2*NTL_DOUBLE_PRECISION-10)
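
// This branch computes the same value as the generic version in the #else
// branch below, namely floor((2^NTL_PRE_SHIFT2 - 1)/n) << NTL_PRE_SHIFT1,
// but uses double-precision floating point plus exact integer corrections
// in place of a double-word division.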


inline unsigned long
sp_NormalizedPrepMulMod(long n)
{
   double ninv = 1/double(n); 
   unsigned long nn = n;

   // initial approximation to quotient
   unsigned long qq = long((double(1L << (NTL_SP_NBITS-1)) * double(1L << NTL_SP_NBITS)) * ninv);

   // NOTE: the true quotient is <= 2^{NTL_SP_NBITS}

   // compute approximate remainder using ULL arithmetic
   NTL_ULL_TYPE rr = (((NTL_ULL_TYPE)(1)) << (2*NTL_SP_NBITS-1)) -
                     (((NTL_ULL_TYPE)(nn)) * ((NTL_ULL_TYPE)(qq)));
                    

   rr = (rr << (NTL_PRE_SHIFT2-2*NTL_SP_NBITS+1)) - 1;

   // now compute a floating point approximation to r,
   // but avoiding unsigned -> float conversions,
   // as these are not as well supported in hardware as
   // signed -> float conversions
   
   unsigned long rrlo = (unsigned long) rr;
   unsigned long rrhi = ((unsigned long) (rr >> NTL_BITS_PER_LONG)) 
                        + (rrlo >> (NTL_BITS_PER_LONG-1));

   long rlo = clean_cast_signed(rrlo);  // these should be No-Ops
   long rhi = clean_cast_signed(rrhi);

   const double bpl_as_double (double(1L << NTL_SP_NBITS) * double(1L << (NTL_BITS_PER_LONG-NTL_SP_NBITS)));
   double fr = double(rlo) + double(rhi)*bpl_as_double;

   // now convert fr*ninv to a long
   // but we have to be careful: fr may be negative.
   // the result should still give floor(r/n) pm 1,
   // and is computed in a way that avoids branching

   long q1 = long(fr*ninv);
   if (q1 < 0) q1--;  
   // This counteracts the round-to-zero behavior of conversion
   // to long.  It should be compiled into branch-free code.

   unsigned long qq1 = q1;

   unsigned long rr1 = rrlo - qq1*nn;

   qq1 += 1L + sp_SignMask(rr1) + sp_SignMask(rr1-n);

   unsigned long res = (qq << (NTL_PRE_SHIFT2-2*NTL_SP_NBITS+1)) + qq1;

   res = res << NTL_PRE_SHIFT1;
   return res;
}

#else

inline unsigned long 
sp_NormalizedPrepMulMod(long n)
{
   return 
      (((unsigned long) ( ((((NTL_ULL_TYPE) 1) << NTL_PRE_SHIFT2) - 1)/n )) << NTL_PRE_SHIFT1);
}

#endif


inline sp_inverse
PrepMulMod(long n)
{
   long shamt = sp_CountLeadingZeros(n) - (NTL_BITS_PER_LONG-NTL_SP_NBITS);
   unsigned long inv = sp_NormalizedPrepMulMod(n << shamt);
   return sp_inverse(inv, shamt);
}







inline long 
sp_NormalizedMulMod(long a, long b, long n, unsigned long ninv) 
{
   ll_type U;
   ll_imul(U, a, b);
   unsigned long H = ll_rshift_get_lo<NTL_SP_NBITS-2>(U);
   unsigned long Q = ll_mul_hi(H, ninv);
   Q = Q >> NTL_POST_SHIFT;
   unsigned long L = ll_get_lo(U);
   long r = L - Q*cast_unsigned(n);  // r in [0..2*n)

   r = sp_CorrectExcess(r, n);
   return r;
}



inline long 
MulMod(long a, long b, long n, sp_inverse ninv)
{
   return sp_NormalizedMulMod(a, b << ninv.shamt, n << ninv.shamt, ninv.inv) >> ninv.shamt;
}
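
// The shift trick above is valid because
// (a*(b << s)) mod (n << s) == ((a*b) mod n) << s;
// sp_NormalizedMulMod is the reduction described in the implementation
// notes at the end of this file: H is the top part of the product U = a*b,
// Q approximates floor(U/n), and one correction step fixes the remainder.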

// if you know what you're doing....
// FIXME: eventually, put this in the documented interface...
// but for now, it's "experimental"
inline long 
NormalizedMulMod(long a, long b, long n, sp_inverse ninv)
{
   return sp_NormalizedMulMod(a, b, n, ninv.inv);
}

inline bool
NormalizedModulus(sp_inverse ninv) { return ninv.shamt == 0; }




inline long 
sp_NormalizedMulModWithQuo(long& qres, long a, long b, long n, unsigned long ninv)
{
   ll_type U;
   ll_imul(U, a, b);
   unsigned long H = ll_rshift_get_lo<NTL_SP_NBITS-2>(U);
   unsigned long Q = ll_mul_hi(H, ninv);
   Q = Q >> NTL_POST_SHIFT;
   unsigned long L = ll_get_lo(U);
   long r = L - Q*cast_unsigned(n);  // r in [0..2*n)

   r = sp_CorrectExcessQuo(Q, r, n);
   qres = Q;
   return r;
}

inline long 
MulModWithQuo(long& qres, long a, long b, long n, sp_inverse ninv)
{
   return sp_NormalizedMulModWithQuo(qres, a, b << ninv.shamt, n << ninv.shamt, ninv.inv) >> ninv.shamt;
}




#endif



#if (defined(NTL_SPMM_ULL_VIABLE) || defined(NTL_LONGLONG_SP_MULMOD))


typedef unsigned long mulmod_precon_t;


#if (!defined(NTL_LONGLONG_SP_MULMOD))

inline unsigned long PrepMulModPrecon(long b, long n, wide_double ninv)
{
   long q  = (long) ( (((wide_double) b) * wide_double(NTL_SP_BOUND)) * ninv ); 
   unsigned long rr = (cast_unsigned(b) << NTL_SP_NBITS) - cast_unsigned(q)*cast_unsigned(n);

   q += sp_SignMask(rr) + sp_SignMask(rr-n) + 1L;

   return cast_unsigned(q) << (NTL_BITS_PER_LONG - NTL_SP_NBITS);
}

#else


inline unsigned long 
sp_NormalizedPrepMulModPrecon(long b, long n, unsigned long ninv)
{
   unsigned long H = cast_unsigned(b) << 2;
   unsigned long Q = ll_mul_hi(H, ninv);
   Q = Q >> NTL_POST_SHIFT;
   unsigned long L = cast_unsigned(b) << NTL_SP_NBITS;
   long r = L - Q*cast_unsigned(n);  // r in [0..2*n)


   Q += 1L + sp_SignMask(r-n);
   return Q;  // NOTE: not shifted
}


inline unsigned long 
PrepMulModPrecon(long b, long n, sp_inverse ninv)
{
   return sp_NormalizedPrepMulModPrecon(b << ninv.shamt, n << ninv.shamt, ninv.inv) << (NTL_BITS_PER_LONG-NTL_SP_NBITS);
}





#endif
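
// In MulModPrecon below, bninv is a precomputed (floor-ed, scaled)
// approximation of b*2^NTL_BITS_PER_LONG/n, so ll_mul_hi(a, bninv) is
// floor(a*b/n) or one less; a single correction then yields a*b mod n.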


   


inline long MulModPrecon(long a, long b, long n, unsigned long bninv)
{
   unsigned long qq = ll_mul_hi(a, bninv);
   unsigned long rr = cast_unsigned(a)*cast_unsigned(b) - qq*cast_unsigned(n);
   return sp_CorrectExcess(long(rr), n);
}



inline long MulModPreconWithQuo(long& qres, long a, long b, long n, unsigned long bninv)
{
   unsigned long qq = ll_mul_hi(a, bninv);
   unsigned long rr = cast_unsigned(a)*cast_unsigned(b) - qq*cast_unsigned(n);
   long r = sp_CorrectExcessQuo(qq, long(rr), n);
   qres = long(qq);
   return r;
}



#else

// default, wide_double version

typedef wide_double mulmod_precon_t;


inline wide_double PrepMulModPrecon(long b, long n, wide_double ninv)
{
   return ((wide_double) b) * ninv;
}

inline long MulModPrecon(long a, long b, long n, wide_double bninv)
{
   return MulMod2_legacy(a, b, n, bninv);
}

inline long MulModPreconWithQuo(long& qq, long a, long b, long n, wide_double bninv)
{
   return MulDivRem(qq, a, b, n, bninv);
}



#endif








#if (defined(NTL_LONGLONG_SP_MULMOD))

// some annoying backward-compatibility nonsense

struct sp_muldivrem_struct {
   unsigned long bninv;

   explicit sp_muldivrem_struct(unsigned long _bninv) : bninv(_bninv) { }
   sp_muldivrem_struct() NTL_DEFAULT
};

typedef sp_muldivrem_struct muldivrem_t;

inline sp_muldivrem_struct PrepMulDivRem(long b, long n, sp_inverse ninv)
{
   return sp_muldivrem_struct(PrepMulModPrecon(b, n, ninv));
}


inline
long MulDivRem(long& qres, long a, long b, long n,  sp_muldivrem_struct bninv)
{
   return MulModPreconWithQuo(qres, a, b, n, bninv.bninv);
}

#endif






inline mulmod_precon_t PrepMulModPrecon(long b, long n)
{
   return PrepMulModPrecon(b, n, PrepMulMod(n));
}


inline 
long MulMod(long a, long b, long n)
{
   return MulMod(a, b, n, PrepMulMod(n));
}

inline muldivrem_t PrepMulDivRem(long b, long n)
{
   return PrepMulDivRem(b, n, PrepMulMod(n));
}






#ifdef NTL_LEGACY_SP_MULMOD

inline long MulMod2(long a, long b, long n, wide_double bninv)
{
   return MulMod2_legacy(a, b, n, bninv);
}


#endif




inline
void VectorMulModPrecon(long k, long *x, const long *a, long b, long n, 
                        mulmod_precon_t bninv)
{
   for (long i = 0; i < k; i++)
      x[i] = MulModPrecon(a[i], b, n, bninv);
}

inline
void VectorMulMod(long k, long *x, const long *a, long b, long n, 
                  mulmod_t ninv)
{
   mulmod_precon_t bninv;
   bninv = PrepMulModPrecon(b, n, ninv);
   VectorMulModPrecon(k, x, a, b, n, bninv);
}


inline 
void VectorMulMod(long k, long *x, const long *a, long b, long n)
{
   mulmod_t ninv = PrepMulMod(n);
   VectorMulMod(k, x, a, b, n, ninv);
}

#ifdef NTL_HAVE_LL_TYPE


struct sp_reduce_struct {
   unsigned long ninv;
   long sgn;

   sp_reduce_struct(unsigned long _ninv, long _sgn) : 
      ninv(_ninv), sgn(_sgn)  { }

   sp_reduce_struct() NTL_DEFAULT
};
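
// sp_PrepRem computes ninv = floor(2^NTL_BITS_PER_LONG / n) and
// sgn = 2^(NTL_BITS_PER_LONG-1) mod n (used by the signed version of rem
// below); the sp_CorrectExcessQuo step is done only for its effect on q.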

inline
sp_reduce_struct sp_PrepRem(long n)
{
   unsigned long q = (1UL << (NTL_BITS_PER_LONG-1))/cast_unsigned(n);
   long r = (1UL << (NTL_BITS_PER_LONG-1)) - q*cast_unsigned(n);

   long r1 = 2*r;
   q = 2*q;
   r1 = sp_CorrectExcessQuo(q, r1, n);
   
   return sp_reduce_struct(q, r);
}



inline
long rem(unsigned long a, long n, sp_reduce_struct red) 
{
   unsigned long Q = ll_mul_hi(a, red.ninv);
   long r = a - Q*cast_unsigned(n);
   r = sp_CorrectExcess(r, n);
   return r;
}
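
// In the signed version below: two's complement gives
// a == a0 - s*2^(NTL_BITS_PER_LONG-1), where s is the sign bit and a0 is
// the low NTL_BITS_PER_LONG-1 bits of a; after reducing a0 we subtract
// red.sgn == 2^(NTL_BITS_PER_LONG-1) mod n exactly when a < 0.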


inline
long rem(long a, long n, sp_reduce_struct red) 
{
   unsigned long a0 = cast_unsigned(a) & ((1UL << (NTL_BITS_PER_LONG-1))-1);
   long r = rem(a0, n, red);
   long s = sp_SignMask(a) & red.sgn;
   return SubMod(r, s, n);
}
#else

struct sp_reduce_struct { };


inline
sp_reduce_struct sp_PrepRem(long n) 
{
   return sp_reduce_struct();
}


inline
long rem(unsigned long a, long n, sp_reduce_struct red) 
{
   return a % cast_unsigned(n);
}

inline
long rem(long a, long n, sp_reduce_struct red)
{
   long r = a % n;
   return sp_CorrectDeficit(r, n);
}


#endif


#ifdef NTL_HAVE_LL_TYPE

#define NTL_HAVE_SP_LL_ROUTINES


// some routines that are currently not part of the documented
// interface.  They currently are only defined when we have appropriate
// LL type.


struct sp_ll_reduce_struct {
   unsigned long inv;
   long nbits;

   sp_ll_reduce_struct() NTL_DEFAULT

   sp_ll_reduce_struct(unsigned long _inv, long _nbits) : inv(_inv), nbits(_nbits) { }

};


inline sp_ll_reduce_struct
make_sp_ll_reduce_struct(long n)
{
   long nbits = NTL_BITS_PER_LONG - sp_CountLeadingZeros(n);
   unsigned long inv =
       (unsigned long) ( ((((NTL_ULL_TYPE) 1) << (nbits+NTL_BITS_PER_LONG))-1UL) / ((NTL_ULL_TYPE) n) );

   return sp_ll_reduce_struct(inv, nbits);
}
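
// NOTE: the true quotient floor((2^(nbits+NTL_BITS_PER_LONG) - 1)/n) has
// NTL_BITS_PER_LONG+1 bits; inv stores its low word, and the implicit top
// bit is what the "+ H" accounts for in sp_ll_red_21 below (cf. the
// X = 2^{w+1} + X_0 remark in the notes at the end of this file).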


// computes remainder (hi, lo) mod d, assumes hi < d
inline long  
sp_ll_red_21(unsigned long hi, unsigned long lo, long d, 
            sp_ll_reduce_struct dinv)
{
   unsigned long H = 
      (hi << (NTL_BITS_PER_LONG-dinv.nbits)) | (lo >> dinv.nbits);
   unsigned long Q = ll_mul_hi(H, dinv.inv) + H;
   unsigned long rr = lo - Q*cast_unsigned(d); // rr in [0..4*d)
   long r = sp_CorrectExcess(rr, 2*d); // r in [0..2*d)
   r = sp_CorrectExcess(r, d);
   return r;
}

// computes remainder (x[n-1], ..., x[0]) mod d
inline long 
sp_ll_red_n1(const unsigned long *x, long n, long d, sp_ll_reduce_struct dinv)
{
   long carry = 0;
   long i;
   for (i = n-1; i >= 0; i--) 
      carry = sp_ll_red_21(carry, x[i], d, dinv);
   return carry;
} 

// computes remainder (x2, x1, x0) mod d, assumes x2 < d
inline long 
sp_ll_red_31(unsigned long x2, unsigned long x1, unsigned long x0,
           long d, sp_ll_reduce_struct dinv)
{
   long carry = sp_ll_red_21(x2, x1, d, dinv);
   return sp_ll_red_21(carry, x0, d, dinv);
}


// normalized versions of the above: assume NumBits(d) == NTL_SP_NBITS

// computes remainder (hi, lo) mod d, assumes hi < d
inline long  
sp_ll_red_21_normalized(unsigned long hi, unsigned long lo, long d, 
            sp_ll_reduce_struct dinv)
{
   unsigned long H = 
      (hi << (NTL_BITS_PER_LONG-NTL_SP_NBITS)) | (lo >> NTL_SP_NBITS);
   unsigned long Q = ll_mul_hi(H, dinv.inv) + H;
   unsigned long rr = lo - Q*cast_unsigned(d); // rr in [0..4*d)
   long r = sp_CorrectExcess(rr, 2*d); // r in [0..2*d)
   r = sp_CorrectExcess(r, d);
   return r;
}

// computes remainder (x[n-1], ..., x[0]) mod d
inline long 
sp_ll_red_n1_normalized(const unsigned long *x, long n, long d, sp_ll_reduce_struct dinv)
{
   long carry = 0;
   long i;
   for (i = n-1; i >= 0; i--) 
      carry = sp_ll_red_21_normalized(carry, x[i], d, dinv);
   return carry;
} 

// computes remainder (x2, x1, x0) mod d, assumes x2 < d
inline long 
sp_ll_red_31_normalized(unsigned long x2, unsigned long x1, unsigned long x0,
           long d, sp_ll_reduce_struct dinv)
{
   long carry = sp_ll_red_21_normalized(x2, x1, d, dinv);
   return sp_ll_red_21_normalized(carry, x0, d, dinv);
}


#else

// provided to streamline some code


struct sp_ll_reduce_struct { };


inline sp_ll_reduce_struct
make_sp_ll_reduce_struct(long n)
{
   return sp_ll_reduce_struct();
}

#endif


NTL_CLOSE_NNS

#endif





/**************************************************************************

Implementation notes -- the LONGLONG MulMod implementation

I started out basing this on Granlund-Moeller multiplication,
but it evolved into something a little bit different.

We assume that modulus n has w bits, so 2^{w-1} <= n < 2^w.
We also assume that 2 <= w <= BPL-2.

As a precomputation step, we compute X = floor((2^v-1)/n), i.e.,
   2^v - 1 = X n + Y,    where 0 <= Y < n

Now, we are given U to reduce mod n.
Write 
   U  = H 2^s + L
  H X = Q 2^t + R
where s + t = v.

Some simple calculations yield:
   H <= U/2^s
   2^{v-w} <= X <  2^v/n <= 2^{v-w+1}
   H X < 2^t U / n

Also:
   U - Qn   <   n U / 2^v  + L + n

For the case where BPL >= 64, we generally work with w = BPL-4.
In this case, we set v = 2w+2, s = w-2, t = w+4.
Then we have:
   U - Qn < n/4 + n/2 + n < 2n
This choice of parameters allows us to do a MulMod with just a single
correction.  It also allows us to do the LazyPrepMulModPrecon
step with just a single correction.
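
For example, on a typical 64-bit machine with w = NTL_SP_NBITS = 60:
v = 122, s = 58, t = 64, so Q is just the high word of the 64x64-bit
product H X, with no post-shift.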

If w = BPL-2, we set v = 2w+1, s = w-2, t = w+3.
Then we have:
   U - Qn   <   n/2 + n/2 + n = 2n 
So again, we do a MulMod with just a single correction,
although the LazyPrepMulModPrecon now takes two corrections.

For the Lazy stuff, we are computing floor(2^{w+2}b/n), so 
U = 2^{w+2} b.
For the case w = BPL-4, we are setting s = w+2 and t = w.
L = 0 in this case, and we obtain U - Qn < n + n = 2n.
In the case w = BPL-2, we set s = w, t = w+1.
We obtain U - Qn < 2n + n = 3n.
Also, we need to write X = 2^{w+1} + X_0 to perform the 
HX computation correctly.

***************************************************************************/