/usr/share/doc/libntl-dev/NTL/quad_float.cpp.html is in libntl-dev 6.2.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<title>/Volumes/Unix/unix-files.noindex/ntl-new/ntl-6.1.0/doc/quad_float.cpp.html</title>
<meta name="Generator" content="Vim/7.3">
<meta name="plugin-version" content="vim7.3_v6">
<meta name="syntax" content="cpp">
<meta name="settings" content="use_css">
<style type="text/css">
<!--
pre { font-family: monospace; color: #000000; background-color: #ffffff; }
body { font-family: monospace; color: #000000; background-color: #ffffff; }
.Statement { color: #b03060; font-weight: bold; }
.Type { color: #008b00; font-weight: bold; }
.String { color: #4a708b; }
.PreProc { color: #1874cd; }
.Comment { color: #0000ee; font-style: italic; }
-->
</style>
</head>
<body>
<pre>
<span class="Comment">/*</span><span class="Comment">*************************************************************************\</span>
<span class="Comment">MODULE: quad_float</span>
<span class="Comment">SUMMARY:</span>
<span class="Comment">The class quad_float is used to represent quadruple precision numbers.</span>
<span class="Comment">Thus, with standard IEEE floating point, you should get the equivalent</span>
<span class="Comment">of about 106 bits of precision (but actually just a bit less).</span>
<span class="Comment">The interface allows you to treat quad_floats more or less as if they were</span>
<span class="Comment">"ordinary" floating point types.</span>
<span class="Comment">See below for more implementation details.</span>
<span class="Comment">\*************************************************************************</span><span class="Comment">*/</span>
<span class="PreProc">#include </span><span class="String"><NTL/ZZ.h></span>
<span class="Type">class</span> quad_float {
<span class="Statement">public</span>:
quad_float(); <span class="Comment">// = 0</span>
quad_float(<span class="Type">const</span> quad_float& a); <span class="Comment">// copy constructor</span>
<span class="Type">explicit</span> quad_float(<span class="Type">double</span> a); <span class="Comment">// promotion constructor</span>
quad_float& <span class="Statement">operator</span>=(<span class="Type">const</span> quad_float& a); <span class="Comment">// assignment operator</span>
quad_float& <span class="Statement">operator</span>=(<span class="Type">double</span> a);
~quad_float();
<span class="Type">static</span> <span class="Type">void</span> SetOutputPrecision(<span class="Type">long</span> p);
<span class="Comment">// This sets the number of decimal digits to be output. Default is</span>
<span class="Comment">// 10.</span>
<span class="Type">static</span> <span class="Type">long</span> OutputPrecision();
<span class="Comment">// returns current output precision.</span>
};
<span class="Comment">/*</span><span class="Comment">*************************************************************************\</span>
<span class="Comment"> Arithmetic Operations</span>
<span class="Comment">\*************************************************************************</span><span class="Comment">*/</span>
quad_float <span class="Statement">operator</span> +(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
quad_float <span class="Statement">operator</span> -(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
quad_float <span class="Statement">operator</span> *(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
quad_float <span class="Statement">operator</span> /(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
<span class="Comment">// PROMOTIONS: operators +, -, *, / promote double to quad_float</span>
<span class="Comment">// on (x, y).</span>
quad_float <span class="Statement">operator</span> -(<span class="Type">const</span> quad_float& x);
quad_float& <span class="Statement">operator</span> += (quad_float& x, <span class="Type">const</span> quad_float& y);
quad_float& <span class="Statement">operator</span> += (quad_float& x, <span class="Type">double</span> y);
quad_float& <span class="Statement">operator</span> -= (quad_float& x, <span class="Type">const</span> quad_float& y);
quad_float& <span class="Statement">operator</span> -= (quad_float& x, <span class="Type">double</span> y);
quad_float& <span class="Statement">operator</span> *= (quad_float& x, <span class="Type">const</span> quad_float& y);
quad_float& <span class="Statement">operator</span> *= (quad_float& x, <span class="Type">double</span> y);
quad_float& <span class="Statement">operator</span> /= (quad_float& x, <span class="Type">const</span> quad_float& y);
quad_float& <span class="Statement">operator</span> /= (quad_float& x, <span class="Type">double</span> y);
quad_float& <span class="Statement">operator</span>++(quad_float& a); <span class="Comment">// prefix</span>
<span class="Type">void</span> <span class="Statement">operator</span>++(quad_float& a, <span class="Type">int</span>); <span class="Comment">// postfix</span>
quad_float& <span class="Statement">operator</span>--(quad_float& a); <span class="Comment">// prefix</span>
<span class="Type">void</span> <span class="Statement">operator</span>--(quad_float& a, <span class="Type">int</span>); <span class="Comment">// postfix</span>
<span class="Comment">/*</span><span class="Comment">*************************************************************************\</span>
<span class="Comment"> Comparison</span>
<span class="Comment">\*************************************************************************</span><span class="Comment">*/</span>
<span class="Type">long</span> <span class="Statement">operator</span>> (<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
<span class="Type">long</span> <span class="Statement">operator</span>>=(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
<span class="Type">long</span> <span class="Statement">operator</span>< (<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
<span class="Type">long</span> <span class="Statement">operator</span><=(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
<span class="Type">long</span> <span class="Statement">operator</span>==(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
<span class="Type">long</span> <span class="Statement">operator</span>!=(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y);
<span class="Type">long</span> sign(<span class="Type">const</span> quad_float& x); <span class="Comment">// sign of x, -1, 0, +1</span>
<span class="Type">long</span> compare(<span class="Type">const</span> quad_float& x, <span class="Type">const</span> quad_float& y); <span class="Comment">// sign of x - y</span>
<span class="Comment">// PROMOTIONS: operators >, ..., != and function compare</span>
<span class="Comment">// promote double to quad_float on (x, y).</span>
<span class="Comment">/*</span><span class="Comment">*************************************************************************\</span>
<span class="Comment"> Input/Output</span>
<span class="Comment">Input Syntax:</span>
<span class="Comment"><number>: [ "-" ] <unsigned-number></span>
<span class="Comment"><unsigned-number>: <dotted-number> [ <e-part> ] | <e-part></span>
<span class="Comment"><dotted-number>: <digits> | <digits> "." <digits> | "." <digits> | <digits> "."</span>
<span class="Comment"><digits>: <digit> <digits> | <digit></span>
<span class="Comment"><digit>: "0" | ... | "9"</span>
<span class="Comment"><e-part>: ( "E" | "e" ) [ "+" | "-" ] <digits></span>
<span class="Comment">Examples of valid input:</span>
<span class="Comment">17 1.5 0.5 .5 5. -.5 e10 e-10 e+10 1.5e10 .5e10 .5E10</span>
<span class="Comment">Note that the number of decimal digits of precision that are used</span>
<span class="Comment">for output can be set to any number p >= 1 by calling</span>
<span class="Comment">the routine quad_float::SetOutputPrecision(p). </span>
<span class="Comment">The default value of p is 10.</span>
<span class="Comment">The current value of p is returned by a call to quad_float::OutputPrecision().</span>
<span class="Comment">\*************************************************************************</span><span class="Comment">*/</span>
istream& <span class="Statement">operator</span> >> (istream& s, quad_float& x);
ostream& <span class="Statement">operator</span> << (ostream& s, <span class="Type">const</span> quad_float& x);
<span class="Comment">/*</span><span class="Comment">*************************************************************************\</span>
<span class="Comment"> Miscellaneous</span>
<span class="Comment">\*************************************************************************</span><span class="Comment">*/</span>
quad_float sqrt(<span class="Type">const</span> quad_float& x);
quad_float floor(<span class="Type">const</span> quad_float& x);
quad_float ceil(<span class="Type">const</span> quad_float& x);
quad_float trunc(<span class="Type">const</span> quad_float& x);
quad_float fabs(<span class="Type">const</span> quad_float& x);
quad_float exp(<span class="Type">const</span> quad_float& x);
quad_float log(<span class="Type">const</span> quad_float& x);
<span class="Type">void</span> power(quad_float& x, <span class="Type">const</span> quad_float& a, <span class="Type">long</span> e); <span class="Comment">// x = a^e</span>
quad_float power(<span class="Type">const</span> quad_float& a, <span class="Type">long</span> e);
<span class="Type">void</span> power2(quad_float& x, <span class="Type">long</span> e); <span class="Comment">// x = 2^e</span>
quad_float power2_quad_float(<span class="Type">long</span> e);
quad_float ldexp(<span class="Type">const</span> quad_float& x, <span class="Type">long</span> e); <span class="Comment">// return x*2^e</span>
<span class="Type">long</span> IsFinite(quad_float *x); <span class="Comment">// checks if x is "finite" </span>
<span class="Comment">// pointer is used for compatability with</span>
<span class="Comment">// IsFinite(double*)</span>
<span class="Type">void</span> random(quad_float& x);
quad_float random_quad_float();
<span class="Comment">// generate a random quad_float x with 0 <= x <= 1</span>
<span class="Comment">/*</span><span class="Comment">**********************************************************************\</span>
<span class="Comment">IMPLEMENTATION DETAILS</span>
<span class="Comment">A quad_float x is represented as a pair of doubles, x.hi and x.lo,</span>
<span class="Comment">such that the number represented by x is x.hi + x.lo, where</span>
<span class="Comment"> |x.lo| <= 0.5*ulp(x.hi), (*)</span>
<span class="Comment">and ulp(y) means "unit in the last place of y". </span>
<span class="Comment">For the software to work correctly, IEEE Standard Arithmetic is sufficient. </span>
<span class="Comment">That includes just about every modern computer; the only exception I'm</span>
<span class="Comment">aware of is Intel x86 platforms running Linux (but you can still</span>
<span class="Comment">use this platform--see below).</span>
<span class="Comment">Also sufficient is any platform that implements arithmetic with correct </span>
<span class="Comment">rounding, i.e., given double floating point numbers a and b, a op b </span>
<span class="Comment">is computed exactly and then rounded to the nearest double. </span>
<span class="Comment">The tie-breaking rule is not important.</span>
<span class="Comment">This is a rather wierd representation; although it gives one</span>
<span class="Comment">essentially twice the precision of an ordinary double, it is</span>
<span class="Comment">not really the equivalent of quadratic precision (despite the name).</span>
<span class="Comment">For example, the number 1 + 2^{-200} can be represented exactly as</span>
<span class="Comment">a quad_float. Also, there is no real notion of "machine precision".</span>
<span class="Comment">Note that overflow/underflow for quad_floats does not follow any particularly</span>
<span class="Comment">useful rules, even if the underlying floating point arithmetic is IEEE</span>
<span class="Comment">compliant. Generally, when an overflow/underflow occurs, the resulting value</span>
<span class="Comment">is unpredicatble, although typically when overflow occurs in computing a value</span>
<span class="Comment">x, the result is non-finite (i.e., IsFinite(&x) == 0). Note, however, that</span>
<span class="Comment">some care is taken to ensure that the ZZ to quad_float conversion routine</span>
<span class="Comment">produces a non-finite value upon overflow.</span>
<span class="Comment">THE INTEL x86 PROBLEM</span>
<span class="Comment">Although just about every modern processor implements the IEEE</span>
<span class="Comment">floating point standard, there still can be problems</span>
<span class="Comment">on processors that support IEEE extended double precision.</span>
<span class="Comment">The only processor I know of that supports this is the x86/Pentium.</span>
<span class="Comment">While extended double precision may sound like a nice thing,</span>
<span class="Comment">it is not. Normal double precision has 53 bits of precision.</span>
<span class="Comment">Extended has 64. On x86s, the FP registers have 53 or 64 bits</span>
<span class="Comment">of precision---this can be set at run-time by modifying</span>
<span class="Comment">the cpu "control word" (something that can be done</span>
<span class="Comment">only in assembly code).</span>
<span class="Comment">However, doubles stored in memory always have only 53 bits.</span>
<span class="Comment">Compilers may move values between memory and registers</span>
<span class="Comment">whenever they want, which can effectively change the value</span>
<span class="Comment">of a floating point number even though at the C/C++ level,</span>
<span class="Comment">nothing has happened that should have changed the value.</span>
<span class="Comment">Is that sick, or what?</span>
<span class="Comment">Actually, the new C99 standard seems to outlaw such "spontaneous"</span>
<span class="Comment">value changes; however, this behavior is not necessarily</span>
<span class="Comment">universally implemented.</span>
<span class="Comment">This is a real headache, and if one is not just a bit careful,</span>
<span class="Comment">the quad_float code will break. This breaking is not at all subtle,</span>
<span class="Comment">and the program QuadTest will catch the problem if it exists.</span>
<span class="Comment">You should not need to worry about any of this, because NTL automatically</span>
<span class="Comment">detects and works around these problems as best it can, as described below.</span>
<span class="Comment">It shouldn't make a mistake, but if it does, you will</span>
<span class="Comment">catch it in the QuadTest program.</span>
<span class="Comment">If things don't work quite right, you might try</span>
<span class="Comment">setting NTL_FIX_X86 or NTL_NO_FIX_X86 flags in ntl_config.h,</span>
<span class="Comment">but this should not be necessary.</span>
<span class="Comment">Here are the details about how NTL fixes the problem.</span>
<span class="Comment">The first and best way is to have the default setting of the control word</span>
<span class="Comment">be 53 bits. However, you are at the mercy of your platform</span>
<span class="Comment">(compiler, OS, run-time libraries). Windows does this,</span>
<span class="Comment">and so the problem simply does not arise here, and NTL neither</span>
<span class="Comment">detects nor fixes the problem. Linux, however, does not do this,</span>
<span class="Comment">which really sucks. Can we talk these Linux people into changing this?</span>
<span class="Comment">The second way to fix the problem is by having NTL </span>
<span class="Comment">fiddle with control word itself. If you compile NTL using a GNU compiler</span>
<span class="Comment">on an x86, this should happen automatically.</span>
<span class="Comment">On the one hand, this is not a general, portable solution,</span>
<span class="Comment">since it will only work if you use a GNU compiler, or at least one that</span>
<span class="Comment">supports GNU 'asm' syntax. </span>
<span class="Comment">On the other hand, almost everybody who compiles C++ on x86/Linux</span>
<span class="Comment">platforms uses GNU compilers (although there are some commercial</span>
<span class="Comment">compilers out there that I don't know too much about).</span>
<span class="Comment">The third way to fix the problem is to 'force' all intermediate</span>
<span class="Comment">floating point results into memory. This is not an 'ideal' fix,</span>
<span class="Comment">since it is not fully equivalent to 53-bit precision (because of </span>
<span class="Comment">double rounding), but it works (although to be honest, I've never seen</span>
<span class="Comment">a full proof of correctness in this case).</span>
<span class="Comment">NTL's quad_float code does this by storing intermediate results</span>
<span class="Comment">in local variables declared to be 'volatile'.</span>
<span class="Comment">This is the solution to the problem that NTL uses if it detects</span>
<span class="Comment">the problem and can't fix it using the GNU 'asm' hack mentioned above.</span>
<span class="Comment">This solution should work on any platform that faithfully</span>
<span class="Comment">implements 'volatile' according to the ANSI C standard.</span>
<span class="Comment">BACKGROUND INFO</span>
<span class="Comment">The code NTL uses algorithms designed by Knuth, Kahan, Dekker, and</span>
<span class="Comment">Linnainmaa. The original transcription to C++ was done by Douglas</span>
<span class="Comment">Priest. Enhancements and bug fixes were done by Keith Briggs</span>
<span class="Comment">(<a href="http://epidem13.plantsci.cam.ac.uk/~kbriggs)">http://epidem13.plantsci.cam.ac.uk/~kbriggs)</a>. The NTL version is a</span>
<span class="Comment">stripped down version of Briggs' code, with a couple of bug fixes and</span>
<span class="Comment">portability improvements. Briggs has continued to develop his</span>
<span class="Comment">library; see his web page above for the latest version and more information.</span>
<span class="Comment">Here is a brief annotated bibliography (compiled by Priest) of papers </span>
<span class="Comment">dealing with DP and similar techniques, arranged chronologically.</span>
<span class="Comment">Kahan, W., Further Remarks on Reducing Truncation Errors,</span>
<span class="Comment"> {\it Comm.\ ACM\/} {\bf 8} (1965), 40.</span>
<span class="Comment">M{\o}ller, O., Quasi Double Precision in Floating-Point Addition,</span>
<span class="Comment"> {\it BIT\/} {\bf 5} (1965), 37--50.</span>
<span class="Comment"> The two papers that first presented the idea of recovering the</span>
<span class="Comment"> roundoff of a sum.</span>
<span class="Comment">Dekker, T., A Floating-Point Technique for Extending the Available</span>
<span class="Comment"> Precision, {\it Numer.\ Math.} {\bf 18} (1971), 224--242.</span>
<span class="Comment"> The classic reference for DP algorithms for sum, product, quotient,</span>
<span class="Comment"> and square root.</span>
<span class="Comment">Pichat, M., Correction d'une Somme en Arithmetique \`a Virgule</span>
<span class="Comment"> Flottante, {\it Numer.\ Math.} {\bf 19} (1972), 400--406.</span>
<span class="Comment"> An iterative algorithm for computing a protracted sum to working</span>
<span class="Comment"> precision by repeatedly applying the sum-and-roundoff method.</span>
<span class="Comment">Linnainmaa, S., Analysis of Some Known Methods of Improving the Accuracy</span>
<span class="Comment"> of Floating-Point Sums, {\it BIT\/} {\bf 14} (1974), 167--202.</span>
<span class="Comment"> Comparison of Kahan and M{\o}ller algorithms with variations given</span>
<span class="Comment"> by Knuth.</span>
<span class="Comment">Bohlender, G., Floating-Point Computation of Functions with Maximum</span>
<span class="Comment"> Accuracy, {\it IEEE Trans.\ Comput.} {\bf C-26} (1977), 621--632.</span>
<span class="Comment"> Extended the analysis of Pichat's algorithm to compute a multi-word</span>
<span class="Comment"> representation of the exact sum of n working precision numbers.</span>
<span class="Comment"> This is the algorithm Kahan has called "distillation".</span>
<span class="Comment">Linnainmaa, S., Software for Doubled-Precision Floating-Point Computations,</span>
<span class="Comment"> {\it ACM Trans.\ Math.\ Soft.} {\bf 7} (1981), 272--283.</span>
<span class="Comment"> Generalized the hypotheses of Dekker and showed how to take advantage</span>
<span class="Comment"> of extended precision where available.</span>
<span class="Comment">Leuprecht, H., and W.~Oberaigner, Parallel Algorithms for the Rounding-Exact</span>
<span class="Comment"> Summation of Floating-Point Numbers, {\it Computing} {\bf 28} (1982), 89--104.</span>
<span class="Comment"> Variations of distillation appropriate for parallel and vector</span>
<span class="Comment"> architectures.</span>
<span class="Comment">Kahan, W., Paradoxes in Concepts of Accuracy, lecture notes from Joint</span>
<span class="Comment"> Seminar on Issues and Directions in Scientific Computation, Berkeley, 1989.</span>
<span class="Comment"> Gives the more accurate DP sum I've shown above, discusses some</span>
<span class="Comment"> examples.</span>
<span class="Comment">Priest, D., Algorithms for Arbitrary Precision Floating Point Arithmetic,</span>
<span class="Comment"> in P.~Kornerup and D.~Matula, Eds., {\it Proc.\ 10th Symposium on Com-</span>
<span class="Comment"> puter Arithmetic}, IEEE Computer Society Press, Los Alamitos, Calif., 1991.</span>
<span class="Comment"> Extends from DP to arbitrary precision; gives portable algorithms and</span>
<span class="Comment"> general proofs.</span>
<span class="Comment">Sorensen, D., and P.~Tang, On the Orthogonality of Eigenvectors Computed</span>
<span class="Comment"> by Divide-and-Conquer Techniques, {\it SIAM J.\ Num.\ Anal.} {\bf 28}</span>
<span class="Comment"> (1991), 1752--1775.</span>
<span class="Comment"> Uses some DP arithmetic to retain orthogonality of eigenvectors</span>
<span class="Comment"> computed by a parallel divide-and-conquer scheme.</span>
<span class="Comment">Priest, D., On Properties of Floating Point Arithmetics: Numerical Stability</span>
<span class="Comment"> and the Cost of Accurate Computations, Ph.D. dissertation, University</span>
<span class="Comment"> of California at Berkeley, 1992.</span>
<span class="Comment"> More examples, organizes proofs in terms of common properties of fp</span>
<span class="Comment"> addition/subtraction, gives other summation algorithms.</span>
<span class="Comment">Another relevant paper: </span>
<span class="Comment">X. S. Li, et al.</span>
<span class="Comment">Design, implementation, and testing of extended and mixed </span>
<span class="Comment">precision BLAS. ACM Trans. Math. Soft., 28:152-205, 2002.</span>
<span class="Comment">\**********************************************************************</span><span class="Comment">*/</span>
</pre>
</body>
</html>
|