1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "reference_math.h"
18 #include "harness/compat.h"
19
20 #include <climits>
21
22 #if !defined(_WIN32)
23 #include <cstring>
24 #endif
25
26 #include "utility.h"
27
28 #if defined(__SSE__) \
29 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
30 #include <xmmintrin.h>
31 #endif
32 #if defined(__SSE2__) \
33 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
34 #include <emmintrin.h>
35 #endif
36
37 #ifndef M_PI_4
38 #define M_PI_4 (M_PI / 4)
39 #endif
40
41 #pragma STDC FP_CONTRACT OFF
42 static void __log2_ep(double *hi, double *lo, double x);
43
44 union uint64d_t {
45 uint64_t i;
46 double d;
47 };
48
49 static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
50
51 #define cl_make_nan() _CL_NAN.d
52
reduce1(double x)53 static double reduce1(double x)
54 {
55 if (fabs(x) >= HEX_DBL(+, 1, 0, +, 53))
56 {
57 if (fabs(x) == INFINITY) return cl_make_nan();
58
59 return 0.0; // we patch up the sign for sinPi and cosPi later, since
60 // they need different signs
61 }
62
63 // Find the nearest multiple of 2
64 const double r = copysign(HEX_DBL(+, 1, 0, +, 53), x);
65 double z = x + r;
66 z -= r;
67
68 // subtract it from x. Value is now in the range -1 <= x <= 1
69 return x - z;
70 }
71
reference_acospi(double x)72 double reference_acospi(double x) { return reference_acos(x) / M_PI; }
reference_asinpi(double x)73 double reference_asinpi(double x) { return reference_asin(x) / M_PI; }
reference_atanpi(double x)74 double reference_atanpi(double x) { return reference_atan(x) / M_PI; }
reference_atan2pi(double y,double x)75 double reference_atan2pi(double y, double x)
76 {
77 return reference_atan2(y, x) / M_PI;
78 }
reference_cospi(double x)79 double reference_cospi(double x)
80 {
81 if (reference_fabs(x) >= HEX_DBL(+, 1, 0, +, 52))
82 {
83 if (reference_fabs(x) == INFINITY) return cl_make_nan();
84
85 // Note this probably fails for odd values between 0x1.0p52 and
86 // 0x1.0p53. However, when starting with single precision inputs, there
87 // will be no odd values.
88
89 return 1.0;
90 }
91
92 x = reduce1(x + 0.5);
93
94 // reduce to [-0.5, 0.5]
95 if (x < -0.5)
96 x = -1 - x;
97 else if (x > 0.5)
98 x = 1 - x;
99
100 // cosPi zeros are all +0
101 if (x == 0.0) return 0.0;
102
103 return reference_sin(x * M_PI);
104 }
105
reference_relaxed_cospi(double x)106 double reference_relaxed_cospi(double x) { return reference_cospi(x); }
107
reference_relaxed_divide(double x,double y)108 double reference_relaxed_divide(double x, double y)
109 {
110 return (float)(((float)x) / ((float)y));
111 }
112
reference_divide(double x,double y)113 double reference_divide(double x, double y) { return x / y; }
114
115 // Add a + b. If the result modulo overflowed, write 1 to *carry, otherwise 0
add_carry(cl_ulong a,cl_ulong b,cl_ulong * carry)116 static inline cl_ulong add_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
117 {
118 cl_ulong result = a + b;
119 *carry = result < a;
120 return result;
121 }
122
123 // Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise
124 // 0
sub_carry(cl_ulong a,cl_ulong b,cl_ulong * carry)125 static inline cl_ulong sub_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
126 {
127 cl_ulong result = a - b;
128 *carry = result > a;
129 return result;
130 }
131
fallback_frexpf(float x,int * iptr)132 static float fallback_frexpf(float x, int *iptr)
133 {
134 cl_uint u, v;
135 float fu, fv;
136
137 memcpy(&u, &x, sizeof(u));
138
139 cl_uint exponent = u & 0x7f800000U;
140 cl_uint mantissa = u & ~0x7f800000U;
141
142 // add 1 to the exponent
143 exponent += 0x00800000U;
144
145 if ((cl_int)exponent < (cl_int)0x01000000)
146 { // subnormal, NaN, Inf
147 mantissa |= 0x3f000000U;
148
149 v = mantissa & 0xff800000U;
150 u = mantissa;
151 memcpy(&fv, &v, sizeof(v));
152 memcpy(&fu, &u, sizeof(u));
153
154 fu -= fv;
155
156 memcpy(&v, &fv, sizeof(v));
157 memcpy(&u, &fu, sizeof(u));
158
159 exponent = u & 0x7f800000U;
160 mantissa = u & ~0x7f800000U;
161
162 *iptr = (exponent >> 23) + (-126 + 1 - 126);
163 u = mantissa | 0x3f000000U;
164 memcpy(&fu, &u, sizeof(u));
165 return fu;
166 }
167
168 *iptr = (exponent >> 23) - 127;
169 u = mantissa | 0x3f000000U;
170 memcpy(&fu, &u, sizeof(u));
171 return fu;
172 }
173
extractf(float x,cl_uint * mant)174 static inline int extractf(float x, cl_uint *mant)
175 {
176 static float (*frexppf)(float, int *) = NULL;
177 int e;
178
179 // verify that frexp works properly
180 if (NULL == frexppf)
181 {
182 if (0.5f == frexpf(HEX_FLT(+, 1, 0, -, 130), &e) && e == -129)
183 frexppf = frexpf;
184 else
185 frexppf = fallback_frexpf;
186 }
187
188 *mant = (cl_uint)(HEX_FLT(+, 1, 0, +, 32) * fabsf(frexppf(x, &e)));
189 return e - 1;
190 }
191
192 // Shift right by shift bits. Any bits lost on the right side are bitwise OR'd
193 // together and ORd into the LSB of the result
shift_right_sticky_64(cl_ulong * p,int shift)194 static inline void shift_right_sticky_64(cl_ulong *p, int shift)
195 {
196 cl_ulong sticky = 0;
197 cl_ulong r = *p;
198
199 // C doesn't handle shifts greater than the size of the variable dependably
200 if (shift >= 64)
201 {
202 sticky |= (0 != r);
203 r = 0;
204 }
205 else
206 {
207 sticky |= (0 != (r << (64 - shift)));
208 r >>= shift;
209 }
210
211 *p = r | sticky;
212 }
213
214 // Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd
215 // into the LSB of the result
add64(cl_ulong * p,cl_ulong c,int * exponent)216 static inline void add64(cl_ulong *p, cl_ulong c, int *exponent)
217 {
218 cl_ulong carry;
219 c = add_carry(c, *p, &carry);
220 if (carry)
221 {
222 carry = c & 1; // set aside sticky bit
223 c >>= 1; // right shift to deal with overflow
224 c |= carry
225 | 0x8000000000000000ULL; // or in carry bit, and sticky bit. The
226 // latter is to prevent rounding from
227 // believing we are exact half way case
228 *exponent = *exponent + 1; // adjust exponent
229 }
230
231 *p = c;
232 }
233
234 // IEEE-754 round to nearest, ties to even rounding
round_to_nearest_even_float(cl_ulong p,int exponent)235 static float round_to_nearest_even_float(cl_ulong p, int exponent)
236 {
237 union {
238 cl_uint u;
239 cl_float d;
240 } u;
241
242 // If mantissa is zero, return 0.0f
243 if (p == 0) return 0.0f;
244
245 // edges
246 if (exponent > 127)
247 {
248 volatile float r = exponent * CL_FLT_MAX; // signal overflow
249
250 // attempt to fool the compiler into not optimizing the above line away
251 if (r > CL_FLT_MAX) return INFINITY;
252
253 return r;
254 }
255 if (exponent == -150 && p > 0x8000000000000000ULL)
256 return HEX_FLT(+, 1, 0, -, 149);
257 if (exponent <= -150) return 0.0f;
258
259 // Figure out which bits go where
260 int shift = 8 + 32;
261 if (exponent < -126)
262 {
263 shift -= 126 + exponent; // subnormal: shift is not 52
264 exponent = -127; // set exponent to 0
265 }
266 else
267 p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
268 // it.
269
270 // Assemble the double (round toward zero)
271 u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
272
273 // put a representation of the residual bits into hi
274 p <<= (64 - shift);
275
276 // round to nearest, ties to even based on the unused portion of p
277 if (p < 0x8000000000000000ULL) return u.d;
278 if (p == 0x8000000000000000ULL)
279 u.u += u.u & 1U;
280 else
281 u.u++;
282
283 return u.d;
284 }
285
round_to_nearest_even_float_ftz(cl_ulong p,int exponent)286 static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent)
287 {
288 extern int gCheckTininessBeforeRounding;
289
290 union {
291 cl_uint u;
292 cl_float d;
293 } u;
294 int shift = 8 + 32;
295
296 // If mantissa is zero, return 0.0f
297 if (p == 0) return 0.0f;
298
299 // edges
300 if (exponent > 127)
301 {
302 volatile float r = exponent * CL_FLT_MAX; // signal overflow
303
304 // attempt to fool the compiler into not optimizing the above line away
305 if (r > CL_FLT_MAX) return INFINITY;
306
307 return r;
308 }
309
310 // Deal with FTZ for gCheckTininessBeforeRounding
311 if (exponent < (gCheckTininessBeforeRounding - 127)) return 0.0f;
312
313 if (exponent
314 == -127) // only happens for machines that check tininess after rounding
315 p = (p & 1) | (p >> 1);
316 else
317 p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
318 // it.
319
320 cl_ulong q = p;
321
322
323 // Assemble the double (round toward zero)
324 u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
325
326 // put a representation of the residual bits into hi
327 q <<= (64 - shift);
328
329 // round to nearest, ties to even based on the unused portion of p
330 if (q > 0x8000000000000000ULL)
331 u.u++;
332 else if (q == 0x8000000000000000ULL)
333 u.u += u.u & 1U;
334
335 // Deal with FTZ for ! gCheckTininessBeforeRounding
336 if (0 == (u.u & 0x7f800000U)) return 0.0f;
337
338 return u.d;
339 }
340
341
342 // IEEE-754 round toward zero.
round_toward_zero_float(cl_ulong p,int exponent)343 static float round_toward_zero_float(cl_ulong p, int exponent)
344 {
345 union {
346 cl_uint u;
347 cl_float d;
348 } u;
349
350 // If mantissa is zero, return 0.0f
351 if (p == 0) return 0.0f;
352
353 // edges
354 if (exponent > 127)
355 {
356 volatile float r = exponent * CL_FLT_MAX; // signal overflow
357
358 // attempt to fool the compiler into not optimizing the above line away
359 if (r > CL_FLT_MAX) return CL_FLT_MAX;
360
361 return r;
362 }
363
364 if (exponent <= -149) return 0.0f;
365
366 // Figure out which bits go where
367 int shift = 8 + 32;
368 if (exponent < -126)
369 {
370 shift -= 126 + exponent; // subnormal: shift is not 52
371 exponent = -127; // set exponent to 0
372 }
373 else
374 p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
375 // it.
376
377 // Assemble the double (round toward zero)
378 u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
379
380 return u.d;
381 }
382
round_toward_zero_float_ftz(cl_ulong p,int exponent)383 static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
384 {
385 union {
386 cl_uint u;
387 cl_float d;
388 } u;
389 int shift = 8 + 32;
390
391 // If mantissa is zero, return 0.0f
392 if (p == 0) return 0.0f;
393
394 // edges
395 if (exponent > 127)
396 {
397 volatile float r = exponent * CL_FLT_MAX; // signal overflow
398
399 // attempt to fool the compiler into not optimizing the above line away
400 if (r > CL_FLT_MAX) return CL_FLT_MAX;
401
402 return r;
403 }
404
405 // Deal with FTZ for gCheckTininessBeforeRounding
406 if (exponent < -126) return 0.0f;
407
408 cl_ulong q = p &=
409 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
410
411 // Assemble the double (round toward zero)
412 u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
413
414 // put a representation of the residual bits into hi
415 q <<= (64 - shift);
416
417 return u.d;
418 }
419
420 // Subtract two significands.
sub64(cl_ulong * c,cl_ulong p,cl_uint * signC,int * expC)421 static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC)
422 {
423 cl_ulong carry;
424 p = sub_carry(*c, p, &carry);
425
426 if (carry)
427 {
428 *signC ^= 0x80000000U;
429 p = -p;
430 }
431
432 // normalize
433 if (p)
434 {
435 int shift = 32;
436 cl_ulong test = 1ULL << 32;
437 while (0 == (p & 0x8000000000000000ULL))
438 {
439 if (p < test)
440 {
441 p <<= shift;
442 *expC = *expC - shift;
443 }
444 shift >>= 1;
445 test <<= shift;
446 }
447 }
448 else
449 {
450 // zero result.
451 *expC = -200;
452 *signC =
453 0; // IEEE rules say a - a = +0 for all rounding modes except -inf
454 }
455
456 *c = p;
457 }
458
459
reference_fma(float a,float b,float c,int shouldFlush)460 float reference_fma(float a, float b, float c, int shouldFlush)
461 {
462 static const cl_uint kMSB = 0x80000000U;
463
464 // Make bits accessible
465 union {
466 cl_uint u;
467 cl_float d;
468 } ua;
469 ua.d = a;
470 union {
471 cl_uint u;
472 cl_float d;
473 } ub;
474 ub.d = b;
475 union {
476 cl_uint u;
477 cl_float d;
478 } uc;
479 uc.d = c;
480
481 // deal with Nans, infinities and zeros
482 if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
483 || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
484 0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
485 0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
486 {
487 FPU_mode_type oldMode;
488 RoundingMode oldRoundMode = kRoundToNearestEven;
489 if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
490
491 if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
492
493 memset(&oldMode, 0, sizeof(oldMode));
494 if (shouldFlush) ForceFTZ(&oldMode);
495
496 a = (float)reference_multiply(
497 a, b); // some risk that the compiler will insert a non-compliant
498 // fma here on some platforms.
499 a = (float)reference_add(
500 a,
501 c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
502
503 if (shouldFlush) RestoreFPState(&oldMode);
504
505 if (gIsInRTZMode) set_round(oldRoundMode, kfloat);
506 return a;
507 }
508
509 // extract exponent and mantissa
510 // exponent is a standard unbiased signed integer
511 // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
512 cl_uint mantA, mantB, mantC;
513 int expA = extractf(a, &mantA);
514 int expB = extractf(b, &mantB);
515 int expC = extractf(c, &mantC);
516 cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide
517 // if we are adding or subtracting
518
519 // exact product of A and B
520 int exponent = expA + expB;
521 cl_uint sign = (ua.u ^ ub.u) & kMSB;
522 cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
523
524 // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
525 // The MSB might not be set. If so, fix that. Otherwise, reflect the fact
526 // that we got another power of two from the multiplication
527 if (0 == (0x8000000000000000ULL & product))
528 product <<= 1;
529 else
530 exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our
531 // exponent increased.
532
533 // infinite precision add
534 cl_ulong addend = (cl_ulong)mantC << 32;
535 if (exponent >= expC)
536 {
537 // Shift C relative to the product so that their exponents match
538 if (exponent > expC) shift_right_sticky_64(&addend, exponent - expC);
539
540 // Add
541 if (sign ^ signC)
542 sub64(&product, addend, &sign, &exponent);
543 else
544 add64(&product, addend, &exponent);
545 }
546 else
547 {
548 // Shift the product relative to C so that their exponents match
549 shift_right_sticky_64(&product, expC - exponent);
550
551 // add
552 if (sign ^ signC)
553 sub64(&addend, product, &signC, &expC);
554 else
555 add64(&addend, product, &expC);
556
557 product = addend;
558 exponent = expC;
559 sign = signC;
560 }
561
562 // round to IEEE result -- we do not do flushing to zero here. That part is
563 // handled manually in ternary.c.
564 if (gIsInRTZMode)
565 {
566 if (shouldFlush)
567 ua.d = round_toward_zero_float_ftz(product, exponent);
568 else
569 ua.d = round_toward_zero_float(product, exponent);
570 }
571 else
572 {
573 if (shouldFlush)
574 ua.d = round_to_nearest_even_float_ftz(product, exponent);
575 else
576 ua.d = round_to_nearest_even_float(product, exponent);
577 }
578
579 // Set the sign
580 ua.u |= sign;
581
582 return ua.d;
583 }
584
reference_relaxed_exp10(double x)585 double reference_relaxed_exp10(double x) { return reference_exp10(x); }
586
reference_exp10(double x)587 double reference_exp10(double x)
588 {
589 return reference_exp2(x * HEX_DBL(+, 1, a934f0979a371, +, 1));
590 }
591
592
reference_ilogb(double x)593 int reference_ilogb(double x)
594 {
595 extern int gDeviceILogb0, gDeviceILogbNaN;
596 union {
597 cl_double f;
598 cl_ulong u;
599 } u;
600
601 u.f = (float)x;
602 cl_int exponent = (cl_int)(u.u >> 52) & 0x7ff;
603 if (exponent == 0x7ff)
604 {
605 if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
606
607 return CL_INT_MAX;
608 }
609
610 if (exponent == 0)
611 { // deal with denormals
612 u.f = x * HEX_DBL(+, 1, 0, +, 64);
613 exponent = (cl_int)(u.u >> 52) & 0x7ff;
614 if (exponent == 0) return gDeviceILogb0;
615
616 return exponent - (1023 + 64);
617 }
618
619 return exponent - 1023;
620 }
621
reference_nan(cl_uint x)622 double reference_nan(cl_uint x)
623 {
624 union {
625 cl_uint u;
626 cl_float f;
627 } u;
628 u.u = x | 0x7fc00000U;
629 return (double)u.f;
630 }
631
reference_maxmag(double x,double y)632 double reference_maxmag(double x, double y)
633 {
634 double fabsx = fabs(x);
635 double fabsy = fabs(y);
636
637 if (fabsx < fabsy) return y;
638
639 if (fabsy < fabsx) return x;
640
641 return reference_fmax(x, y);
642 }
643
reference_minmag(double x,double y)644 double reference_minmag(double x, double y)
645 {
646 double fabsx = fabs(x);
647 double fabsy = fabs(y);
648
649 if (fabsx > fabsy) return y;
650
651 if (fabsy > fabsx) return x;
652
653 return reference_fmin(x, y);
654 }
655
reference_relaxed_mad(double a,double b,double c)656 double reference_relaxed_mad(double a, double b, double c)
657 {
658 return ((float)a) * ((float)b) + (float)c;
659 }
660
reference_mad(double a,double b,double c)661 double reference_mad(double a, double b, double c) { return a * b + c; }
662
reference_recip(double x)663 double reference_recip(double x) { return 1.0 / x; }
reference_rootn(double x,int i)664 double reference_rootn(double x, int i)
665 {
666
667 // rootn ( x, 0 ) returns a NaN.
668 if (0 == i) return cl_make_nan();
669
670 // rootn ( x, n ) returns a NaN for x < 0 and n is even.
671 if (x < 0 && 0 == (i & 1)) return cl_make_nan();
672
673 if (x == 0.0)
674 {
675 switch (i & 0x80000001)
676 {
677 // rootn ( +-0, n ) is +0 for even n > 0.
678 case 0: return 0.0f;
679
680 // rootn ( +-0, n ) is +-0 for odd n > 0.
681 case 1: return x;
682
683 // rootn ( +-0, n ) is +inf for even n < 0.
684 case 0x80000000: return INFINITY;
685
686 // rootn ( +-0, n ) is +-inf for odd n < 0.
687 case 0x80000001: return copysign(INFINITY, x);
688 }
689 }
690
691 double sign = x;
692 x = reference_fabs(x);
693 x = reference_exp2(reference_log2(x) / (double)i);
694 return reference_copysignd(x, sign);
695 }
696
reference_rsqrt(double x)697 double reference_rsqrt(double x) { return 1.0 / reference_sqrt(x); }
698
reference_sinpi(double x)699 double reference_sinpi(double x)
700 {
701 double r = reduce1(x);
702
703 // reduce to [-0.5, 0.5]
704 if (r < -0.5)
705 r = -1 - r;
706 else if (r > 0.5)
707 r = 1 - r;
708
709 // sinPi zeros have the same sign as x
710 if (r == 0.0) return reference_copysignd(0.0, x);
711
712 return reference_sin(r * M_PI);
713 }
714
reference_relaxed_sinpi(double x)715 double reference_relaxed_sinpi(double x) { return reference_sinpi(x); }
716
reference_tanpi(double x)717 double reference_tanpi(double x)
718 {
719 // set aside the sign (allows us to preserve sign of -0)
720 double sign = reference_copysignd(1.0, x);
721 double z = reference_fabs(x);
722
723 // if big and even -- caution: only works if x only has single precision
724 if (z >= HEX_DBL(+, 1, 0, +, 24))
725 {
726 if (z == INFINITY) return x - x; // nan
727
728 return reference_copysignd(
729 0.0, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n.
730 }
731
732 // reduce to the range [ -0.5, 0.5 ]
733 double nearest = reference_rint(z); // round to nearest even places n + 0.5
734 // values in the right place for us
735 int i = (int)nearest; // test above against 0x1.0p24 avoids overflow here
736 z -= nearest;
737
738 // correction for odd integer x for the right sign of zero
739 if ((i & 1) && z == 0.0) sign = -sign;
740
741 // track changes to the sign
742 sign *= reference_copysignd(1.0, z); // really should just be an xor
743 z = reference_fabs(z); // remove the sign again
744
745 // reduce once more
746 // If we don't do this, rounding error in z * M_PI will cause us not to
747 // return infinities properly
748 if (z > 0.25)
749 {
750 z = 0.5 - z;
751 return sign
752 / reference_tan(z * M_PI); // use system tan to get the right result
753 }
754
755 //
756 return sign
757 * reference_tan(z * M_PI); // use system tan to get the right result
758 }
759
reference_pown(double x,int i)760 double reference_pown(double x, int i) { return reference_pow(x, (double)i); }
reference_powr(double x,double y)761 double reference_powr(double x, double y)
762 {
763 // powr ( x, y ) returns NaN for x < 0.
764 if (x < 0.0) return cl_make_nan();
765
766 // powr ( x, NaN ) returns the NaN for x >= 0.
767 // powr ( NaN, y ) returns the NaN.
768 if (isnan(x) || isnan(y))
769 return x + y; // Note: behavior different here than for pow(1,NaN),
770 // pow(NaN, 0)
771
772 if (x == 1.0)
773 {
774 // powr ( +1, +-inf ) returns NaN.
775 if (reference_fabs(y) == INFINITY) return cl_make_nan();
776
777 // powr ( +1, y ) is 1 for finite y. (NaN handled above)
778 return 1.0;
779 }
780
781 if (y == 0.0)
782 {
783 // powr ( +inf, +-0 ) returns NaN.
784 // powr ( +-0, +-0 ) returns NaN.
785 if (x == 0.0 || x == INFINITY) return cl_make_nan();
786
787 // powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already
788 // handled above)
789 return 1.0;
790 }
791
792 if (x == 0.0)
793 {
794 // powr ( +-0, -inf) is +inf.
795 // powr ( +-0, y ) is +inf for finite y < 0.
796 if (y < 0.0) return INFINITY;
797
798 // powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above)
799 return 0.0;
800 }
801
802 // x = +inf
803 if (isinf(x))
804 {
805 if (y < 0) return 0;
806 return INFINITY;
807 }
808
809 double fabsx = reference_fabs(x);
810 double fabsy = reference_fabs(y);
811
812 // y = +-inf cases
813 if (isinf(fabsy))
814 {
815 if (y < 0)
816 {
817 if (fabsx < 1) return INFINITY;
818 return 0;
819 }
820 if (fabsx < 1) return 0;
821 return INFINITY;
822 }
823
824 double hi, lo;
825 __log2_ep(&hi, &lo, x);
826 double prod = y * hi;
827 double result = reference_exp2(prod);
828
829 return result;
830 }
831
reference_fract(double x,double * ip)832 double reference_fract(double x, double *ip)
833 {
834 if (isnan(x))
835 {
836 *ip = cl_make_nan();
837 return cl_make_nan();
838 }
839
840 float i;
841 float f = modff((float)x, &i);
842 if (f < 0.0)
843 {
844 f = 1.0f + f;
845 i -= 1.0f;
846 if (f == 1.0f) f = HEX_FLT(+, 1, fffffe, -, 1);
847 }
848 *ip = i;
849 return f;
850 }
851
852
reference_add(double x,double y)853 double reference_add(double x, double y)
854 {
855 volatile float a = (float)x;
856 volatile float b = (float)y;
857
858 #if defined(__SSE__) \
859 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
860 // defeat x87
861 __m128 va = _mm_set_ss((float)a);
862 __m128 vb = _mm_set_ss((float)b);
863 va = _mm_add_ss(va, vb);
864 _mm_store_ss((float *)&a, va);
865 #elif defined(__PPC__)
866 // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
867 // denorm's to zero. As such, the reference add with FTZ must be emulated in
868 // sw.
869 if (fpu_control & _FPU_MASK_NI)
870 {
871 union {
872 cl_uint u;
873 cl_float d;
874 } ua;
875 ua.d = a;
876 union {
877 cl_uint u;
878 cl_float d;
879 } ub;
880 ub.d = b;
881 cl_uint mantA, mantB;
882 cl_ulong addendA, addendB, sum;
883 int expA = extractf(a, &mantA);
884 int expB = extractf(b, &mantB);
885 cl_uint signA = ua.u & 0x80000000U;
886 cl_uint signB = ub.u & 0x80000000U;
887
888 // Force matching exponents if an operand is 0
889 if (a == 0.0f)
890 {
891 expA = expB;
892 }
893 else if (b == 0.0f)
894 {
895 expB = expA;
896 }
897
898 addendA = (cl_ulong)mantA << 32;
899 addendB = (cl_ulong)mantB << 32;
900
901 if (expA >= expB)
902 {
903 // Shift B relative to the A so that their exponents match
904 if (expA > expB) shift_right_sticky_64(&addendB, expA - expB);
905
906 // add
907 if (signA ^ signB)
908 sub64(&addendA, addendB, &signA, &expA);
909 else
910 add64(&addendA, addendB, &expA);
911 }
912 else
913 {
914 // Shift the A relative to B so that their exponents match
915 shift_right_sticky_64(&addendA, expB - expA);
916
917 // add
918 if (signA ^ signB)
919 sub64(&addendB, addendA, &signB, &expB);
920 else
921 add64(&addendB, addendA, &expB);
922
923 addendA = addendB;
924 expA = expB;
925 signA = signB;
926 }
927
928 // round to IEEE result
929 if (gIsInRTZMode)
930 {
931 ua.d = round_toward_zero_float_ftz(addendA, expA);
932 }
933 else
934 {
935 ua.d = round_to_nearest_even_float_ftz(addendA, expA);
936 }
937 // Set the sign
938 ua.u |= signA;
939 a = ua.d;
940 }
941 else
942 {
943 a += b;
944 }
945 #else
946 a += b;
947 #endif
948 return (double)a;
949 }
950
951
reference_subtract(double x,double y)952 double reference_subtract(double x, double y)
953 {
954 volatile float a = (float)x;
955 volatile float b = (float)y;
956 #if defined(__SSE__) \
957 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
958 // defeat x87
959 __m128 va = _mm_set_ss((float)a);
960 __m128 vb = _mm_set_ss((float)b);
961 va = _mm_sub_ss(va, vb);
962 _mm_store_ss((float *)&a, va);
963 #else
964 a -= b;
965 #endif
966 return a;
967 }
968
reference_multiply(double x,double y)969 double reference_multiply(double x, double y)
970 {
971 volatile float a = (float)x;
972 volatile float b = (float)y;
973 #if defined(__SSE__) \
974 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
975 // defeat x87
976 __m128 va = _mm_set_ss((float)a);
977 __m128 vb = _mm_set_ss((float)b);
978 va = _mm_mul_ss(va, vb);
979 _mm_store_ss((float *)&a, va);
980 #elif defined(__PPC__)
981 // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
982 // denorm's to zero. As such, the reference multiply with FTZ must be
983 // emulated in sw.
984 if (fpu_control & _FPU_MASK_NI)
985 {
986 // extract exponent and mantissa
987 // exponent is a standard unbiased signed integer
988 // mantissa is a cl_uint, with leading non-zero bit positioned at the
989 // MSB
990 union {
991 cl_uint u;
992 cl_float d;
993 } ua;
994 ua.d = a;
995 union {
996 cl_uint u;
997 cl_float d;
998 } ub;
999 ub.d = b;
1000 cl_uint mantA, mantB;
1001 int expA = extractf(a, &mantA);
1002 int expB = extractf(b, &mantB);
1003
1004 // exact product of A and B
1005 int exponent = expA + expB;
1006 cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
1007 cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
1008
1009 // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
1010 // The MSB might not be set. If so, fix that. Otherwise, reflect the
1011 // fact that we got another power of two from the multiplication
1012 if (0 == (0x8000000000000000ULL & product))
1013 product <<= 1;
1014 else
1015 exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then
1016 // our exponent increased.
1017
1018 // round to IEEE result -- we do not do flushing to zero here. That part
1019 // is handled manually in ternary.c.
1020 if (gIsInRTZMode)
1021 {
1022 ua.d = round_toward_zero_float_ftz(product, exponent);
1023 }
1024 else
1025 {
1026 ua.d = round_to_nearest_even_float_ftz(product, exponent);
1027 }
1028 // Set the sign
1029 ua.u |= sign;
1030 a = ua.d;
1031 }
1032 else
1033 {
1034 a *= b;
1035 }
1036 #else
1037 a *= b;
1038 #endif
1039 return a;
1040 }
1041
reference_lgamma_r(double x,int * signp)1042 double reference_lgamma_r(double x, int *signp)
1043 {
1044 // This is not currently tested
1045 *signp = 0;
1046 return x;
1047 }
1048
1049
reference_isequal(double x,double y)1050 int reference_isequal(double x, double y) { return x == y; }
reference_isfinite(double x)1051 int reference_isfinite(double x) { return 0 != isfinite(x); }
reference_isgreater(double x,double y)1052 int reference_isgreater(double x, double y) { return x > y; }
reference_isgreaterequal(double x,double y)1053 int reference_isgreaterequal(double x, double y) { return x >= y; }
reference_isinf(double x)1054 int reference_isinf(double x) { return 0 != isinf(x); }
reference_isless(double x,double y)1055 int reference_isless(double x, double y) { return x < y; }
reference_islessequal(double x,double y)1056 int reference_islessequal(double x, double y) { return x <= y; }
reference_islessgreater(double x,double y)1057 int reference_islessgreater(double x, double y)
1058 {
1059 return 0 != islessgreater(x, y);
1060 }
reference_isnan(double x)1061 int reference_isnan(double x) { return 0 != isnan(x); }
reference_isnormal(double x)1062 int reference_isnormal(double x) { return 0 != isnormal((float)x); }
reference_isnotequal(double x,double y)1063 int reference_isnotequal(double x, double y) { return x != y; }
reference_isordered(double x,double y)1064 int reference_isordered(double x, double y) { return x == x && y == y; }
reference_isunordered(double x,double y)1065 int reference_isunordered(double x, double y) { return isnan(x) || isnan(y); }
reference_signbit(float x)1066 int reference_signbit(float x) { return 0 != signbit(x); }
1067
1068 #if 1 // defined( _MSC_VER )
1069
1070 // Missing functions for win32
1071
1072
reference_copysign(float x,float y)1073 float reference_copysign(float x, float y)
1074 {
1075 union {
1076 float f;
1077 cl_uint u;
1078 } ux, uy;
1079 ux.f = x;
1080 uy.f = y;
1081 ux.u &= 0x7fffffffU;
1082 ux.u |= uy.u & 0x80000000U;
1083 return ux.f;
1084 }
1085
1086
reference_copysignd(double x,double y)1087 double reference_copysignd(double x, double y)
1088 {
1089 union {
1090 double f;
1091 cl_ulong u;
1092 } ux, uy;
1093 ux.f = x;
1094 uy.f = y;
1095 ux.u &= 0x7fffffffffffffffULL;
1096 ux.u |= uy.u & 0x8000000000000000ULL;
1097 return ux.f;
1098 }
1099
1100
reference_round(double x)1101 double reference_round(double x)
1102 {
1103 double absx = reference_fabs(x);
1104 if (absx < 0.5) return reference_copysignd(0.0, x);
1105
1106 if (absx < HEX_DBL(+, 1, 0, +, 53))
1107 x = reference_trunc(x + reference_copysignd(0.5, x));
1108
1109 return x;
1110 }
1111
reference_trunc(double x)1112 double reference_trunc(double x)
1113 {
1114 if (fabs(x) < HEX_DBL(+, 1, 0, +, 53))
1115 {
1116 cl_long l = (cl_long)x;
1117
1118 return reference_copysignd((double)l, x);
1119 }
1120
1121 return x;
1122 }
1123
1124 #ifndef FP_ILOGB0
1125 #define FP_ILOGB0 INT_MIN
1126 #endif
1127
1128 #ifndef FP_ILOGBNAN
1129 #define FP_ILOGBNAN INT_MAX
1130 #endif
1131
1132
reference_cbrt(double x)1133 double reference_cbrt(double x)
1134 {
1135 return reference_copysignd(reference_pow(reference_fabs(x), 1.0 / 3.0), x);
1136 }
1137
reference_rint(double x)1138 double reference_rint(double x)
1139 {
1140 if (reference_fabs(x) < HEX_DBL(+, 1, 0, +, 52))
1141 {
1142 double magic = reference_copysignd(HEX_DBL(+, 1, 0, +, 52), x);
1143 double rounded = (x + magic) - magic;
1144 x = reference_copysignd(rounded, x);
1145 }
1146
1147 return x;
1148 }
1149
reference_acosh(double x)1150 double reference_acosh(double x)
1151 { // not full precision. Sufficient precision to cover float
1152 if (isnan(x)) return x + x;
1153
1154 if (x < 1.0) return cl_make_nan();
1155
1156 return reference_log(x + reference_sqrt(x + 1) * reference_sqrt(x - 1));
1157 }
1158
reference_asinh(double x)1159 double reference_asinh(double x)
1160 {
1161 /*
1162 * ====================================================
1163 * This function is from fdlibm: http://www.netlib.org
1164 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1165 *
1166 * Developed at SunSoft, a Sun Microsystems, Inc. business.
1167 * Permission to use, copy, modify, and distribute this
1168 * software is freely granted, provided that this notice
1169 * is preserved.
1170 * ====================================================
1171 */
1172 if (isnan(x) || isinf(x)) return x + x;
1173
1174 double absx = reference_fabs(x);
1175 if (absx < HEX_DBL(+, 1, 0, -, 28)) return x;
1176
1177 double sign = reference_copysignd(1.0, x);
1178
1179 if (absx > HEX_DBL(+, 1, 0, +, 28))
1180 return sign
1181 * (reference_log(absx)
1182 + 0.693147180559945309417232121458176568); // log(2)
1183
1184 if (absx > 2.0)
1185 return sign
1186 * reference_log(2.0 * absx
1187 + 1.0 / (reference_sqrt(x * x + 1.0) + absx));
1188
1189 return sign
1190 * reference_log1p(absx + x * x / (1.0 + reference_sqrt(1.0 + x * x)));
1191 }
1192
1193
reference_atanh(double x)1194 double reference_atanh(double x)
1195 {
1196 /*
1197 * ====================================================
1198 * This function is from fdlibm: http://www.netlib.org
1199 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1200 *
1201 * Developed at SunSoft, a Sun Microsystems, Inc. business.
1202 * Permission to use, copy, modify, and distribute this
1203 * software is freely granted, provided that this notice
1204 * is preserved.
1205 * ====================================================
1206 */
1207 if (isnan(x)) return x + x;
1208
1209 double signed_half = reference_copysignd(0.5, x);
1210 x = reference_fabs(x);
1211 if (x > 1.0) return cl_make_nan();
1212
1213 if (x < 0.5)
1214 return signed_half * reference_log1p(2.0 * (x + x * x / (1 - x)));
1215
1216 return signed_half * reference_log1p(2.0 * x / (1 - x));
1217 }
1218
reference_relaxed_atan(double x)1219 double reference_relaxed_atan(double x) { return reference_atan(x); }
1220
reference_relaxed_exp2(double x)1221 double reference_relaxed_exp2(double x) { return reference_exp2(x); }
1222
reference_exp2(double x)1223 double reference_exp2(double x)
1224 { // Note: only suitable for verifying single precision. Doesn't have range of a
1225 // full double exp2 implementation.
1226 if (x == 0.0) return 1.0;
1227
1228 // separate x into fractional and integer parts
1229 double i = reference_rint(x); // round to nearest integer
1230
1231 if (i < -150) return 0.0;
1232
1233 if (i > 129) return INFINITY;
1234
1235 double f = x - i; // -0.5 <= f <= 0.5
1236
1237 // find exp2(f)
1238 // calculate as p(f) = (exp2(f)-1)/f
1239 // exp2(f) = f * p(f) + 1
1240 // p(f) is a minimax polynomial with error within 0x1.c1fd80f0d1ab7p-50
1241
1242 double p = 0.693147180560184539289
1243 + (0.240226506955902863183
1244 + (0.055504108656833424373
1245 + (0.009618129212846484796
1246 + (0.001333355902958566035
1247 + (0.000154034191902497930
1248 + (0.000015252317761038105
1249 + (0.000001326283129417092
1250 + 0.000000102593187638680 * f)
1251 * f)
1252 * f)
1253 * f)
1254 * f)
1255 * f)
1256 * f)
1257 * f;
1258 f *= p;
1259 f += 1.0;
1260
1261 // scale by 2 ** i
1262 union {
1263 cl_ulong u;
1264 double d;
1265 } u;
1266 int exponent = (int)i + 1023;
1267 u.u = (cl_ulong)exponent << 52;
1268
1269 return f * u.d;
1270 }
1271
1272
reference_expm1(double x)1273 double reference_expm1(double x)
1274 { // Note: only suitable for verifying single precision. Doesn't have range of a
1275 // full double expm1 implementation. It is only accurate to 47 bits or less.
1276
1277 // early out for small numbers and NaNs
1278 if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 24))) return x;
1279
1280 // early out for large negative numbers
1281 if (x < -130.0) return -1.0;
1282
1283 // early out for large positive numbers
1284 if (x > 100.0) return INFINITY;
1285
1286 // separate x into fractional and integer parts
1287 double i = reference_rint(x); // round to nearest integer
1288 double f = x - i; // -0.5 <= f <= 0.5
1289
1290 // reduce f to the range -0.0625 .. f.. 0.0625
1291 int index = (int)(f * 16.0) + 8; // 0...16
1292
1293 static const double reduction[17] = { -0.5, -0.4375, -0.375, -0.3125,
1294 -0.25, -0.1875, -0.125, -0.0625,
1295 0.0, +0.0625, +0.125, +0.1875,
1296 +0.25, +0.3125, +0.375, +0.4375,
1297 +0.5 };
1298
1299
1300 // exponentials[i] = expm1(reduction[i])
1301 static const double exponentials[17] = {
1302 HEX_DBL(-, 1, 92e9a0720d3ec, -, 2),
1303 HEX_DBL(-, 1, 6adb1cd9205ee, -, 2),
1304 HEX_DBL(-, 1, 40373d42ce2e3, -, 2),
1305 HEX_DBL(-, 1, 12d35a41ba104, -, 2),
1306 HEX_DBL(-, 1, c5041854df7d4, -, 3),
1307 HEX_DBL(-, 1, 5e25fb4fde211, -, 3),
1308 HEX_DBL(-, 1, e14aed893eef4, -, 4),
1309 HEX_DBL(-, 1, f0540438fd5c3, -, 5),
1310 HEX_DBL(+, 0, 0, +, 0),
1311 HEX_DBL(+, 1, 082b577d34ed8, -, 4),
1312 HEX_DBL(+, 1, 10b022db7ae68, -, 3),
1313 HEX_DBL(+, 1, a65c0b85ac1a9, -, 3),
1314 HEX_DBL(+, 1, 22d78f0fa061a, -, 2),
1315 HEX_DBL(+, 1, 77a45d8117fd5, -, 2),
1316 HEX_DBL(+, 1, d1e944f6fbdaa, -, 2),
1317 HEX_DBL(+, 1, 190048ef6002, -, 1),
1318 HEX_DBL(+, 1, 4c2531c3c0d38, -, 1),
1319 };
1320
1321
1322 f -= reduction[index];
1323
1324 // find expm1(f)
1325 // calculate as p(f) = (exp(f)-1)/f
1326 // expm1(f) = f * p(f)
1327 // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over
1328 // the range +- 0.0625
1329 double p = 0.999999999999998001599
1330 + (0.499999999999839628284
1331 + (0.166666666672817459505
1332 + (0.041666666612283048687
1333 + (0.008333330214567431435
1334 + (0.001389005319303770070 + 0.000198833381525156667 * f)
1335 * f)
1336 * f)
1337 * f)
1338 * f)
1339 * f;
1340 f *= p; // expm1( reduced f )
1341
1342 // expm1(f) = (exmp1( reduced_f) + 1.0) * ( exponentials[index] + 1 ) - 1
1343 // = exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) +
1344 // exponentials[index] + 1 -1 = exmp1( reduced_f) *
1345 // exponentials[index] + exmp1( reduced_f) + exponentials[index]
1346 f += exponentials[index] + f * exponentials[index];
1347
1348 // scale by e ** i
1349 int exponent = (int)i;
1350 if (0 == exponent) return f; // precise answer for x near 1
1351
1352 // table of e**(i-150)
1353 static const double exp_table[128 + 150 + 1] = {
1354 HEX_DBL(+, 1, 82e16284f5ec5, -, 217),
1355 HEX_DBL(+, 1, 06e9996332ba1, -, 215),
1356 HEX_DBL(+, 1, 6555cb289e44b, -, 214),
1357 HEX_DBL(+, 1, e5ab364643354, -, 213),
1358 HEX_DBL(+, 1, 4a0bd18e64df7, -, 211),
1359 HEX_DBL(+, 1, c094499cc578e, -, 210),
1360 HEX_DBL(+, 1, 30d759323998c, -, 208),
1361 HEX_DBL(+, 1, 9e5278ab1d4cf, -, 207),
1362 HEX_DBL(+, 1, 198fa3f30be25, -, 205),
1363 HEX_DBL(+, 1, 7eae636d6144e, -, 204),
1364 HEX_DBL(+, 1, 040f1036f4863, -, 202),
1365 HEX_DBL(+, 1, 6174e477a895f, -, 201),
1366 HEX_DBL(+, 1, e065b82dd95a, -, 200),
1367 HEX_DBL(+, 1, 4676be491d129, -, 198),
1368 HEX_DBL(+, 1, bbb5da5f7c823, -, 197),
1369 HEX_DBL(+, 1, 2d884eef5fdcb, -, 195),
1370 HEX_DBL(+, 1, 99d3397ab8371, -, 194),
1371 HEX_DBL(+, 1, 1681497ed15b3, -, 192),
1372 HEX_DBL(+, 1, 7a870f597fdbd, -, 191),
1373 HEX_DBL(+, 1, 013c74edba307, -, 189),
1374 HEX_DBL(+, 1, 5d9ec4ada7938, -, 188),
1375 HEX_DBL(+, 1, db2edfd20fa7c, -, 187),
1376 HEX_DBL(+, 1, 42eb9f39afb0b, -, 185),
1377 HEX_DBL(+, 1, b6e4f282b43f4, -, 184),
1378 HEX_DBL(+, 1, 2a42764857b19, -, 182),
1379 HEX_DBL(+, 1, 9560792d19314, -, 181),
1380 HEX_DBL(+, 1, 137b6ce8e052c, -, 179),
1381 HEX_DBL(+, 1, 766b45dd84f18, -, 178),
1382 HEX_DBL(+, 1, fce362fe6e7d, -, 177),
1383 HEX_DBL(+, 1, 59d34dd8a5473, -, 175),
1384 HEX_DBL(+, 1, d606847fc727a, -, 174),
1385 HEX_DBL(+, 1, 3f6a58b795de3, -, 172),
1386 HEX_DBL(+, 1, b2216c6efdac1, -, 171),
1387 HEX_DBL(+, 1, 2705b5b153fb8, -, 169),
1388 HEX_DBL(+, 1, 90fa1509bd50d, -, 168),
1389 HEX_DBL(+, 1, 107df698da211, -, 166),
1390 HEX_DBL(+, 1, 725ae6e7b9d35, -, 165),
1391 HEX_DBL(+, 1, f75d6040aeff6, -, 164),
1392 HEX_DBL(+, 1, 56126259e093c, -, 162),
1393 HEX_DBL(+, 1, d0ec7df4f7bd4, -, 161),
1394 HEX_DBL(+, 1, 3bf2cf6722e46, -, 159),
1395 HEX_DBL(+, 1, ad6b22f55db42, -, 158),
1396 HEX_DBL(+, 1, 23d1f3e5834a, -, 156),
1397 HEX_DBL(+, 1, 8c9feab89b876, -, 155),
1398 HEX_DBL(+, 1, 0d88cf37f00dd, -, 153),
1399 HEX_DBL(+, 1, 6e55d2bf838a7, -, 152),
1400 HEX_DBL(+, 1, f1e6b68529e33, -, 151),
1401 HEX_DBL(+, 1, 525be4e4e601d, -, 149),
1402 HEX_DBL(+, 1, cbe0a45f75eb1, -, 148),
1403 HEX_DBL(+, 1, 3884e838aea68, -, 146),
1404 HEX_DBL(+, 1, a8c1f14e2af5d, -, 145),
1405 HEX_DBL(+, 1, 20a717e64a9bd, -, 143),
1406 HEX_DBL(+, 1, 8851d84118908, -, 142),
1407 HEX_DBL(+, 1, 0a9bdfb02d24, -, 140),
1408 HEX_DBL(+, 1, 6a5bea046b42e, -, 139),
1409 HEX_DBL(+, 1, ec7f3b269efa8, -, 138),
1410 HEX_DBL(+, 1, 4eafb87eab0f2, -, 136),
1411 HEX_DBL(+, 1, c6e2d05bbc, -, 135),
1412 HEX_DBL(+, 1, 35208867c2683, -, 133),
1413 HEX_DBL(+, 1, a425b317eeacd, -, 132),
1414 HEX_DBL(+, 1, 1d8508fa8246a, -, 130),
1415 HEX_DBL(+, 1, 840fbc08fdc8a, -, 129),
1416 HEX_DBL(+, 1, 07b7112bc1ffe, -, 127),
1417 HEX_DBL(+, 1, 666d0dad2961d, -, 126),
1418 HEX_DBL(+, 1, e726c3f64d0fe, -, 125),
1419 HEX_DBL(+, 1, 4b0dc07cabf98, -, 123),
1420 HEX_DBL(+, 1, c1f2daf3b6a46, -, 122),
1421 HEX_DBL(+, 1, 31c5957a47de2, -, 120),
1422 HEX_DBL(+, 1, 9f96445648b9f, -, 119),
1423 HEX_DBL(+, 1, 1a6baeadb4fd1, -, 117),
1424 HEX_DBL(+, 1, 7fd974d372e45, -, 116),
1425 HEX_DBL(+, 1, 04da4d1452919, -, 114),
1426 HEX_DBL(+, 1, 62891f06b345, -, 113),
1427 HEX_DBL(+, 1, e1dd273aa8a4a, -, 112),
1428 HEX_DBL(+, 1, 4775e0840bfdd, -, 110),
1429 HEX_DBL(+, 1, bd109d9d94bda, -, 109),
1430 HEX_DBL(+, 1, 2e73f53fba844, -, 107),
1431 HEX_DBL(+, 1, 9b138170d6bfe, -, 106),
1432 HEX_DBL(+, 1, 175af0cf60ec5, -, 104),
1433 HEX_DBL(+, 1, 7baee1bffa80b, -, 103),
1434 HEX_DBL(+, 1, 02057d1245ceb, -, 101),
1435 HEX_DBL(+, 1, 5eafffb34ba31, -, 100),
1436 HEX_DBL(+, 1, dca23bae16424, -, 99),
1437 HEX_DBL(+, 1, 43e7fc88b8056, -, 97),
1438 HEX_DBL(+, 1, b83bf23a9a9eb, -, 96),
1439 HEX_DBL(+, 1, 2b2b8dd05b318, -, 94),
1440 HEX_DBL(+, 1, 969d47321e4cc, -, 93),
1441 HEX_DBL(+, 1, 1452b7723aed2, -, 91),
1442 HEX_DBL(+, 1, 778fe2497184c, -, 90),
1443 HEX_DBL(+, 1, fe7116182e9cc, -, 89),
1444 HEX_DBL(+, 1, 5ae191a99585a, -, 87),
1445 HEX_DBL(+, 1, d775d87da854d, -, 86),
1446 HEX_DBL(+, 1, 4063f8cc8bb98, -, 84),
1447 HEX_DBL(+, 1, b374b315f87c1, -, 83),
1448 HEX_DBL(+, 1, 27ec458c65e3c, -, 81),
1449 HEX_DBL(+, 1, 923372c67a074, -, 80),
1450 HEX_DBL(+, 1, 1152eaeb73c08, -, 78),
1451 HEX_DBL(+, 1, 737c5645114b5, -, 77),
1452 HEX_DBL(+, 1, f8e6c24b5592e, -, 76),
1453 HEX_DBL(+, 1, 571db733a9d61, -, 74),
1454 HEX_DBL(+, 1, d257d547e083f, -, 73),
1455 HEX_DBL(+, 1, 3ce9b9de78f85, -, 71),
1456 HEX_DBL(+, 1, aebabae3a41b5, -, 70),
1457 HEX_DBL(+, 1, 24b6031b49bda, -, 68),
1458 HEX_DBL(+, 1, 8dd5e1bb09d7e, -, 67),
1459 HEX_DBL(+, 1, 0e5b73d1ff53d, -, 65),
1460 HEX_DBL(+, 1, 6f741de1748ec, -, 64),
1461 HEX_DBL(+, 1, f36bd37f42f3e, -, 63),
1462 HEX_DBL(+, 1, 536452ee2f75c, -, 61),
1463 HEX_DBL(+, 1, cd480a1b7482, -, 60),
1464 HEX_DBL(+, 1, 39792499b1a24, -, 58),
1465 HEX_DBL(+, 1, aa0de4bf35b38, -, 57),
1466 HEX_DBL(+, 1, 2188ad6ae3303, -, 55),
1467 HEX_DBL(+, 1, 898471fca6055, -, 54),
1468 HEX_DBL(+, 1, 0b6c3afdde064, -, 52),
1469 HEX_DBL(+, 1, 6b7719a59f0e, -, 51),
1470 HEX_DBL(+, 1, ee001eed62aa, -, 50),
1471 HEX_DBL(+, 1, 4fb547c775da8, -, 48),
1472 HEX_DBL(+, 1, c8464f7616468, -, 47),
1473 HEX_DBL(+, 1, 36121e24d3bba, -, 45),
1474 HEX_DBL(+, 1, a56e0c2ac7f75, -, 44),
1475 HEX_DBL(+, 1, 1e642baeb84a, -, 42),
1476 HEX_DBL(+, 1, 853f01d6d53ba, -, 41),
1477 HEX_DBL(+, 1, 0885298767e9a, -, 39),
1478 HEX_DBL(+, 1, 67852a7007e42, -, 38),
1479 HEX_DBL(+, 1, e8a37a45fc32e, -, 37),
1480 HEX_DBL(+, 1, 4c1078fe9228a, -, 35),
1481 HEX_DBL(+, 1, c3527e433fab1, -, 34),
1482 HEX_DBL(+, 1, 32b48bf117da2, -, 32),
1483 HEX_DBL(+, 1, a0db0d0ddb3ec, -, 31),
1484 HEX_DBL(+, 1, 1b48655f37267, -, 29),
1485 HEX_DBL(+, 1, 81056ff2c5772, -, 28),
1486 HEX_DBL(+, 1, 05a628c699fa1, -, 26),
1487 HEX_DBL(+, 1, 639e3175a689d, -, 25),
1488 HEX_DBL(+, 1, e355bbaee85cb, -, 24),
1489 HEX_DBL(+, 1, 4875ca227ec38, -, 22),
1490 HEX_DBL(+, 1, be6c6fdb01612, -, 21),
1491 HEX_DBL(+, 1, 2f6053b981d98, -, 19),
1492 HEX_DBL(+, 1, 9c54c3b43bc8b, -, 18),
1493 HEX_DBL(+, 1, 18354238f6764, -, 16),
1494 HEX_DBL(+, 1, 7cd79b5647c9b, -, 15),
1495 HEX_DBL(+, 1, 02cf22526545a, -, 13),
1496 HEX_DBL(+, 1, 5fc21041027ad, -, 12),
1497 HEX_DBL(+, 1, de16b9c24a98f, -, 11),
1498 HEX_DBL(+, 1, 44e51f113d4d6, -, 9),
1499 HEX_DBL(+, 1, b993fe00d5376, -, 8),
1500 HEX_DBL(+, 1, 2c155b8213cf4, -, 6),
1501 HEX_DBL(+, 1, 97db0ccceb0af, -, 5),
1502 HEX_DBL(+, 1, 152aaa3bf81cc, -, 3),
1503 HEX_DBL(+, 1, 78b56362cef38, -, 2),
1504 HEX_DBL(+, 1, 0, +, 0),
1505 HEX_DBL(+, 1, 5bf0a8b145769, +, 1),
1506 HEX_DBL(+, 1, d8e64b8d4ddae, +, 2),
1507 HEX_DBL(+, 1, 415e5bf6fb106, +, 4),
1508 HEX_DBL(+, 1, b4c902e273a58, +, 5),
1509 HEX_DBL(+, 1, 28d389970338f, +, 7),
1510 HEX_DBL(+, 1, 936dc5690c08f, +, 8),
1511 HEX_DBL(+, 1, 122885aaeddaa, +, 10),
1512 HEX_DBL(+, 1, 749ea7d470c6e, +, 11),
1513 HEX_DBL(+, 1, fa7157c470f82, +, 12),
1514 HEX_DBL(+, 1, 5829dcf95056, +, 14),
1515 HEX_DBL(+, 1, d3c4488ee4f7f, +, 15),
1516 HEX_DBL(+, 1, 3de1654d37c9a, +, 17),
1517 HEX_DBL(+, 1, b00b5916ac955, +, 18),
1518 HEX_DBL(+, 1, 259ac48bf05d7, +, 20),
1519 HEX_DBL(+, 1, 8f0ccafad2a87, +, 21),
1520 HEX_DBL(+, 1, 0f2ebd0a8002, +, 23),
1521 HEX_DBL(+, 1, 709348c0ea4f9, +, 24),
1522 HEX_DBL(+, 1, f4f22091940bd, +, 25),
1523 HEX_DBL(+, 1, 546d8f9ed26e1, +, 27),
1524 HEX_DBL(+, 1, ceb088b68e804, +, 28),
1525 HEX_DBL(+, 1, 3a6e1fd9eecfd, +, 30),
1526 HEX_DBL(+, 1, ab5adb9c436, +, 31),
1527 HEX_DBL(+, 1, 226af33b1fdc1, +, 33),
1528 HEX_DBL(+, 1, 8ab7fb5475fb7, +, 34),
1529 HEX_DBL(+, 1, 0c3d3920962c9, +, 36),
1530 HEX_DBL(+, 1, 6c932696a6b5d, +, 37),
1531 HEX_DBL(+, 1, ef822f7f6731d, +, 38),
1532 HEX_DBL(+, 1, 50bba3796379a, +, 40),
1533 HEX_DBL(+, 1, c9aae4631c056, +, 41),
1534 HEX_DBL(+, 1, 370470aec28ed, +, 43),
1535 HEX_DBL(+, 1, a6b765d8cdf6d, +, 44),
1536 HEX_DBL(+, 1, 1f43fcc4b662c, +, 46),
1537 HEX_DBL(+, 1, 866f34a725782, +, 47),
1538 HEX_DBL(+, 1, 0953e2f3a1ef7, +, 49),
1539 HEX_DBL(+, 1, 689e221bc8d5b, +, 50),
1540 HEX_DBL(+, 1, ea215a1d20d76, +, 51),
1541 HEX_DBL(+, 1, 4d13fbb1a001a, +, 53),
1542 HEX_DBL(+, 1, c4b334617cc67, +, 54),
1543 HEX_DBL(+, 1, 33a43d282a519, +, 56),
1544 HEX_DBL(+, 1, a220d397972eb, +, 57),
1545 HEX_DBL(+, 1, 1c25c88df6862, +, 59),
1546 HEX_DBL(+, 1, 8232558201159, +, 60),
1547 HEX_DBL(+, 1, 0672a3c9eb871, +, 62),
1548 HEX_DBL(+, 1, 64b41c6d37832, +, 63),
1549 HEX_DBL(+, 1, e4cf766fe49be, +, 64),
1550 HEX_DBL(+, 1, 49767bc0483e3, +, 66),
1551 HEX_DBL(+, 1, bfc951eb8bb76, +, 67),
1552 HEX_DBL(+, 1, 304d6aeca254b, +, 69),
1553 HEX_DBL(+, 1, 9d97010884251, +, 70),
1554 HEX_DBL(+, 1, 19103e4080b45, +, 72),
1555 HEX_DBL(+, 1, 7e013cd114461, +, 73),
1556 HEX_DBL(+, 1, 03996528e074c, +, 75),
1557 HEX_DBL(+, 1, 60d4f6fdac731, +, 76),
1558 HEX_DBL(+, 1, df8c5af17ba3b, +, 77),
1559 HEX_DBL(+, 1, 45e3076d61699, +, 79),
1560 HEX_DBL(+, 1, baed16a6e0da7, +, 80),
1561 HEX_DBL(+, 1, 2cffdfebde1a1, +, 82),
1562 HEX_DBL(+, 1, 9919cabefcb69, +, 83),
1563 HEX_DBL(+, 1, 160345c9953e3, +, 85),
1564 HEX_DBL(+, 1, 79dbc9dc53c66, +, 86),
1565 HEX_DBL(+, 1, 00c810d464097, +, 88),
1566 HEX_DBL(+, 1, 5d009394c5c27, +, 89),
1567 HEX_DBL(+, 1, da57de8f107a8, +, 90),
1568 HEX_DBL(+, 1, 425982cf597cd, +, 92),
1569 HEX_DBL(+, 1, b61e5ca3a5e31, +, 93),
1570 HEX_DBL(+, 1, 29bb825dfcf87, +, 95),
1571 HEX_DBL(+, 1, 94a90db0d6fe2, +, 96),
1572 HEX_DBL(+, 1, 12fec759586fd, +, 98),
1573 HEX_DBL(+, 1, 75c1dc469e3af, +, 99),
1574 HEX_DBL(+, 1, fbfd219c43b04, +, 100),
1575 HEX_DBL(+, 1, 5936d44e1a146, +, 102),
1576 HEX_DBL(+, 1, d531d8a7ee79c, +, 103),
1577 HEX_DBL(+, 1, 3ed9d24a2d51b, +, 105),
1578 HEX_DBL(+, 1, b15cfe5b6e17b, +, 106),
1579 HEX_DBL(+, 1, 268038c2c0e, +, 108),
1580 HEX_DBL(+, 1, 9044a73545d48, +, 109),
1581 HEX_DBL(+, 1, 1002ab6218b38, +, 111),
1582 HEX_DBL(+, 1, 71b3540cbf921, +, 112),
1583 HEX_DBL(+, 1, f6799ea9c414a, +, 113),
1584 HEX_DBL(+, 1, 55779b984f3eb, +, 115),
1585 HEX_DBL(+, 1, d01a210c44aa4, +, 116),
1586 HEX_DBL(+, 1, 3b63da8e9121, +, 118),
1587 HEX_DBL(+, 1, aca8d6b0116b8, +, 119),
1588 HEX_DBL(+, 1, 234de9e0c74e9, +, 121),
1589 HEX_DBL(+, 1, 8bec7503ca477, +, 122),
1590 HEX_DBL(+, 1, 0d0eda9796b9, +, 124),
1591 HEX_DBL(+, 1, 6db0118477245, +, 125),
1592 HEX_DBL(+, 1, f1056dc7bf22d, +, 126),
1593 HEX_DBL(+, 1, 51c2cc3433801, +, 128),
1594 HEX_DBL(+, 1, cb108ffbec164, +, 129),
1595 HEX_DBL(+, 1, 37f780991b584, +, 131),
1596 HEX_DBL(+, 1, a801c0ea8ac4d, +, 132),
1597 HEX_DBL(+, 1, 20247cc4c46c1, +, 134),
1598 HEX_DBL(+, 1, 87a0553328015, +, 135),
1599 HEX_DBL(+, 1, 0a233dee4f9bb, +, 137),
1600 HEX_DBL(+, 1, 69b7f55b808ba, +, 138),
1601 HEX_DBL(+, 1, eba064644060a, +, 139),
1602 HEX_DBL(+, 1, 4e184933d9364, +, 141),
1603 HEX_DBL(+, 1, c614fe2531841, +, 142),
1604 HEX_DBL(+, 1, 3494a9b171bf5, +, 144),
1605 HEX_DBL(+, 1, a36798b9d969b, +, 145),
1606 HEX_DBL(+, 1, 1d03d8c0c04af, +, 147),
1607 HEX_DBL(+, 1, 836026385c974, +, 148),
1608 HEX_DBL(+, 1, 073fbe9ac901d, +, 150),
1609 HEX_DBL(+, 1, 65cae0969f286, +, 151),
1610 HEX_DBL(+, 1, e64a58639cae8, +, 152),
1611 HEX_DBL(+, 1, 4a77f5f9b50f9, +, 154),
1612 HEX_DBL(+, 1, c12744a3a28e3, +, 155),
1613 HEX_DBL(+, 1, 313b3b6978e85, +, 157),
1614 HEX_DBL(+, 1, 9eda3a31e587e, +, 158),
1615 HEX_DBL(+, 1, 19ebe56b56453, +, 160),
1616 HEX_DBL(+, 1, 7f2bc6e599b7e, +, 161),
1617 HEX_DBL(+, 1, 04644610df2ff, +, 163),
1618 HEX_DBL(+, 1, 61e8b490ac4e6, +, 164),
1619 HEX_DBL(+, 1, e103201f299b3, +, 165),
1620 HEX_DBL(+, 1, 46e1b637beaf5, +, 167),
1621 HEX_DBL(+, 1, bc473cfede104, +, 168),
1622 HEX_DBL(+, 1, 2deb1b9c85e2d, +, 170),
1623 HEX_DBL(+, 1, 9a5981ca67d1, +, 171),
1624 HEX_DBL(+, 1, 16dc8a9ef670b, +, 173),
1625 HEX_DBL(+, 1, 7b03166942309, +, 174),
1626 HEX_DBL(+, 1, 0190be03150a7, +, 176),
1627 HEX_DBL(+, 1, 5e1152f9a8119, +, 177),
1628 HEX_DBL(+, 1, dbca9263f8487, +, 178),
1629 HEX_DBL(+, 1, 43556dee93bee, +, 180),
1630 HEX_DBL(+, 1, b774c12967dfa, +, 181),
1631 HEX_DBL(+, 1, 2aa4306e922c2, +, 183),
1632 HEX_DBL(+, 1, 95e54c5dd4217, +, 184)
1633 };
1634
1635 // scale by e**i -- (expm1(f) + 1)*e**i - 1 = expm1(f) * e**i + e**i - 1 =
1636 // e**i
1637 return exp_table[exponent + 150] + (f * exp_table[exponent + 150] - 1.0);
1638 }
1639
1640
reference_fmax(double x,double y)1641 double reference_fmax(double x, double y)
1642 {
1643 if (isnan(y)) return x;
1644
1645 return x >= y ? x : y;
1646 }
1647
reference_fmin(double x,double y)1648 double reference_fmin(double x, double y)
1649 {
1650 if (isnan(y)) return x;
1651
1652 return x <= y ? x : y;
1653 }
1654
reference_hypot(double x,double y)1655 double reference_hypot(double x, double y)
1656 {
1657 // Since the inputs are actually floats, we don't have to worry about range
1658 // here
1659 if (isinf(x) || isinf(y)) return INFINITY;
1660
1661 return sqrt(x * x + y * y);
1662 }
1663
reference_ilogbl(long double x)1664 int reference_ilogbl(long double x)
1665 {
1666 extern int gDeviceILogb0, gDeviceILogbNaN;
1667
1668 // Since we are just using this to verify double precision, we can
1669 // use the double precision ilogb here
1670 union {
1671 double f;
1672 cl_ulong u;
1673 } u;
1674 u.f = (double)x;
1675
1676 int exponent = (int)(u.u >> 52) & 0x7ff;
1677 if (exponent == 0x7ff)
1678 {
1679 if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
1680
1681 return CL_INT_MAX;
1682 }
1683
1684 if (exponent == 0)
1685 { // deal with denormals
1686 u.f = x * HEX_DBL(+, 1, 0, +, 64);
1687 exponent = (cl_uint)(u.u >> 52) & 0x7ff;
1688 if (exponent == 0) return gDeviceILogb0;
1689
1690 exponent -= 1023 + 64;
1691 return exponent;
1692 }
1693
1694 return exponent - 1023;
1695 }
1696
reference_relaxed_log2(double x)1697 double reference_relaxed_log2(double x) { return reference_log2(x); }
1698
reference_log2(double x)1699 double reference_log2(double x)
1700 {
1701 if (isnan(x) || x < 0.0 || x == -INFINITY) return cl_make_nan();
1702
1703 if (x == 0.0f) return -INFINITY;
1704
1705 if (x == INFINITY) return INFINITY;
1706
1707 double hi, lo;
1708 __log2_ep(&hi, &lo, x);
1709 return hi;
1710 }
1711
reference_log1p(double x)1712 double reference_log1p(double x)
1713 { // This function is suitable only for verifying log1pf(). It produces several
1714 // double precision ulps of error.
1715
1716 // Handle small and NaN
1717 if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 53))) return x;
1718
1719 // deal with special values
1720 if (x <= -1.0)
1721 {
1722 if (x < -1.0) return cl_make_nan();
1723 return -INFINITY;
1724 }
1725
1726 // infinity
1727 if (x == INFINITY) return INFINITY;
1728
1729 // High precision result for when near 0, to avoid problems with the
1730 // reference result falling in the wrong binade.
1731 if (reference_fabs(x) < HEX_DBL(+, 1, 0, -, 28)) return (1.0 - 0.5 * x) * x;
1732
1733 // Our polynomial is only good in the region +-2**-4.
1734 // If we aren't in that range then we need to reduce to be in that range
1735 double correctionLo =
1736 -0.0; // correction down stream to compensate for the reduction, if any
1737 double correctionHi =
1738 -0.0; // correction down stream to compensate for the exponent, if any
1739 if (reference_fabs(x) > HEX_DBL(+, 1, 0, -, 4))
1740 {
1741 x += 1.0; // double should cover any loss of precision here
1742
1743 // separate x into (1+f) * 2**i
1744 union {
1745 double d;
1746 cl_ulong u;
1747 } u;
1748 u.d = x;
1749 int i = (int)((u.u >> 52) & 0x7ff) - 1023;
1750 u.u &= 0x000fffffffffffffULL;
1751 int index = (int)(u.u >> 48);
1752 u.u |= 0x3ff0000000000000ULL;
1753 double f = u.d;
1754
1755 // further reduce f to be within 1/16 of 1.0
1756 static const double scale_table[16] = {
1757 1.0,
1758 HEX_DBL(+, 1, d2d2d2d6e3f79, -, 1),
1759 HEX_DBL(+, 1, b8e38e42737a1, -, 1),
1760 HEX_DBL(+, 1, a1af28711adf3, -, 1),
1761 HEX_DBL(+, 1, 8cccccd88dd65, -, 1),
1762 HEX_DBL(+, 1, 79e79e810ec8f, -, 1),
1763 HEX_DBL(+, 1, 68ba2e94df404, -, 1),
1764 HEX_DBL(+, 1, 590b216defb29, -, 1),
1765 HEX_DBL(+, 1, 4aaaaab1500ed, -, 1),
1766 HEX_DBL(+, 1, 3d70a3e0d6f73, -, 1),
1767 HEX_DBL(+, 1, 313b13bb39f4f, -, 1),
1768 HEX_DBL(+, 1, 25ed09823f1cc, -, 1),
1769 HEX_DBL(+, 1, 1b6db6e77457b, -, 1),
1770 HEX_DBL(+, 1, 11a7b96a3a34f, -, 1),
1771 HEX_DBL(+, 1, 0888888e46fea, -, 1),
1772 HEX_DBL(+, 1, 00000038e9862, -, 1)
1773 };
1774
1775 // correction_table[i] = -log( scale_table[i] )
1776 // All entries have >= 64 bits of precision (rather than the expected
1777 // 53)
1778 static const double correction_table[16] = {
1779 -0.0,
1780 HEX_DBL(+, 1, 7a5c722c16058, -, 4),
1781 HEX_DBL(+, 1, 323db16c89ab1, -, 3),
1782 HEX_DBL(+, 1, a0f87d180629, -, 3),
1783 HEX_DBL(+, 1, 050279324e17c, -, 2),
1784 HEX_DBL(+, 1, 36f885bb270b0, -, 2),
1785 HEX_DBL(+, 1, 669b771b5cc69, -, 2),
1786 HEX_DBL(+, 1, 94203a6292a05, -, 2),
1787 HEX_DBL(+, 1, bfb4f9cb333a4, -, 2),
1788 HEX_DBL(+, 1, e982376ddb80e, -, 2),
1789 HEX_DBL(+, 1, 08d5d8769b2b2, -, 1),
1790 HEX_DBL(+, 1, 1c288bc00e0cf, -, 1),
1791 HEX_DBL(+, 1, 2ec7535b31ecb, -, 1),
1792 HEX_DBL(+, 1, 40bed0adc63fb, -, 1),
1793 HEX_DBL(+, 1, 521a5c0330615, -, 1),
1794 HEX_DBL(+, 1, 62e42f7dd092c, -, 1)
1795 };
1796
1797 f *= scale_table[index];
1798 correctionLo = correction_table[index];
1799
1800 // log( 2**(i) ) = i * log(2)
1801 correctionHi = (double)i * 0.693147180559945309417232121458176568;
1802
1803 x = f - 1.0;
1804 }
1805
1806
1807 // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x =
1808 // [-1/16, 1/16]
1809 // max error HEX_DBL( +, 1, 048f61f9a5eca, -, 52 )
1810 double p = HEX_DBL(-, 1, cc33de97a9d7b, -, 46)
1811 + (HEX_DBL(-, 1, fffffffff3eb7, -, 2)
1812 + (HEX_DBL(+, 1, 5555555633ef7, -, 2)
1813 + (HEX_DBL(-, 1, 00000062c78, -, 2)
1814 + (HEX_DBL(+, 1, 9999958a3321, -, 3)
1815 + (HEX_DBL(-, 1, 55534ce65c347, -, 3)
1816 + (HEX_DBL(+, 1, 24957208391a5, -, 3)
1817 + (HEX_DBL(-, 1, 02287b9a5b4a1, -, 3)
1818 + HEX_DBL(+, 1, c757d922180ed, -, 4) * x)
1819 * x)
1820 * x)
1821 * x)
1822 * x)
1823 * x)
1824 * x)
1825 * x;
1826
1827 // log(x+1) = x * p(x) + x
1828 x += x * p;
1829
1830 return correctionHi + (correctionLo + x);
1831 }
1832
reference_logb(double x)1833 double reference_logb(double x)
1834 {
1835 union {
1836 float f;
1837 cl_uint u;
1838 } u;
1839 u.f = (float)x;
1840
1841 cl_int exponent = (u.u >> 23) & 0xff;
1842 if (exponent == 0xff) return x * x;
1843
1844 if (exponent == 0)
1845 { // deal with denormals
1846 u.u = (u.u & 0x007fffff) | 0x3f800000;
1847 u.f -= 1.0f;
1848 exponent = (u.u >> 23) & 0xff;
1849 if (exponent == 0) return -INFINITY;
1850
1851 return exponent - (127 + 126);
1852 }
1853
1854 return exponent - 127;
1855 }
1856
reference_relaxed_reciprocal(double x)1857 double reference_relaxed_reciprocal(double x) { return 1.0f / ((float)x); }
1858
reference_reciprocal(double x)1859 double reference_reciprocal(double x) { return 1.0 / x; }
1860
reference_remainder(double x,double y)1861 double reference_remainder(double x, double y)
1862 {
1863 int i;
1864 return reference_remquo(x, y, &i);
1865 }
1866
reference_lgamma(double x)1867 double reference_lgamma(double x)
1868 {
1869 /*
1870 * ====================================================
1871 * This function is from fdlibm. http://www.netlib.org
1872 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1873 *
1874 * Developed at SunSoft, a Sun Microsystems, Inc. business.
1875 * Permission to use, copy, modify, and distribute this
1876 * software is freely granted, provided that this notice
1877 * is preserved.
1878 * ====================================================
1879 *
1880 */
1881
1882 static const double // two52 = 4.50359962737049600000e+15, /* 0x43300000,
1883 // 0x00000000 */
1884 half = 5.00000000000000000000e-01, /* 0x3FE00000,
1885 0x00000000 */
1886 one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
1887 pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
1888 a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
1889 a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
1890 a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
1891 a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
1892 a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
1893 a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
1894 a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
1895 a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
1896 a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
1897 a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
1898 a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
1899 a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
1900 tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
1901 tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
1902 /* tt = -(tail of tf) */
1903 tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
1904 t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
1905 t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
1906 t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
1907 t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
1908 t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
1909 t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
1910 t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
1911 t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
1912 t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
1913 t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
1914 t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
1915 t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
1916 t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
1917 t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
1918 t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
1919 u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
1920 u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
1921 u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
1922 u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
1923 u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
1924 u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
1925 v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
1926 v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
1927 v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
1928 v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
1929 v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
1930 s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
1931 s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
1932 s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
1933 s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
1934 s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
1935 s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
1936 s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
1937 r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
1938 r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
1939 r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
1940 r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
1941 r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
1942 r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
1943 w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
1944 w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
1945 w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
1946 w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
1947 w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
1948 w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
1949 w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
1950
1951 static const double zero = 0.00000000000000000000e+00;
1952 double nadj = zero;
1953 double t, y, z, p, p1, p2, p3, q, r, w;
1954 cl_int i, hx, lx, ix;
1955
1956 union {
1957 double d;
1958 cl_ulong u;
1959 } u;
1960 u.d = x;
1961
1962 hx = (cl_int)(u.u >> 32);
1963 lx = (cl_int)(u.u & 0xffffffffULL);
1964
1965 /* purge off +-inf, NaN, +-0, and negative arguments */
1966 // *signgamp = 1;
1967 ix = hx & 0x7fffffff;
1968 if (ix >= 0x7ff00000) return x * x;
1969 if ((ix | lx) == 0) return INFINITY;
1970 if (ix < 0x3b900000)
1971 { /* |x|<2**-70, return -log(|x|) */
1972 if (hx < 0)
1973 {
1974 // *signgamp = -1;
1975 return -reference_log(-x);
1976 }
1977 else
1978 return -reference_log(x);
1979 }
1980 if (hx < 0)
1981 {
1982 if (ix >= 0x43300000) /* |x|>=2**52, must be -integer */
1983 return INFINITY;
1984 t = reference_sinpi(x);
1985 if (t == zero) return INFINITY; /* -integer */
1986 nadj = reference_log(pi / reference_fabs(t * x));
1987 // if(t<zero) *signgamp = -1;
1988 x = -x;
1989 }
1990
1991 /* purge off 1 and 2 */
1992 if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0))
1993 r = 0;
1994 /* for x < 2.0 */
1995 else if (ix < 0x40000000)
1996 {
1997 if (ix <= 0x3feccccc)
1998 { /* lgamma(x) = lgamma(x+1)-log(x) */
1999 r = -reference_log(x);
2000 if (ix >= 0x3FE76944)
2001 {
2002 y = 1.0 - x;
2003 i = 0;
2004 }
2005 else if (ix >= 0x3FCDA661)
2006 {
2007 y = x - (tc - one);
2008 i = 1;
2009 }
2010 else
2011 {
2012 y = x;
2013 i = 2;
2014 }
2015 }
2016 else
2017 {
2018 r = zero;
2019 if (ix >= 0x3FFBB4C3)
2020 {
2021 y = 2.0 - x;
2022 i = 0;
2023 } /* [1.7316,2] */
2024 else if (ix >= 0x3FF3B4C4)
2025 {
2026 y = x - tc;
2027 i = 1;
2028 } /* [1.23,1.73] */
2029 else
2030 {
2031 y = x - one;
2032 i = 2;
2033 }
2034 }
2035 switch (i)
2036 {
2037 case 0:
2038 z = y * y;
2039 p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
2040 p2 = z
2041 * (a1
2042 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
2043 p = y * p1 + p2;
2044 r += (p - 0.5 * y);
2045 break;
2046 case 1:
2047 z = y * y;
2048 w = z * y;
2049 p1 = t0
2050 + w
2051 * (t3
2052 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */
2053 p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
2054 p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
2055 p = z * p1 - (tt - w * (p2 + y * p3));
2056 r += (tf + p);
2057 break;
2058 case 2:
2059 p1 = y
2060 * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
2061 p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
2062 r += (-0.5 * y + p1 / p2);
2063 }
2064 }
2065 else if (ix < 0x40200000)
2066 { /* x < 8.0 */
2067 i = (int)x;
2068 t = zero;
2069 y = x - (double)i;
2070 p = y
2071 * (s0
2072 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
2073 q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
2074 r = half * y + p / q;
2075 z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
2076 switch (i)
2077 {
2078 case 7: z *= (y + 6.0); /* FALLTHRU */
2079 case 6: z *= (y + 5.0); /* FALLTHRU */
2080 case 5: z *= (y + 4.0); /* FALLTHRU */
2081 case 4: z *= (y + 3.0); /* FALLTHRU */
2082 case 3:
2083 z *= (y + 2.0); /* FALLTHRU */
2084 r += reference_log(z);
2085 break;
2086 }
2087 /* 8.0 <= x < 2**58 */
2088 }
2089 else if (ix < 0x43900000)
2090 {
2091 t = reference_log(x);
2092 z = one / x;
2093 y = z * z;
2094 w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
2095 r = (x - half) * (t - one) + w;
2096 }
2097 else
2098 /* 2**58 <= x <= inf */
2099 r = x * (reference_log(x) - one);
2100 if (hx < 0) r = nadj - r;
2101 return r;
2102 }
2103
2104 #endif // _MSC_VER
2105
reference_assignment(double x)2106 double reference_assignment(double x) { return x; }
2107
reference_not(double x)2108 int reference_not(double x)
2109 {
2110 int r = !x;
2111 return r;
2112 }
2113
2114 #pragma mark -
2115 #pragma mark Double testing
2116
2117 #ifndef M_PIL
2118 #define M_PIL \
2119 3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
2120 #endif
2121
2122 static long double reduce1l(long double x);
2123
2124 #ifdef __PPC__
2125 // Since long double on PPC is really extended precision double arithmetic
2126 // consisting of two doubles (a high and low). This form of long double has
2127 // the potential of representing a number with more than LDBL_MANT_DIG digits
2128 // such that reduction algorithm used for other architectures will not work.
2129 // Instead and alternate reduction method is used.
2130
reduce1l(long double x)2131 static long double reduce1l(long double x)
2132 {
2133 union {
2134 long double ld;
2135 double d[2];
2136 } u;
2137
2138 // Reduce the high and low halfs separately.
2139 u.ld = x;
2140 return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
2141 }
2142
2143 #else // !__PPC__
2144
reduce1l(long double x)2145 static long double reduce1l(long double x)
2146 {
2147 static long double unit_exp = 0;
2148 if (0.0L == unit_exp) unit_exp = scalbnl(1.0L, LDBL_MANT_DIG);
2149
2150 if (reference_fabsl(x) >= unit_exp)
2151 {
2152 if (reference_fabsl(x) == INFINITY) return cl_make_nan();
2153
2154 return 0.0L; // we patch up the sign for sinPi and cosPi later, since
2155 // they need different signs
2156 }
2157
2158 // Find the nearest multiple of 2
2159 const long double r = reference_copysignl(unit_exp, x);
2160 long double z = x + r;
2161 z -= r;
2162
2163 // subtract it from x. Value is now in the range -1 <= x <= 1
2164 return x - z;
2165 }
2166 #endif // __PPC__
2167
reference_acospil(long double x)2168 long double reference_acospil(long double x)
2169 {
2170 return reference_acosl(x) / M_PIL;
2171 }
reference_asinpil(long double x)2172 long double reference_asinpil(long double x)
2173 {
2174 return reference_asinl(x) / M_PIL;
2175 }
reference_atanpil(long double x)2176 long double reference_atanpil(long double x)
2177 {
2178 return reference_atanl(x) / M_PIL;
2179 }
reference_atan2pil(long double y,long double x)2180 long double reference_atan2pil(long double y, long double x)
2181 {
2182 return reference_atan2l(y, x) / M_PIL;
2183 }
reference_cospil(long double x)2184 long double reference_cospil(long double x)
2185 {
2186 if (reference_fabsl(x) >= HEX_LDBL(+, 1, 0, +, 54))
2187 {
2188 if (reference_fabsl(x) == INFINITY) return cl_make_nan();
2189
2190 // Note this probably fails for odd values between 0x1.0p52 and
2191 // 0x1.0p53. However, when starting with single precision inputs, there
2192 // will be no odd values.
2193
2194 return 1.0L;
2195 }
2196
2197 x = reduce1l(x);
2198
2199 #if DBL_MANT_DIG >= LDBL_MANT_DIG
2200
2201 // phase adjust
2202 double xhi = 0.0;
2203 double xlo = 0.0;
2204 xhi = (double)x + 0.5;
2205
2206 if (reference_fabsl(x) > 0.5L)
2207 {
2208 xlo = xhi - x;
2209 xlo = 0.5 - xlo;
2210 }
2211 else
2212 {
2213 xlo = xhi - 0.5;
2214 xlo = x - xlo;
2215 }
2216
2217 // reduce to [-0.5, 0.5]
2218 if (xhi < -0.5)
2219 {
2220 xhi = -1.0 - xhi;
2221 xlo = -xlo;
2222 }
2223 else if (xhi > 0.5)
2224 {
2225 xhi = 1.0 - xhi;
2226 xlo = -xlo;
2227 }
2228
2229 // cosPi zeros are all +0
2230 if (xhi == 0.0 && xlo == 0.0) return 0.0;
2231
2232 xhi *= M_PI;
2233 xlo *= M_PI;
2234
2235 xhi += xlo;
2236
2237 return reference_sinl(xhi);
2238
2239 #else
2240 // phase adjust
2241 x += 0.5L;
2242
2243 // reduce to [-0.5, 0.5]
2244 if (x < -0.5L)
2245 x = -1.0L - x;
2246 else if (x > 0.5L)
2247 x = 1.0L - x;
2248
2249 // cosPi zeros are all +0
2250 if (x == 0.0L) return 0.0L;
2251
2252 return reference_sinl(x * M_PIL);
2253 #endif
2254 }
2255
reference_dividel(long double x,long double y)2256 long double reference_dividel(long double x, long double y)
2257 {
2258 double dx = x;
2259 double dy = y;
2260 return dx / dy;
2261 }
2262
2263 struct double_double
2264 {
2265 double hi, lo;
2266 };
2267
2268 // Split doubles_double into a series of consecutive 26-bit precise doubles and
2269 // a remainder. Note for later -- for multiplication, it might be better to
2270 // split each double into a power of two and two 26 bit portions
2271 // multiplication of a double double by a known power of
2272 // two is cheap. The current approach causes some inexact
2273 // arithmetic in mul_dd.
split_dd(double_double x,double_double * hi,double_double * lo)2274 static inline void split_dd(double_double x, double_double *hi,
2275 double_double *lo)
2276 {
2277 union {
2278 double d;
2279 cl_ulong u;
2280 } u;
2281 u.d = x.hi;
2282 u.u &= 0xFFFFFFFFF8000000ULL;
2283 hi->hi = u.d;
2284 x.hi -= u.d;
2285
2286 u.d = x.hi;
2287 u.u &= 0xFFFFFFFFF8000000ULL;
2288 hi->lo = u.d;
2289 x.hi -= u.d;
2290
2291 double temp = x.hi;
2292 x.hi += x.lo;
2293 x.lo -= x.hi - temp;
2294 u.d = x.hi;
2295 u.u &= 0xFFFFFFFFF8000000ULL;
2296 lo->hi = u.d;
2297 x.hi -= u.d;
2298
2299 lo->lo = x.hi + x.lo;
2300 }
2301
accum_d(double_double a,double b)2302 static inline double_double accum_d(double_double a, double b)
2303 {
2304 double temp;
2305 if (fabs(b) > fabs(a.hi))
2306 {
2307 temp = a.hi;
2308 a.hi += b;
2309 a.lo += temp - (a.hi - b);
2310 }
2311 else
2312 {
2313 temp = a.hi;
2314 a.hi += b;
2315 a.lo += b - (a.hi - temp);
2316 }
2317
2318 if (isnan(a.lo)) a.lo = 0.0;
2319
2320 return a;
2321 }
2322
add_dd(double_double a,double_double b)2323 static inline double_double add_dd(double_double a, double_double b)
2324 {
2325 double_double r = { -0.0, -0.0 };
2326
2327 if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi
2328 || 0.0 == b.hi)
2329 {
2330 r.hi = a.hi + b.hi;
2331 r.lo = a.lo + b.lo;
2332 if (isnan(r.lo)) r.lo = 0.0;
2333 return r;
2334 }
2335
2336 // merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|,
2337 // |b.hi| > |b.lo|, so we don't have to do the first merge pass
2338 double terms[4] = { a.hi, b.hi, a.lo, b.lo };
2339 double temp;
2340
2341 // Sort hi terms
2342 if (fabs(terms[0]) < fabs(terms[1]))
2343 {
2344 temp = terms[0];
2345 terms[0] = terms[1];
2346 terms[1] = temp;
2347 }
2348 // sort lo terms
2349 if (fabs(terms[2]) < fabs(terms[3]))
2350 {
2351 temp = terms[2];
2352 terms[2] = terms[3];
2353 terms[3] = temp;
2354 }
2355 // Fix case where small high term is less than large low term
2356 if (fabs(terms[1]) < fabs(terms[2]))
2357 {
2358 temp = terms[1];
2359 terms[1] = terms[2];
2360 terms[2] = temp;
2361 }
2362
2363 // accumulate the results
2364 r.hi = terms[2] + terms[3];
2365 r.lo = terms[3] - (r.hi - terms[2]);
2366
2367 temp = r.hi;
2368 r.hi += terms[1];
2369 r.lo += temp - (r.hi - terms[1]);
2370
2371 temp = r.hi;
2372 r.hi += terms[0];
2373 r.lo += temp - (r.hi - terms[0]);
2374
2375 // canonicalize the result
2376 temp = r.hi;
2377 r.hi += r.lo;
2378 r.lo = r.lo - (r.hi - temp);
2379 if (isnan(r.lo)) r.lo = 0.0;
2380
2381 return r;
2382 }
2383
mul_dd(double_double a,double_double b)2384 static inline double_double mul_dd(double_double a, double_double b)
2385 {
2386 double_double result = { -0.0, -0.0 };
2387
2388 // Inf, nan and 0
2389 if (isnan(a.hi) || isnan(b.hi) || isinf(a.hi) || isinf(b.hi) || 0.0 == a.hi
2390 || 0.0 == b.hi)
2391 {
2392 result.hi = a.hi * b.hi;
2393 return result;
2394 }
2395
2396 double_double ah, al, bh, bl;
2397 split_dd(a, &ah, &al);
2398 split_dd(b, &bh, &bl);
2399
2400 double p0 = ah.hi * bh.hi; // exact (52 bits in product) 0
2401 double p1 = ah.hi * bh.lo; // exact (52 bits in product) 26
2402 double p2 = ah.lo * bh.hi; // exact (52 bits in product) 26
2403 double p3 = ah.lo * bh.lo; // exact (52 bits in product) 52
2404 double p4 = al.hi * bh.hi; // exact (52 bits in product) 52
2405 double p5 = al.hi * bh.lo; // exact (52 bits in product) 78
2406 double p6 = al.lo * bh.hi; // inexact (54 bits in product) 78
2407 double p7 = al.lo * bh.lo; // inexact (54 bits in product) 104
2408 double p8 = ah.hi * bl.hi; // exact (52 bits in product) 52
2409 double p9 = ah.hi * bl.lo; // inexact (54 bits in product) 78
2410 double pA = ah.lo * bl.hi; // exact (52 bits in product) 78
2411 double pB = ah.lo * bl.lo; // inexact (54 bits in product) 104
2412 double pC = al.hi * bl.hi; // exact (52 bits in product) 104
2413 // the last 3 terms are two low to appear in the result
2414
2415
2416 // take advantage of the known relative magnitudes of the partial products
2417 // to avoid some sorting Combine 2**-78 and 2**-104 terms. Here we are a bit
2418 // sloppy about canonicalizing the double_doubles
2419 double_double t0 = { pA, pC };
2420 double_double t1 = { p9, pB };
2421 double_double t2 = { p6, p7 };
2422 double temp0, temp1, temp2;
2423
2424 t0 = accum_d(t0, p5); // there is an extra 2**-78 term to deal with
2425
2426 // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the
2427 // double_doubles
2428 temp0 = t0.hi;
2429 temp1 = t1.hi;
2430 temp2 = t2.hi;
2431 t0.hi += p3;
2432 t1.hi += p4;
2433 t2.hi += p8;
2434 temp0 -= t0.hi - p3;
2435 temp1 -= t1.hi - p4;
2436 temp2 -= t2.hi - p8;
2437 t0.lo += temp0;
2438 t1.lo += temp1;
2439 t2.lo += temp2;
2440
2441 // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the
2442 // double_doubles
2443 temp1 = t1.hi;
2444 temp2 = t2.hi;
2445 t1.hi += p1;
2446 t2.hi += p2;
2447 temp1 -= t1.hi - p1;
2448 temp2 -= t2.hi - p2;
2449 t1.lo += temp1;
2450 t2.lo += temp2;
2451
2452 // Combine accumulators to get the low bits of result
2453 t1 = add_dd(t1, add_dd(t2, t0));
2454
2455 // Add in MSB's, and round to precision
2456 return accum_d(t1, p0); // canonicalizes
2457 }
2458
2459
reference_exp10l(long double z)2460 long double reference_exp10l(long double z)
2461 {
2462 const double_double log2_10 = { HEX_DBL(+, 1, a934f0979a371, +, 1),
2463 HEX_DBL(+, 1, 7f2495fb7fa6d, -, 53) };
2464 double_double x;
2465 int j;
2466
2467 // Handle NaNs
2468 if (isnan(z)) return z;
2469
2470 // init x
2471 x.hi = z;
2472 x.lo = z - x.hi;
2473
2474
2475 // 10**x = exp2( x * log2(10) )
2476
2477 x = mul_dd(x, log2_10); // x * log2(10)
2478
2479 // Deal with overflow and underflow for exp2(x) stage next
2480 if (x.hi >= 1025) return INFINITY;
2481
2482 if (x.hi < -1075 - 24) return +0.0;
2483
2484 // find nearest integer to x
2485 int i = (int)rint(x.hi);
2486
2487 // x now holds fractional part. The result would be then 2**i * exp2( x )
2488 x.hi -= i;
2489
2490 // We could attempt to find a minimax polynomial for exp2(x) over the range
2491 // x = [-0.5, 0.5]. However, this would converge very slowly near the
2492 // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
2493 // requiring something like a 20th order polynomial to get 53 + 24 bits of
2494 // precision. Instead we further reduce the range to [-1/32, 1/32] by
2495 // observing that
2496 //
2497 // 2**(a+b) = 2**a * 2**b
2498 //
2499 // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
2500 // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
2501 // value of n/16 from x.
2502 const double_double corrections[17] = {
2503 { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
2504 HEX_DBL(-, 1, bdd3413b26456, -, 55) },
2505 { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
2506 HEX_DBL(-, 1, 41577ee04992f, -, 56) },
2507 { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
2508 HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
2509 { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
2510 HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
2511 { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
2512 HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
2513 { HEX_DBL(+, 1, c199bdd85529c, -, 1),
2514 HEX_DBL(+, 1, 11065895048dd, -, 56) },
2515 { HEX_DBL(+, 1, d5818dcfba487, -, 1),
2516 HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
2517 { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
2518 HEX_DBL(-, 1, e9c23179c2893, -, 55) },
2519 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
2520 { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
2521 HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
2522 { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
2523 HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
2524 { HEX_DBL(+, 1, 2387a6e756238, +, 0),
2525 HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
2526 { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
2527 HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
2528 { HEX_DBL(+, 1, 3dea64c123422, +, 0),
2529 HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
2530 { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
2531 HEX_DBL(+, 1, d4397afec42e2, -, 56) },
2532 { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
2533 HEX_DBL(+, 1, 6324c054647ad, -, 54) },
2534 { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
2535 HEX_DBL(-, 1, bdd3413b26456, -, 54) }
2536
2537 };
2538 int index = (int)rint(x.hi * 16.0);
2539 x.hi -= (double)index * 0.0625;
2540
2541 // canonicalize x
2542 double temp = x.hi;
2543 x.hi += x.lo;
2544 x.lo -= x.hi - temp;
2545
2546 // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. Max
2547 // Error: 2 * 0x1.e112p-87
2548 const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
2549 HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
2550 { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
2551 HEX_DBL(-, 1, 5e4987a631846, -, 57) },
2552 { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
2553 HEX_DBL(-, 1, d323200a05713, -, 59) },
2554 { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
2555 HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
2556 { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
2557 HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
2558 { HEX_DBL(+, 1, 430912f7e8373, -, 13),
2559 HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
2560 { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
2561 HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
2562 { HEX_DBL(+, 1, 62c150eb16465, -, 20),
2563 HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
2564 { HEX_DBL(+, 1, b52502b5e953, -, 24),
2565 HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
2566 size_t count = sizeof(c) / sizeof(c[0]);
2567
2568 // Do polynomial
2569 double_double r = c[count - 1];
2570 for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
2571
2572 // unwind approximation
2573 r = mul_dd(r, x); // before: r =(exp2(x)-1)/x; after: r = exp2(x) - 1
2574
2575 // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
2576 // exp2(x) = (r + 1) * correction = r * correction + correction
2577 r = mul_dd(r, corrections[index + 8]);
2578 r = add_dd(r, corrections[index + 8]);
2579
2580 // Format result for output:
2581
2582 // Get mantissa
2583 long double m = ((long double)r.hi + (long double)r.lo);
2584
2585 // Handle a pesky overflow cases when long double = double
2586 if (i > 512)
2587 {
2588 m *= HEX_DBL(+, 1, 0, +, 512);
2589 i -= 512;
2590 }
2591 else if (i < -512)
2592 {
2593 m *= HEX_DBL(+, 1, 0, -, 512);
2594 i += 512;
2595 }
2596
2597 return m * ldexpl(1.0L, i);
2598 }
2599
2600
fallback_frexp(double x,int * iptr)2601 static double fallback_frexp(double x, int *iptr)
2602 {
2603 cl_ulong u, v;
2604 double fu, fv;
2605
2606 memcpy(&u, &x, sizeof(u));
2607
2608 cl_ulong exponent = u & 0x7ff0000000000000ULL;
2609 cl_ulong mantissa = u & ~0x7ff0000000000000ULL;
2610
2611 // add 1 to the exponent
2612 exponent += 0x0010000000000000ULL;
2613
2614 if ((cl_long)exponent < (cl_long)0x0020000000000000LL)
2615 { // subnormal, NaN, Inf
2616 mantissa |= 0x3fe0000000000000ULL;
2617
2618 v = mantissa & 0xfff0000000000000ULL;
2619 u = mantissa;
2620 memcpy(&fv, &v, sizeof(v));
2621 memcpy(&fu, &u, sizeof(u));
2622
2623 fu -= fv;
2624
2625 memcpy(&v, &fv, sizeof(v));
2626 memcpy(&u, &fu, sizeof(u));
2627
2628 exponent = u & 0x7ff0000000000000ULL;
2629 mantissa = u & ~0x7ff0000000000000ULL;
2630
2631 *iptr = (exponent >> 52) + (-1022 + 1 - 1022);
2632 u = mantissa | 0x3fe0000000000000ULL;
2633 memcpy(&fu, &u, sizeof(u));
2634 return fu;
2635 }
2636
2637 *iptr = (exponent >> 52) - 1023;
2638 u = mantissa | 0x3fe0000000000000ULL;
2639 memcpy(&fu, &u, sizeof(u));
2640 return fu;
2641 }
2642
2643 // Assumes zeros, infinities and NaNs handed elsewhere
extract(double x,cl_ulong * mant)2644 static inline int extract(double x, cl_ulong *mant)
2645 {
2646 static double (*frexpp)(double, int *) = NULL;
2647 int e;
2648
2649 // verify that frexp works properly
2650 if (NULL == frexpp)
2651 {
2652 if (0.5 == frexp(HEX_DBL(+, 1, 0, -, 1030), &e) && e == -1029)
2653 frexpp = frexp;
2654 else
2655 frexpp = fallback_frexp;
2656 }
2657
2658 *mant = (cl_ulong)(HEX_DBL(+, 1, 0, +, 64) * fabs(frexpp(x, &e)));
2659 return e - 1;
2660 }
2661
2662 // Return 128-bit product of a*b as (hi << 64) + lo
mul128(cl_ulong a,cl_ulong b,cl_ulong * hi,cl_ulong * lo)2663 static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo)
2664 {
2665 cl_ulong alo = a & 0xffffffffULL;
2666 cl_ulong ahi = a >> 32;
2667 cl_ulong blo = b & 0xffffffffULL;
2668 cl_ulong bhi = b >> 32;
2669 cl_ulong aloblo = alo * blo;
2670 cl_ulong alobhi = alo * bhi;
2671 cl_ulong ahiblo = ahi * blo;
2672 cl_ulong ahibhi = ahi * bhi;
2673
2674 alobhi += (aloblo >> 32)
2675 + (ahiblo
2676 & 0xffffffffULL); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1) =
2677 // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
2678 *hi = ahibhi + (alobhi >> 32)
2679 + (ahiblo >> 32); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1) =
2680 // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
2681 *lo = (aloblo & 0xffffffffULL) | (alobhi << 32);
2682 }
2683
round_to_nearest_even_double(cl_ulong hi,cl_ulong lo,int exponent)2684 static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
2685 int exponent)
2686 {
2687 union {
2688 cl_ulong u;
2689 cl_double d;
2690 } u;
2691
2692 // edges
2693 if (exponent > 1023) return INFINITY;
2694 if (exponent == -1075 && (hi | (lo != 0)) > 0x8000000000000000ULL)
2695 return HEX_DBL(+, 1, 0, -, 1074);
2696 if (exponent <= -1075) return 0.0;
2697
2698 // Figure out which bits go where
2699 int shift = 11;
2700 if (exponent < -1022)
2701 {
2702 shift -= 1022 + exponent; // subnormal: shift is not 52
2703 exponent = -1023; // set exponent to 0
2704 }
2705 else
2706 hi &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
2707 // it.
2708
2709 // Assemble the double (round toward zero)
2710 u.u = (hi >> shift) | ((cl_ulong)(exponent + 1023) << 52);
2711
2712 // put a representation of the residual bits into hi
2713 hi <<= (64 - shift);
2714 hi |= lo >> shift;
2715 lo <<= (64 - shift);
2716 hi |= lo != 0;
2717
2718 // round to nearest, ties to even
2719 if (hi < 0x8000000000000000ULL) return u.d;
2720 if (hi == 0x8000000000000000ULL)
2721 u.u += u.u & 1ULL;
2722 else
2723 u.u++;
2724
2725 return u.d;
2726 }
2727
2728 // Shift right. Bits lost on the right will be OR'd together and OR'd with the
2729 // LSB
shift_right_sticky_128(cl_ulong * hi,cl_ulong * lo,int shift)2730 static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift)
2731 {
2732 cl_ulong sticky = 0;
2733 cl_ulong h = *hi;
2734 cl_ulong l = *lo;
2735
2736 if (shift >= 64)
2737 {
2738 shift -= 64;
2739 sticky = 0 != lo;
2740 l = h;
2741 h = 0;
2742 if (shift >= 64)
2743 {
2744 sticky |= (0 != l);
2745 l = 0;
2746 }
2747 else
2748 {
2749 sticky |= (0 != (l << (64 - shift)));
2750 l >>= shift;
2751 }
2752 }
2753 else
2754 {
2755 sticky |= (0 != (l << (64 - shift)));
2756 l >>= shift;
2757 l |= h << (64 - shift);
2758 h >>= shift;
2759 }
2760
2761 *lo = l | sticky;
2762 *hi = h;
2763 }
2764
2765 // 128-bit add of ((*hi << 64) + *lo) + ((chi << 64) + clo)
2766 // If the 129 bit result doesn't fit, bits lost off the right end will be OR'd
2767 // with the LSB
add128(cl_ulong * hi,cl_ulong * lo,cl_ulong chi,cl_ulong clo,int * exponent)2768 static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
2769 cl_ulong clo, int *exponent)
2770 {
2771 cl_ulong carry, carry2;
2772 // extended precision add
2773 clo = add_carry(*lo, clo, &carry);
2774 chi = add_carry(*hi, chi, &carry2);
2775 chi = add_carry(chi, carry, &carry);
2776
2777 // If we overflowed the 128 bit result
2778 if (carry || carry2)
2779 {
2780 carry = clo & 1; // set aside low bit
2781 clo >>= 1; // right shift low 1
2782 clo |= carry; // or back in the low bit, so we don't come to believe
2783 // this is an exact half way case for rounding
2784 clo |= chi << 63; // move lowest high bit into highest bit of lo
2785 chi >>= 1; // right shift hi
2786 chi |= 0x8000000000000000ULL; // move the carry bit into hi.
2787 *exponent = *exponent + 1;
2788 }
2789
2790 *hi = chi;
2791 *lo = clo;
2792 }
2793
2794 // 128-bit subtract of ((chi << 64) + clo) - ((*hi << 64) + *lo)
sub128(cl_ulong * chi,cl_ulong * clo,cl_ulong hi,cl_ulong lo,cl_ulong * signC,int * expC)2795 static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
2796 cl_ulong lo, cl_ulong *signC, int *expC)
2797 {
2798 cl_ulong rHi = *chi;
2799 cl_ulong rLo = *clo;
2800 cl_ulong carry, carry2;
2801
2802 // extended precision subtract
2803 rLo = sub_carry(rLo, lo, &carry);
2804 rHi = sub_carry(rHi, hi, &carry2);
2805 rHi = sub_carry(rHi, carry, &carry);
2806
2807 // Check for sign flip
2808 if (carry || carry2)
2809 {
2810 *signC ^= 0x8000000000000000ULL;
2811
2812 // negate rLo, rHi: -x = (x ^ -1) + 1
2813 rLo ^= -1ULL;
2814 rHi ^= -1ULL;
2815 rLo++;
2816 rHi += 0 == rLo;
2817 }
2818
2819 // normalize -- move the most significant non-zero bit to the MSB, and
2820 // adjust exponent accordingly
2821 if (rHi == 0)
2822 {
2823 rHi = rLo;
2824 *expC = *expC - 64;
2825 rLo = 0;
2826 }
2827
2828 if (rHi)
2829 {
2830 int shift = 32;
2831 cl_ulong test = 1ULL << 32;
2832 while (0 == (rHi & 0x8000000000000000ULL))
2833 {
2834 if (rHi < test)
2835 {
2836 rHi <<= shift;
2837 rHi |= rLo >> (64 - shift);
2838 rLo <<= shift;
2839 *expC = *expC - shift;
2840 }
2841 shift >>= 1;
2842 test <<= shift;
2843 }
2844 }
2845 else
2846 {
2847 // zero
2848 *expC = INT_MIN;
2849 *signC = 0;
2850 }
2851
2852
2853 *chi = rHi;
2854 *clo = rLo;
2855 }
2856
reference_fmal(long double x,long double y,long double z)2857 long double reference_fmal(long double x, long double y, long double z)
2858 {
2859 static const cl_ulong kMSB = 0x8000000000000000ULL;
2860
2861 // cast values back to double. This is an exact function, so
2862 double a = x;
2863 double b = y;
2864 double c = z;
2865
2866 // Make bits accessible
2867 union {
2868 cl_ulong u;
2869 cl_double d;
2870 } ua;
2871 ua.d = a;
2872 union {
2873 cl_ulong u;
2874 cl_double d;
2875 } ub;
2876 ub.d = b;
2877 union {
2878 cl_ulong u;
2879 cl_double d;
2880 } uc;
2881 uc.d = c;
2882
2883 // deal with Nans, infinities and zeros
2884 if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
2885 || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
2886 0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
2887 0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
2888 {
2889 if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
2890
2891 a = (double)reference_multiplyl(
2892 a, b); // some risk that the compiler will insert a non-compliant
2893 // fma here on some platforms.
2894 return reference_addl(
2895 a,
2896 c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
2897 }
2898
2899 // extract exponent and mantissa
2900 // exponent is a standard unbiased signed integer
2901 // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
2902 cl_ulong mantA, mantB, mantC;
2903 int expA = extract(a, &mantA);
2904 int expB = extract(b, &mantB);
2905 int expC = extract(c, &mantC);
2906 cl_ulong signC = uc.u & kMSB; // We'll need the sign bit of C later to
2907 // decide if we are adding or subtracting
2908
2909 // exact product of A and B
2910 int exponent = expA + expB;
2911 cl_ulong sign = (ua.u ^ ub.u) & kMSB;
2912 cl_ulong hi, lo;
2913 mul128(mantA, mantB, &hi, &lo);
2914
2915 // renormalize
2916 if (0 == (kMSB & hi))
2917 {
2918 hi <<= 1;
2919 hi |= lo >> 63;
2920 lo <<= 1;
2921 }
2922 else
2923 exponent++; // 2**63 * 2**63 gives 2**126. If the MSB was set, then our
2924 // exponent increased.
2925
2926 // infinite precision add
2927 cl_ulong chi = mantC;
2928 cl_ulong clo = 0;
2929
2930 if (exponent >= expC)
2931 {
2932 // Normalize C relative to the product
2933 if (exponent > expC)
2934 shift_right_sticky_128(&chi, &clo, exponent - expC);
2935
2936 // Add
2937 if (sign ^ signC)
2938 sub128(&hi, &lo, chi, clo, &sign, &exponent);
2939 else
2940 add128(&hi, &lo, chi, clo, &exponent);
2941 }
2942 else
2943 {
2944 // Shift the product relative to C so that their exponents match
2945 shift_right_sticky_128(&hi, &lo, expC - exponent);
2946
2947 // add
2948 if (sign ^ signC)
2949 sub128(&chi, &clo, hi, lo, &signC, &expC);
2950 else
2951 add128(&chi, &clo, hi, lo, &expC);
2952
2953 hi = chi;
2954 lo = clo;
2955 exponent = expC;
2956 sign = signC;
2957 }
2958
2959 // round
2960 ua.d = round_to_nearest_even_double(hi, lo, exponent);
2961
2962 // Set the sign
2963 ua.u |= sign;
2964
2965 return ua.d;
2966 }
2967
2968
reference_madl(long double a,long double b,long double c)2969 long double reference_madl(long double a, long double b, long double c)
2970 {
2971 return a * b + c;
2972 }
2973
reference_recipl(long double x)2974 long double reference_recipl(long double x) { return 1.0L / x; }
2975
reference_rootnl(long double x,int i)2976 long double reference_rootnl(long double x, int i)
2977 {
2978 // rootn ( x, 0 ) returns a NaN.
2979 if (0 == i) return cl_make_nan();
2980
2981 // rootn ( x, n ) returns a NaN for x < 0 and n is even.
2982 if (x < 0.0L && 0 == (i & 1)) return cl_make_nan();
2983
2984 if (isinf(x))
2985 {
2986 if (i < 0) return reference_copysignl(0.0L, x);
2987
2988 return x;
2989 }
2990
2991 if (x == 0.0)
2992 {
2993 switch (i & 0x80000001)
2994 {
2995 // rootn ( +-0, n ) is +0 for even n > 0.
2996 case 0: return 0.0L;
2997
2998 // rootn ( +-0, n ) is +-0 for odd n > 0.
2999 case 1: return x;
3000
3001 // rootn ( +-0, n ) is +inf for even n < 0.
3002 case 0x80000000: return INFINITY;
3003
3004 // rootn ( +-0, n ) is +-inf for odd n < 0.
3005 case 0x80000001: return copysign(INFINITY, x);
3006 }
3007 }
3008
3009 if (i == 1) return x;
3010
3011 if (i == -1) return 1.0 / x;
3012
3013 long double sign = x;
3014 x = reference_fabsl(x);
3015 double iHi, iLo;
3016 DivideDD(&iHi, &iLo, 1.0, i);
3017 x = reference_powl(x, iHi) * reference_powl(x, iLo);
3018
3019 return reference_copysignl(x, sign);
3020 }
3021
reference_rsqrtl(long double x)3022 long double reference_rsqrtl(long double x) { return 1.0L / sqrtl(x); }
3023
reference_sinpil(long double x)3024 long double reference_sinpil(long double x)
3025 {
3026 double r = reduce1l(x);
3027
3028 // reduce to [-0.5, 0.5]
3029 if (r < -0.5L)
3030 r = -1.0L - r;
3031 else if (r > 0.5L)
3032 r = 1.0L - r;
3033
3034 // sinPi zeros have the same sign as x
3035 if (r == 0.0L) return reference_copysignl(0.0L, x);
3036
3037 return reference_sinl(r * M_PIL);
3038 }
3039
reference_tanpil(long double x)3040 long double reference_tanpil(long double x)
3041 {
3042 // set aside the sign (allows us to preserve sign of -0)
3043 long double sign = reference_copysignl(1.0L, x);
3044 long double z = reference_fabsl(x);
3045
3046 // if big and even -- caution: only works if x only has single precision
3047 if (z >= HEX_LDBL(+, 1, 0, +, 53))
3048 {
3049 if (z == INFINITY) return x - x; // nan
3050
3051 return reference_copysignl(
3052 0.0L, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n.
3053 }
3054
3055 // reduce to the range [ -0.5, 0.5 ]
3056 long double nearest =
3057 reference_rintl(z); // round to nearest even places n + 0.5 values in
3058 // the right place for us
3059 int64_t i =
3060 (int64_t)nearest; // test above against 0x1.0p53 avoids overflow here
3061 z -= nearest;
3062
3063 // correction for odd integer x for the right sign of zero
3064 if ((i & 1) && z == 0.0L) sign = -sign;
3065
3066 // track changes to the sign
3067 sign *= reference_copysignl(1.0L, z); // really should just be an xor
3068 z = reference_fabsl(z); // remove the sign again
3069
3070 // reduce once more
3071 // If we don't do this, rounding error in z * M_PI will cause us not to
3072 // return infinities properly
3073 if (z > 0.25L)
3074 {
3075 z = 0.5L - z;
3076 return sign
3077 / reference_tanl(z
3078 * M_PIL); // use system tan to get the right result
3079 }
3080
3081 //
3082 return sign
3083 * reference_tanl(z * M_PIL); // use system tan to get the right result
3084 }
3085
reference_pownl(long double x,int i)3086 long double reference_pownl(long double x, int i)
3087 {
3088 return reference_powl(x, (long double)i);
3089 }
3090
reference_powrl(long double x,long double y)3091 long double reference_powrl(long double x, long double y)
3092 {
3093 // powr ( x, y ) returns NaN for x < 0.
3094 if (x < 0.0L) return cl_make_nan();
3095
3096 // powr ( x, NaN ) returns the NaN for x >= 0.
3097 // powr ( NaN, y ) returns the NaN.
3098 if (isnan(x) || isnan(y))
3099 return x + y; // Note: behavior different here than for pow(1,NaN),
3100 // pow(NaN, 0)
3101
3102 if (x == 1.0L)
3103 {
3104 // powr ( +1, +-inf ) returns NaN.
3105 if (reference_fabsl(y) == INFINITY) return cl_make_nan();
3106
3107 // powr ( +1, y ) is 1 for finite y. (NaN handled above)
3108 return 1.0L;
3109 }
3110
3111 if (y == 0.0L)
3112 {
3113 // powr ( +inf, +-0 ) returns NaN.
3114 // powr ( +-0, +-0 ) returns NaN.
3115 if (x == 0.0L || x == INFINITY) return cl_make_nan();
3116
3117 // powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already
3118 // handled above)
3119 return 1.0L;
3120 }
3121
3122 if (x == 0.0L)
3123 {
3124 // powr ( +-0, -inf) is +inf.
3125 // powr ( +-0, y ) is +inf for finite y < 0.
3126 if (y < 0.0L) return INFINITY;
3127
3128 // powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above)
3129 return 0.0L;
3130 }
3131
3132 return reference_powl(x, y);
3133 }
3134
reference_addl(long double x,long double y)3135 long double reference_addl(long double x, long double y)
3136 {
3137 volatile double a = (double)x;
3138 volatile double b = (double)y;
3139
3140 #if defined(__SSE2__)
3141 // defeat x87
3142 __m128d va = _mm_set_sd((double)a);
3143 __m128d vb = _mm_set_sd((double)b);
3144 va = _mm_add_sd(va, vb);
3145 _mm_store_sd((double *)&a, va);
3146 #else
3147 a += b;
3148 #endif
3149 return (long double)a;
3150 }
3151
reference_subtractl(long double x,long double y)3152 long double reference_subtractl(long double x, long double y)
3153 {
3154 volatile double a = (double)x;
3155 volatile double b = (double)y;
3156
3157 #if defined(__SSE2__)
3158 // defeat x87
3159 __m128d va = _mm_set_sd((double)a);
3160 __m128d vb = _mm_set_sd((double)b);
3161 va = _mm_sub_sd(va, vb);
3162 _mm_store_sd((double *)&a, va);
3163 #else
3164 a -= b;
3165 #endif
3166 return (long double)a;
3167 }
3168
reference_multiplyl(long double x,long double y)3169 long double reference_multiplyl(long double x, long double y)
3170 {
3171 volatile double a = (double)x;
3172 volatile double b = (double)y;
3173
3174 #if defined(__SSE2__)
3175 // defeat x87
3176 __m128d va = _mm_set_sd((double)a);
3177 __m128d vb = _mm_set_sd((double)b);
3178 va = _mm_mul_sd(va, vb);
3179 _mm_store_sd((double *)&a, va);
3180 #else
3181 a *= b;
3182 #endif
3183 return (long double)a;
3184 }
3185
reference_lgamma_rl(long double x,int * signp)3186 long double reference_lgamma_rl(long double x, int *signp)
3187 {
3188 *signp = 0;
3189 return x;
3190 }
3191
reference_isequall(long double x,long double y)3192 int reference_isequall(long double x, long double y) { return x == y; }
reference_isfinitel(long double x)3193 int reference_isfinitel(long double x) { return 0 != isfinite(x); }
reference_isgreaterl(long double x,long double y)3194 int reference_isgreaterl(long double x, long double y) { return x > y; }
reference_isgreaterequall(long double x,long double y)3195 int reference_isgreaterequall(long double x, long double y) { return x >= y; }
reference_isinfl(long double x)3196 int reference_isinfl(long double x) { return 0 != isinf(x); }
reference_islessl(long double x,long double y)3197 int reference_islessl(long double x, long double y) { return x < y; }
reference_islessequall(long double x,long double y)3198 int reference_islessequall(long double x, long double y) { return x <= y; }
3199 #if defined(__INTEL_COMPILER)
reference_islessgreaterl(long double x,long double y)3200 int reference_islessgreaterl(long double x, long double y)
3201 {
3202 return 0 != islessgreaterl(x, y);
3203 }
3204 #else
reference_islessgreaterl(long double x,long double y)3205 int reference_islessgreaterl(long double x, long double y)
3206 {
3207 return 0 != islessgreater(x, y);
3208 }
3209 #endif
reference_isnanl(long double x)3210 int reference_isnanl(long double x) { return 0 != isnan(x); }
reference_isnormall(long double x)3211 int reference_isnormall(long double x) { return 0 != isnormal((double)x); }
reference_isnotequall(long double x,long double y)3212 int reference_isnotequall(long double x, long double y) { return x != y; }
reference_isorderedl(long double x,long double y)3213 int reference_isorderedl(long double x, long double y)
3214 {
3215 return x == x && y == y;
3216 }
reference_isunorderedl(long double x,long double y)3217 int reference_isunorderedl(long double x, long double y)
3218 {
3219 return isnan(x) || isnan(y);
3220 }
3221 #if defined(__INTEL_COMPILER)
reference_signbitl(long double x)3222 int reference_signbitl(long double x) { return 0 != signbitl(x); }
3223 #else
reference_signbitl(long double x)3224 int reference_signbitl(long double x) { return 0 != signbit(x); }
3225 #endif
3226 long double reference_copysignl(long double x, long double y);
3227 long double reference_roundl(long double x);
3228 long double reference_cbrtl(long double x);
3229
reference_copysignl(long double x,long double y)3230 long double reference_copysignl(long double x, long double y)
3231 {
3232 // We hope that the long double to double conversion proceeds with sign
3233 // fidelity, even for zeros and NaNs
3234 union {
3235 double d;
3236 cl_ulong u;
3237 } u;
3238 u.d = (double)y;
3239
3240 x = reference_fabsl(x);
3241 if (u.u >> 63) x = -x;
3242
3243 return x;
3244 }
3245
reference_roundl(long double x)3246 long double reference_roundl(long double x)
3247 {
3248 // Since we are just using this to verify double precision, we can
3249 // use the double precision copysign here
3250
3251 #if defined(__MINGW32__) && defined(__x86_64__)
3252 long double absx = reference_fabsl(x);
3253 if (absx < 0.5L) return reference_copysignl(0.0L, x);
3254 #endif
3255 return round((double)x);
3256 }
3257
reference_truncl(long double x)3258 long double reference_truncl(long double x)
3259 {
3260 // Since we are just using this to verify double precision, we can
3261 // use the double precision copysign here
3262 return trunc((double)x);
3263 }
3264
3265 static long double reference_scalblnl(long double x, long n);
3266
reference_cbrtl(long double x)3267 long double reference_cbrtl(long double x)
3268 {
3269 double yhi = HEX_DBL(+, 1, 5555555555555, -, 2);
3270 double ylo = HEX_DBL(+, 1, 558, -, 56);
3271
3272 double fabsx = reference_fabs(x);
3273
3274 if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x;
3275
3276 double log2x_hi, log2x_lo;
3277
3278 // extended precision log .... accurate to at least 64-bits + couple of
3279 // guard bits
3280 __log2_ep(&log2x_hi, &log2x_lo, fabsx);
3281
3282 double ylog2x_hi, ylog2x_lo;
3283
3284 double y_hi = yhi;
3285 double y_lo = ylo;
3286
3287 // compute product of y*log2(x)
3288 MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
3289
3290 long double powxy;
3291 if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
3292 {
3293 powxy =
3294 reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
3295 }
3296 else
3297 {
3298 // separate integer + fractional part
3299 long int m = lrint(ylog2x_hi);
3300 AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
3301
3302 // revert to long double arithemtic
3303 long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
3304 powxy = reference_exp2l(ylog2x);
3305 powxy = reference_scalblnl(powxy, m);
3306 }
3307
3308 return reference_copysignl(powxy, x);
3309 }
3310
reference_rintl(long double x)3311 long double reference_rintl(long double x)
3312 {
3313 #if defined(__PPC__)
3314 // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
3315 // mantissa can represent more than LDBL_MANT_DIG binary digits.
3316 x = rintl(x);
3317 #else
3318 static long double magic[2] = { 0.0L, 0.0L };
3319
3320 if (0.0L == magic[0])
3321 {
3322 magic[0] = scalbnl(0.5L, LDBL_MANT_DIG);
3323 magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG);
3324 }
3325
3326 if (reference_fabsl(x) < magic[0] && x != 0.0L)
3327 {
3328 long double m = magic[x < 0];
3329 x += m;
3330 x -= m;
3331 }
3332 #endif // __PPC__
3333 return x;
3334 }
3335
3336 // extended precision sqrt using newton iteration on 1/sqrt(x).
3337 // Final result is computed as x * 1/sqrt(x)
__sqrt_ep(double * rhi,double * rlo,double xhi,double xlo)3338 static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo)
3339 {
3340 // approximate reciprocal sqrt
3341 double thi = 1.0 / sqrt(xhi);
3342 double tlo = 0.0;
3343
3344 // One newton iteration in double-double
3345 double yhi, ylo;
3346 MulDD(&yhi, &ylo, thi, tlo, thi, tlo);
3347 MulDD(&yhi, &ylo, yhi, ylo, xhi, xlo);
3348 AddDD(&yhi, &ylo, -yhi, -ylo, 3.0, 0.0);
3349 MulDD(&yhi, &ylo, yhi, ylo, thi, tlo);
3350 MulDD(&yhi, &ylo, yhi, ylo, 0.5, 0.0);
3351
3352 MulDD(rhi, rlo, yhi, ylo, xhi, xlo);
3353 }
3354
reference_acoshl(long double x)3355 long double reference_acoshl(long double x)
3356 {
3357 /*
3358 * ====================================================
3359 * This function derived from fdlibm http://www.netlib.org
3360 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
3361 *
3362 * Developed at SunSoft, a Sun Microsystems, Inc. business.
3363 * Permission to use, copy, modify, and distribute this
3364 * software is freely granted, provided that this notice
3365 * is preserved.
3366 * ====================================================
3367 *
3368 */
3369 if (isnan(x) || isinf(x)) return x + fabsl(x);
3370
3371 if (x < 1.0L) return cl_make_nan();
3372
3373 if (x == 1.0L) return 0.0L;
3374
3375 if (x > HEX_LDBL(+, 1, 0, +, 60))
3376 return reference_logl(x) + 0.693147180559945309417232121458176568L;
3377
3378 if (x > 2.0L)
3379 return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x * x - 1.0L)));
3380
3381 double hi, lo;
3382 MulD(&hi, &lo, x, x);
3383 AddDD(&hi, &lo, hi, lo, -1.0, 0.0);
3384 __sqrt_ep(&hi, &lo, hi, lo);
3385 AddDD(&hi, &lo, hi, lo, x, 0.0);
3386 double correction = lo / hi;
3387 __log2_ep(&hi, &lo, hi);
3388 double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
3389 double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
3390 MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo);
3391 AddDD(&hi, &lo, hi, lo, correction, 0.0);
3392
3393 return hi + lo;
3394 }
3395
reference_asinhl(long double x)3396 long double reference_asinhl(long double x)
3397 {
3398 long double cutoff = 0.0L;
3399 const long double ln2 = HEX_LDBL(+, b, 17217f7d1cf79ab, -, 4);
3400
3401 if (cutoff == 0.0L) cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG);
3402
3403 if (isnan(x) || isinf(x)) return x + x;
3404
3405 long double absx = reference_fabsl(x);
3406 if (absx < cutoff) return x;
3407
3408 long double sign = reference_copysignl(1.0L, x);
3409
3410 if (absx <= 4.0 / 3.0)
3411 {
3412 return sign
3413 * reference_log1pl(absx + x * x / (1.0 + sqrtl(1.0 + x * x)));
3414 }
3415 else if (absx <= HEX_LDBL(+, 1, 0, +, 27))
3416 {
3417 return sign
3418 * reference_logl(2.0L * absx + 1.0L / (sqrtl(x * x + 1.0) + absx));
3419 }
3420 else
3421 {
3422 return sign * (reference_logl(absx) + ln2);
3423 }
3424 }
3425
reference_atanhl(long double x)3426 long double reference_atanhl(long double x)
3427 {
3428 /*
3429 * ====================================================
3430 * This function is from fdlibm: http://www.netlib.org
3431 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
3432 *
3433 * Developed at SunSoft, a Sun Microsystems, Inc. business.
3434 * Permission to use, copy, modify, and distribute this
3435 * software is freely granted, provided that this notice
3436 * is preserved.
3437 * ====================================================
3438 */
3439 if (isnan(x)) return x + x;
3440
3441 long double signed_half = reference_copysignl(0.5L, x);
3442 x = reference_fabsl(x);
3443 if (x > 1.0L) return cl_make_nan();
3444
3445 if (x < 0.5L)
3446 return signed_half * reference_log1pl(2.0L * (x + x * x / (1 - x)));
3447
3448 return signed_half * reference_log1pl(2.0L * x / (1 - x));
3449 }
3450
reference_exp2l(long double z)3451 long double reference_exp2l(long double z)
3452 {
3453 double_double x;
3454 int j;
3455
3456 // Handle NaNs
3457 if (isnan(z)) return z;
3458
3459 // init x
3460 x.hi = z;
3461 x.lo = z - x.hi;
3462
3463 // Deal with overflow and underflow for exp2(x) stage next
3464 if (x.hi >= 1025) return INFINITY;
3465
3466 if (x.hi < -1075 - 24) return +0.0;
3467
3468 // find nearest integer to x
3469 int i = (int)rint(x.hi);
3470
3471 // x now holds fractional part. The result would be then 2**i * exp2( x )
3472 x.hi -= i;
3473
3474 // We could attempt to find a minimax polynomial for exp2(x) over the range
3475 // x = [-0.5, 0.5]. However, this would converge very slowly near the
3476 // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
3477 // requiring something like a 20th order polynomial to get 53 + 24 bits of
3478 // precision. Instead we further reduce the range to [-1/32, 1/32] by
3479 // observing that
3480 //
3481 // 2**(a+b) = 2**a * 2**b
3482 //
3483 // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
3484 // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
3485 // value of n/16 from x.
3486 const double_double corrections[17] = {
3487 { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
3488 HEX_DBL(-, 1, bdd3413b26456, -, 55) },
3489 { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
3490 HEX_DBL(-, 1, 41577ee04992f, -, 56) },
3491 { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
3492 HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
3493 { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
3494 HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
3495 { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
3496 HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
3497 { HEX_DBL(+, 1, c199bdd85529c, -, 1),
3498 HEX_DBL(+, 1, 11065895048dd, -, 56) },
3499 { HEX_DBL(+, 1, d5818dcfba487, -, 1),
3500 HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
3501 { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
3502 HEX_DBL(-, 1, e9c23179c2893, -, 55) },
3503 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
3504 { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
3505 HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
3506 { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
3507 HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
3508 { HEX_DBL(+, 1, 2387a6e756238, +, 0),
3509 HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
3510 { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
3511 HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
3512 { HEX_DBL(+, 1, 3dea64c123422, +, 0),
3513 HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
3514 { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
3515 HEX_DBL(+, 1, d4397afec42e2, -, 56) },
3516 { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
3517 HEX_DBL(+, 1, 6324c054647ad, -, 54) },
3518 { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
3519 HEX_DBL(-, 1, bdd3413b26456, -, 54) }
3520 };
3521 int index = (int)rint(x.hi * 16.0);
3522 x.hi -= (double)index * 0.0625;
3523
3524 // canonicalize x
3525 double temp = x.hi;
3526 x.hi += x.lo;
3527 x.lo -= x.hi - temp;
3528
3529 // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. Max
3530 // Error: 2 * 0x1.e112p-87
3531 const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
3532 HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
3533 { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
3534 HEX_DBL(-, 1, 5e4987a631846, -, 57) },
3535 { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
3536 HEX_DBL(-, 1, d323200a05713, -, 59) },
3537 { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
3538 HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
3539 { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
3540 HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
3541 { HEX_DBL(+, 1, 430912f7e8373, -, 13),
3542 HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
3543 { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
3544 HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
3545 { HEX_DBL(+, 1, 62c150eb16465, -, 20),
3546 HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
3547 { HEX_DBL(+, 1, b52502b5e953, -, 24),
3548 HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
3549 size_t count = sizeof(c) / sizeof(c[0]);
3550
3551 // Do polynomial
3552 double_double r = c[count - 1];
3553 for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
3554
3555 // unwind approximation
3556 r = mul_dd(r, x); // before: r =(exp2(x)-1)/x; after: r = exp2(x) - 1
3557
3558 // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
3559 // exp2(x) = (r + 1) * correction = r * correction + correction
3560 r = mul_dd(r, corrections[index + 8]);
3561 r = add_dd(r, corrections[index + 8]);
3562
3563 // Format result for output:
3564
3565 // Get mantissa
3566 long double m = ((long double)r.hi + (long double)r.lo);
3567
3568 // Handle a pesky overflow cases when long double = double
3569 if (i > 512)
3570 {
3571 m *= HEX_DBL(+, 1, 0, +, 512);
3572 i -= 512;
3573 }
3574 else if (i < -512)
3575 {
3576 m *= HEX_DBL(+, 1, 0, -, 512);
3577 i += 512;
3578 }
3579
3580 return m * ldexpl(1.0L, i);
3581 }
3582
reference_expm1l(long double x)3583 long double reference_expm1l(long double x)
3584 {
3585 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
3586 // unimplemented
3587 return x;
3588 #else
3589 if (reference_isnanl(x)) return x;
3590
3591 if (x > 710) return INFINITY;
3592
3593 long double y = expm1l(x);
3594
3595 // Range of expm1l is -1.0L to +inf. Negative inf
3596 // on a few Linux platforms is clearly the wrong sign.
3597 if (reference_isinfl(y)) y = INFINITY;
3598
3599 return y;
3600 #endif
3601 }
3602
reference_fmaxl(long double x,long double y)3603 long double reference_fmaxl(long double x, long double y)
3604 {
3605 if (isnan(y)) return x;
3606
3607 return x >= y ? x : y;
3608 }
3609
reference_fminl(long double x,long double y)3610 long double reference_fminl(long double x, long double y)
3611 {
3612 if (isnan(y)) return x;
3613
3614 return x <= y ? x : y;
3615 }
3616
reference_hypotl(long double x,long double y)3617 long double reference_hypotl(long double x, long double y)
3618 {
3619 static const double tobig = HEX_DBL(+, 1, 0, +, 511);
3620 static const double big = HEX_DBL(+, 1, 0, +, 513);
3621 static const double rbig = HEX_DBL(+, 1, 0, -, 513);
3622 static const double tosmall = HEX_DBL(+, 1, 0, -, 511);
3623 static const double smalll = HEX_DBL(+, 1, 0, -, 607);
3624 static const double rsmall = HEX_DBL(+, 1, 0, +, 607);
3625
3626 long double max, min;
3627
3628 if (isinf(x) || isinf(y)) return INFINITY;
3629
3630 if (isnan(x) || isnan(y)) return x + y;
3631
3632 x = reference_fabsl(x);
3633 y = reference_fabsl(y);
3634
3635 max = reference_fmaxl(x, y);
3636 min = reference_fminl(x, y);
3637
3638 if (max > tobig)
3639 {
3640 max *= rbig;
3641 min *= rbig;
3642 return big * sqrtl(max * max + min * min);
3643 }
3644
3645 if (max < tosmall)
3646 {
3647 max *= rsmall;
3648 min *= rsmall;
3649 return smalll * sqrtl(max * max + min * min);
3650 }
3651 return sqrtl(x * x + y * y);
3652 }
3653
reference_log2l(long double x)3654 long double reference_log2l(long double x)
3655 {
3656 if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN;
3657
3658 if (x == 0.0f) return -INFINITY;
3659
3660 if (x == INFINITY) return INFINITY;
3661
3662 double hi, lo;
3663 __log2_ep(&hi, &lo, x);
3664
3665 return (long double)hi + (long double)lo;
3666 }
3667
reference_log1pl(long double x)3668 long double reference_log1pl(long double x)
3669 {
3670 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
3671 // unimplemented
3672 return x;
3673 #elif defined(__PPC__)
3674 // log1pl on PPC inadvertantly returns NaN for very large values. Work
3675 // around this limitation by returning logl for large values.
3676 return ((x > (long double)(0x1.0p+1022)) ? logl(x) : log1pl(x));
3677 #else
3678 return log1pl(x);
3679 #endif
3680 }
3681
reference_logbl(long double x)3682 long double reference_logbl(long double x)
3683 {
3684 // Since we are just using this to verify double precision, we can
3685 // use the double precision copysign here
3686 union {
3687 double f;
3688 cl_ulong u;
3689 } u;
3690 u.f = (double)x;
3691
3692 cl_int exponent = (cl_uint)(u.u >> 52) & 0x7ff;
3693 if (exponent == 0x7ff) return x * x;
3694
3695 if (exponent == 0)
3696 { // deal with denormals
3697 u.f = x * HEX_DBL(+, 1, 0, +, 64);
3698 exponent = (cl_int)(u.u >> 52) & 0x7ff;
3699 if (exponent == 0) return -INFINITY;
3700
3701 return exponent - (1023 + 64);
3702 }
3703
3704 return exponent - 1023;
3705 }
3706
reference_maxmagl(long double x,long double y)3707 long double reference_maxmagl(long double x, long double y)
3708 {
3709 long double fabsx = fabsl(x);
3710 long double fabsy = fabsl(y);
3711
3712 if (fabsx < fabsy) return y;
3713
3714 if (fabsy < fabsx) return x;
3715
3716 return reference_fmaxl(x, y);
3717 }
3718
reference_minmagl(long double x,long double y)3719 long double reference_minmagl(long double x, long double y)
3720 {
3721 long double fabsx = fabsl(x);
3722 long double fabsy = fabsl(y);
3723
3724 if (fabsx > fabsy) return y;
3725
3726 if (fabsy > fabsx) return x;
3727
3728 return reference_fminl(x, y);
3729 }
3730
reference_nanl(cl_ulong x)3731 long double reference_nanl(cl_ulong x)
3732 {
3733 union {
3734 cl_ulong u;
3735 cl_double f;
3736 } u;
3737 u.u = x | 0x7ff8000000000000ULL;
3738 return (long double)u.f;
3739 }
3740
3741
reference_reciprocall(long double x)3742 long double reference_reciprocall(long double x) { return 1.0L / x; }
3743
reference_remainderl(long double x,long double y)3744 long double reference_remainderl(long double x, long double y)
3745 {
3746 int i;
3747 return reference_remquol(x, y, &i);
3748 }
3749
reference_lgammal(long double x)3750 long double reference_lgammal(long double x)
3751 {
3752 // lgamma is currently not tested
3753 return reference_lgamma(x);
3754 }
3755
3756 static uint32_t two_over_pi[] = {
3757 0x0, 0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5,
3758 0x1993c439, 0x107f945, 0x23abdebb, 0x31586dc9, 0x6e3a424, 0x374b8019,
3759 0x92eea09, 0x3464873f, 0x21deb1cb, 0x4a69cfb, 0x288235f5, 0xbaed121,
3760 0xe99c702, 0x1ad17df9, 0x13991d6, 0xe60d4ce, 0x1f49c845, 0x3e2ef7e4,
3761 0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d, 0xd9fb3c9,
3762 0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7,
3763 0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, 0x3046fc7b, 0x1aeafc33,
3764 0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102,
3765 0xdffd880, 0x135cc9cc, 0x10606155
3766 };
3767
3768 static uint32_t pi_over_two[] = { 0x1, 0x2487ed51, 0x42d1846,
3769 0x26263314, 0x1701b839, 0x28948127 };
3770
3771 union d_ui64_t {
3772 uint64_t u;
3773 double d;
3774 };
3775
3776 // radix or base of representation
3777 #define RADIX (30)
3778 #define DIGITS 6
3779
3780 d_ui64_t two_pow_pradix = { (uint64_t)(1023 + RADIX) << 52 };
3781 d_ui64_t two_pow_mradix = { (uint64_t)(1023 - RADIX) << 52 };
3782 d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 };
3783
3784 #define tp_pradix two_pow_pradix.d
3785 #define tp_mradix two_pow_mradix.d
3786
3787 // extended fixed point representation of double precision
3788 // floating point number.
3789 // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ]
3790 struct eprep_t
3791 {
3792 uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in
3793 // base_30
3794 int index; // exponent bias
3795 int sign; // sign of double
3796 };
3797
double_to_eprep(double x)3798 static eprep_t double_to_eprep(double x)
3799 {
3800 eprep_t result;
3801
3802 result.sign = (signbit(x) == 0) ? 1 : -1;
3803 x = fabs(x);
3804
3805 int index = 0;
3806 while (x > tp_pradix)
3807 {
3808 index++;
3809 x *= tp_mradix;
3810 }
3811 while (x < 1)
3812 {
3813 index--;
3814 x *= tp_pradix;
3815 }
3816
3817 result.index = index;
3818 int i = 0;
3819 result.X[0] = result.X[1] = result.X[2] = 0;
3820 while (x != 0.0)
3821 {
3822 result.X[i] = (uint32_t)x;
3823 x = (x - (double)result.X[i]) * tp_pradix;
3824 i++;
3825 }
3826 return result;
3827 }
3828
eprep_to_double(eprep_t epx)3829 static double eprep_to_double(eprep_t epx)
3830 {
3831 double res = 0.0;
3832
3833 res += ldexp((double)epx.X[0], (epx.index - 0) * RADIX);
3834 res += ldexp((double)epx.X[1], (epx.index - 1) * RADIX);
3835 res += ldexp((double)epx.X[2], (epx.index - 2) * RADIX);
3836
3837 return copysign(res, epx.sign);
3838 }
3839
payne_hanek(double * y,int * exception)3840 static int payne_hanek(double *y, int *exception)
3841 {
3842 double x = *y;
3843
3844 // exception cases .. no reduction required
3845 if (isnan(x) || isinf(x) || (fabs(x) <= M_PI_4))
3846 {
3847 *exception = 1;
3848 return 0;
3849 }
3850
3851 *exception = 0;
3852
3853 // After computation result[0] contains integer part while
3854 // result[1]....result[DIGITS-1] contain fractional part. So we are doing
3855 // computation with (DIGITS-1)*RADIX precision. Default DIGITS=6 and
3856 // RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm shows
3857 // that a double precision x, closest to pi/2 is 6381956970095103 x 2^797
3858 // which can cause 61 digits of cancellation in computation of f = x*2/pi -
3859 // floor(x*2/pi) ... thus we need at least 114 bits (61 leading zeros + 53
3860 // bits of mentissa of f) of precision to accurately compute f in double
3861 // precision. Since we are using 150 bits (still an overkill), we should be
3862 // safe. Extra bits can act as guard bits for correct rounding.
3863 uint64_t result[DIGITS + 2];
3864
3865 // compute extended precision representation of x
3866 eprep_t epx = double_to_eprep(x);
3867 int index = epx.index;
3868 int i, j;
3869 // extended precision multiplication of 2/pi*x .... we will loose at max two
3870 // RADIX=30 bit digits in the worst case
3871 for (i = 0; i < (DIGITS + 2); i++)
3872 {
3873 result[i] = 0;
3874 result[i] += ((index + i - 0) >= 0)
3875 ? ((uint64_t)two_over_pi[index + i - 0] * (uint64_t)epx.X[0])
3876 : 0;
3877 result[i] += ((index + i - 1) >= 0)
3878 ? ((uint64_t)two_over_pi[index + i - 1] * (uint64_t)epx.X[1])
3879 : 0;
3880 result[i] += ((index + i - 2) >= 0)
3881 ? ((uint64_t)two_over_pi[index + i - 2] * (uint64_t)epx.X[2])
3882 : 0;
3883 }
3884
3885 // Carry propagation.
3886 uint64_t tmp;
3887 for (i = DIGITS + 2 - 1; i > 0; i--)
3888 {
3889 tmp = result[i] >> RADIX;
3890 result[i - 1] += tmp;
3891 result[i] -= (tmp << RADIX);
3892 }
3893
3894 // we dont ned to normalize the integer part since only last two bits of
3895 // this will be used subsequently algorithm which remain unaltered by this
3896 // normalization. tmp = result[0] >> RADIX; result[0] -= (tmp << RADIX);
3897 unsigned int N = (unsigned int)result[0];
3898
3899 // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that
3900 // testing if the final x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to
3901 // testing, at this stage, if r[1] (the first fractional digit) is greater
3902 // than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to
3903 // mentioned range is equivalent to substracting fractional part at this
3904 // stage from one and changing the sign.
3905 int sign = 1;
3906 if (result[1] > (uint64_t)(1 << (RADIX - 1)))
3907 {
3908 for (i = 1; i < (DIGITS + 2); i++)
3909 result[i] = (~((unsigned int)result[i]) & 0x3fffffff);
3910 N += 1;
3911 sign = -1;
3912 }
3913
3914 // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in
3915 // the worst case (when x is multiple of 2/pi very close to an integer) so
3916 // we need to get rid of these zeros and adjust the index of final result.
3917 // So in the worst case, precision of comupted result is 90 bits (150 bits
3918 // original bits - 60 lost in cancellation).
3919 int ind = 1;
3920 for (i = 1; i < (DIGITS + 2); i++)
3921 {
3922 if (result[i] != 0)
3923 break;
3924 else
3925 ind++;
3926 }
3927
3928 uint64_t r[DIGITS - 1];
3929 for (i = 0; i < (DIGITS - 1); i++)
3930 {
3931 r[i] = 0;
3932 for (j = 0; j <= i; j++)
3933 {
3934 r[i] += (result[ind + i - j] * (uint64_t)pi_over_two[j]);
3935 }
3936 }
3937 for (i = (DIGITS - 2); i > 0; i--)
3938 {
3939 tmp = r[i] >> RADIX;
3940 r[i - 1] += tmp;
3941 r[i] -= (tmp << RADIX);
3942 }
3943 tmp = r[0] >> RADIX;
3944 r[0] -= (tmp << RADIX);
3945
3946 eprep_t epr;
3947 epr.sign = epx.sign * sign;
3948 if (tmp != 0)
3949 {
3950 epr.index = -ind + 1;
3951 epr.X[0] = (uint32_t)tmp;
3952 epr.X[1] = (uint32_t)r[0];
3953 epr.X[2] = (uint32_t)r[1];
3954 }
3955 else
3956 {
3957 epr.index = -ind;
3958 epr.X[0] = (uint32_t)r[0];
3959 epr.X[1] = (uint32_t)r[1];
3960 epr.X[2] = (uint32_t)r[2];
3961 }
3962
3963 *y = eprep_to_double(epr);
3964 return epx.sign * N;
3965 }
3966
reference_relaxed_cos(double x)3967 double reference_relaxed_cos(double x)
3968 {
3969 if (isnan(x)) return NAN;
3970 return (float)cos((float)x);
3971 }
3972
reference_cos(double x)3973 double reference_cos(double x)
3974 {
3975 int exception;
3976 int N = payne_hanek(&x, &exception);
3977 if (exception) return cos(x);
3978 unsigned int c = N & 3;
3979 switch (c)
3980 {
3981 case 0: return cos(x);
3982 case 1: return -sin(x);
3983 case 2: return -cos(x);
3984 case 3: return sin(x);
3985 }
3986 return 0.0;
3987 }
3988
reference_relaxed_sin(double x)3989 double reference_relaxed_sin(double x) { return (float)sin((float)x); }
3990
reference_sin(double x)3991 double reference_sin(double x)
3992 {
3993 int exception;
3994 int N = payne_hanek(&x, &exception);
3995 if (exception) return sin(x);
3996 int c = N & 3;
3997 switch (c)
3998 {
3999 case 0: return sin(x);
4000 case 1: return cos(x);
4001 case 2: return -sin(x);
4002 case 3: return -cos(x);
4003 }
4004 return 0.0;
4005 }
4006
reference_relaxed_sincos(double x,double * y)4007 double reference_relaxed_sincos(double x, double *y)
4008 {
4009 *y = reference_relaxed_cos(x);
4010 return reference_relaxed_sin(x);
4011 }
4012
reference_sincos(double x,double * y)4013 double reference_sincos(double x, double *y)
4014 {
4015 int exception;
4016 int N = payne_hanek(&x, &exception);
4017 if (exception)
4018 {
4019 *y = cos(x);
4020 return sin(x);
4021 }
4022 int c = N & 3;
4023 switch (c)
4024 {
4025 case 0: *y = cos(x); return sin(x);
4026 case 1: *y = -sin(x); return cos(x);
4027 case 2: *y = -cos(x); return -sin(x);
4028 case 3: *y = sin(x); return -cos(x);
4029 }
4030 return 0.0;
4031 }
4032
reference_relaxed_tan(double x)4033 double reference_relaxed_tan(double x)
4034 {
4035 return ((float)reference_relaxed_sin((float)x))
4036 / ((float)reference_relaxed_cos((float)x));
4037 }
4038
reference_tan(double x)4039 double reference_tan(double x)
4040 {
4041 int exception;
4042 int N = payne_hanek(&x, &exception);
4043 if (exception) return tan(x);
4044 int c = N & 3;
4045 switch (c)
4046 {
4047 case 0: return tan(x);
4048 case 1: return -1.0 / tan(x);
4049 case 2: return tan(x);
4050 case 3: return -1.0 / tan(x);
4051 }
4052 return 0.0;
4053 }
4054
reference_cosl(long double xx)4055 long double reference_cosl(long double xx)
4056 {
4057 double x = (double)xx;
4058 int exception;
4059 int N = payne_hanek(&x, &exception);
4060 if (exception) return cosl(x);
4061 unsigned int c = N & 3;
4062 switch (c)
4063 {
4064 case 0: return cosl(x);
4065 case 1: return -sinl(x);
4066 case 2: return -cosl(x);
4067 case 3: return sinl(x);
4068 }
4069 return 0.0;
4070 }
4071
reference_sinl(long double xx)4072 long double reference_sinl(long double xx)
4073 {
4074 // we use system tanl after reduction which
4075 // can flush denorm input to zero so
4076 // take care of it here.
4077 if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
4078
4079 double x = (double)xx;
4080 int exception;
4081 int N = payne_hanek(&x, &exception);
4082 if (exception) return sinl(x);
4083 int c = N & 3;
4084 switch (c)
4085 {
4086 case 0: return sinl(x);
4087 case 1: return cosl(x);
4088 case 2: return -sinl(x);
4089 case 3: return -cosl(x);
4090 }
4091 return 0.0;
4092 }
4093
reference_sincosl(long double xx,long double * y)4094 long double reference_sincosl(long double xx, long double *y)
4095 {
4096 // we use system tanl after reduction which
4097 // can flush denorm input to zero so
4098 // take care of it here.
4099 if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022))
4100 {
4101 *y = cosl(xx);
4102 return xx;
4103 }
4104
4105 double x = (double)xx;
4106 int exception;
4107 int N = payne_hanek(&x, &exception);
4108 if (exception)
4109 {
4110 *y = cosl(x);
4111 return sinl(x);
4112 }
4113 int c = N & 3;
4114 switch (c)
4115 {
4116 case 0: *y = cosl(x); return sinl(x);
4117 case 1: *y = -sinl(x); return cosl(x);
4118 case 2: *y = -cosl(x); return -sinl(x);
4119 case 3: *y = sinl(x); return -cosl(x);
4120 }
4121 return 0.0;
4122 }
4123
reference_tanl(long double xx)4124 long double reference_tanl(long double xx)
4125 {
4126 // we use system tanl after reduction which
4127 // can flush denorm input to zero so
4128 // take care of it here.
4129 if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
4130
4131 double x = (double)xx;
4132 int exception;
4133 int N = payne_hanek(&x, &exception);
4134 if (exception) return tanl(x);
4135 int c = N & 3;
4136 switch (c)
4137 {
4138 case 0: return tanl(x);
4139 case 1: return -1.0 / tanl(x);
4140 case 2: return tanl(x);
4141 case 3: return -1.0 / tanl(x);
4142 }
4143 return 0.0;
4144 }
4145
4146 static double __loglTable1[64][3] = {
4147 { HEX_DBL(+, 1, 5390948f40fea, +, 0), HEX_DBL(-, 1, a152f142a, -, 2),
4148 HEX_DBL(+, 1, f93e27b43bd2c, -, 40) },
4149 { HEX_DBL(+, 1, 5015015015015, +, 0), HEX_DBL(-, 1, 921800925, -, 2),
4150 HEX_DBL(+, 1, 162432a1b8df7, -, 41) },
4151 { HEX_DBL(+, 1, 4cab88725af6e, +, 0), HEX_DBL(-, 1, 8304d90c18, -, 2),
4152 HEX_DBL(+, 1, 80bb749056fe7, -, 40) },
4153 { HEX_DBL(+, 1, 49539e3b2d066, +, 0), HEX_DBL(-, 1, 7418acebc, -, 2),
4154 HEX_DBL(+, 1, ceac7f0607711, -, 43) },
4155 { HEX_DBL(+, 1, 460cbc7f5cf9a, +, 0), HEX_DBL(-, 1, 6552b49988, -, 2),
4156 HEX_DBL(+, 1, d8913d0e89fa, -, 42) },
4157 { HEX_DBL(+, 1, 42d6625d51f86, +, 0), HEX_DBL(-, 1, 56b22e6b58, -, 2),
4158 HEX_DBL(+, 1, c7eaf515033a1, -, 44) },
4159 { HEX_DBL(+, 1, 3fb013fb013fb, +, 0), HEX_DBL(-, 1, 48365e696, -, 2),
4160 HEX_DBL(+, 1, 434adcde7edc7, -, 41) },
4161 { HEX_DBL(+, 1, 3c995a47babe7, +, 0), HEX_DBL(-, 1, 39de8e156, -, 2),
4162 HEX_DBL(+, 1, 8246f8e527754, -, 40) },
4163 { HEX_DBL(+, 1, 3991c2c187f63, +, 0), HEX_DBL(-, 1, 2baa0c34c, -, 2),
4164 HEX_DBL(+, 1, e1513c28e180d, -, 42) },
4165 { HEX_DBL(+, 1, 3698df3de0747, +, 0), HEX_DBL(-, 1, 1d982c9d58, -, 2),
4166 HEX_DBL(+, 1, 63ea3fed4b8a2, -, 40) },
4167 { HEX_DBL(+, 1, 33ae45b57bcb1, +, 0), HEX_DBL(-, 1, 0fa848045, -, 2),
4168 HEX_DBL(+, 1, 32ccbacf1779b, -, 40) },
4169 { HEX_DBL(+, 1, 30d190130d19, +, 0), HEX_DBL(-, 1, 01d9bbcfa8, -, 2),
4170 HEX_DBL(+, 1, e2bfeb2b884aa, -, 42) },
4171 { HEX_DBL(+, 1, 2e025c04b8097, +, 0), HEX_DBL(-, 1, e857d3d37, -, 3),
4172 HEX_DBL(+, 1, d9309b4d2ea85, -, 40) },
4173 { HEX_DBL(+, 1, 2b404ad012b4, +, 0), HEX_DBL(-, 1, cd3c712d4, -, 3),
4174 HEX_DBL(+, 1, ddf360962d7ab, -, 40) },
4175 { HEX_DBL(+, 1, 288b01288b012, +, 0), HEX_DBL(-, 1, b2602497e, -, 3),
4176 HEX_DBL(+, 1, 597f8a121640f, -, 40) },
4177 { HEX_DBL(+, 1, 25e22708092f1, +, 0), HEX_DBL(-, 1, 97c1cb13d, -, 3),
4178 HEX_DBL(+, 1, 02807d15580dc, -, 40) },
4179 { HEX_DBL(+, 1, 23456789abcdf, +, 0), HEX_DBL(-, 1, 7d60496d, -, 3),
4180 HEX_DBL(+, 1, 12ce913d7a827, -, 41) },
4181 { HEX_DBL(+, 1, 20b470c67c0d8, +, 0), HEX_DBL(-, 1, 633a8bf44, -, 3),
4182 HEX_DBL(+, 1, 0648bca9c96bd, -, 40) },
4183 { HEX_DBL(+, 1, 1e2ef3b3fb874, +, 0), HEX_DBL(-, 1, 494f863b9, -, 3),
4184 HEX_DBL(+, 1, 066fceb89b0eb, -, 42) },
4185 { HEX_DBL(+, 1, 1bb4a4046ed29, +, 0), HEX_DBL(-, 1, 2f9e32d5c, -, 3),
4186 HEX_DBL(+, 1, 17b8b6c4f846b, -, 46) },
4187 { HEX_DBL(+, 1, 19453808ca29c, +, 0), HEX_DBL(-, 1, 162593187, -, 3),
4188 HEX_DBL(+, 1, 2c83506452154, -, 42) },
4189 { HEX_DBL(+, 1, 16e0689427378, +, 0), HEX_DBL(-, 1, f9c95dc1e, -, 4),
4190 HEX_DBL(+, 1, dd5d2183150f3, -, 41) },
4191 { HEX_DBL(+, 1, 1485f0e0acd3b, +, 0), HEX_DBL(-, 1, c7b528b72, -, 4),
4192 HEX_DBL(+, 1, 0e43c4f4e619d, -, 40) },
4193 { HEX_DBL(+, 1, 12358e75d3033, +, 0), HEX_DBL(-, 1, 960caf9ac, -, 4),
4194 HEX_DBL(+, 1, 20fbfd5902a1e, -, 42) },
4195 { HEX_DBL(+, 1, 0fef010fef01, +, 0), HEX_DBL(-, 1, 64ce26c08, -, 4),
4196 HEX_DBL(+, 1, 8ebeefb4ac467, -, 40) },
4197 { HEX_DBL(+, 1, 0db20a88f4695, +, 0), HEX_DBL(-, 1, 33f7cde16, -, 4),
4198 HEX_DBL(+, 1, 30b3312da7a7d, -, 40) },
4199 { HEX_DBL(+, 1, 0b7e6ec259dc7, +, 0), HEX_DBL(-, 1, 0387efbcc, -, 4),
4200 HEX_DBL(+, 1, 796f1632949c3, -, 40) },
4201 { HEX_DBL(+, 1, 0953f39010953, +, 0), HEX_DBL(-, 1, a6f9c378, -, 5),
4202 HEX_DBL(+, 1, 1687e151172cc, -, 40) },
4203 { HEX_DBL(+, 1, 073260a47f7c6, +, 0), HEX_DBL(-, 1, 47aa07358, -, 5),
4204 HEX_DBL(+, 1, 1f87e4a9cc778, -, 42) },
4205 { HEX_DBL(+, 1, 05197f7d73404, +, 0), HEX_DBL(-, 1, d23afc498, -, 6),
4206 HEX_DBL(+, 1, b183a6b628487, -, 40) },
4207 { HEX_DBL(+, 1, 03091b51f5e1a, +, 0), HEX_DBL(-, 1, 16a21e21, -, 6),
4208 HEX_DBL(+, 1, 7d75c58973ce5, -, 40) },
4209 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4210 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4211 { HEX_DBL(+, 1, f44659e4a4271, -, 1), HEX_DBL(+, 1, 11cd1d51, -, 5),
4212 HEX_DBL(+, 1, 9a0d857e2f4b2, -, 40) },
4213 { HEX_DBL(+, 1, ecc07b301ecc, -, 1), HEX_DBL(+, 1, c4dfab908, -, 5),
4214 HEX_DBL(+, 1, 55b53fce557fd, -, 40) },
4215 { HEX_DBL(+, 1, e573ac901e573, -, 1), HEX_DBL(+, 1, 3aa2fdd26, -, 4),
4216 HEX_DBL(+, 1, f1cb0c9532089, -, 40) },
4217 { HEX_DBL(+, 1, de5d6e3f8868a, -, 1), HEX_DBL(+, 1, 918a16e46, -, 4),
4218 HEX_DBL(+, 1, 9af0dcd65a6e1, -, 43) },
4219 { HEX_DBL(+, 1, d77b654b82c33, -, 1), HEX_DBL(+, 1, e72ec117e, -, 4),
4220 HEX_DBL(+, 1, a5b93c4ebe124, -, 40) },
4221 { HEX_DBL(+, 1, d0cb58f6ec074, -, 1), HEX_DBL(+, 1, 1dcd19755, -, 3),
4222 HEX_DBL(+, 1, 5be50e71ddc6c, -, 42) },
4223 { HEX_DBL(+, 1, ca4b3055ee191, -, 1), HEX_DBL(+, 1, 476a9f983, -, 3),
4224 HEX_DBL(+, 1, ee9a798719e7f, -, 40) },
4225 { HEX_DBL(+, 1, c3f8f01c3f8f, -, 1), HEX_DBL(+, 1, 70742d4ef, -, 3),
4226 HEX_DBL(+, 1, 3ff1352c1219c, -, 46) },
4227 { HEX_DBL(+, 1, bdd2b899406f7, -, 1), HEX_DBL(+, 1, 98edd077e, -, 3),
4228 HEX_DBL(+, 1, c383cd11362f4, -, 41) },
4229 { HEX_DBL(+, 1, b7d6c3dda338b, -, 1), HEX_DBL(+, 1, c0db6cdd9, -, 3),
4230 HEX_DBL(+, 1, 37bd85b1a824e, -, 41) },
4231 { HEX_DBL(+, 1, b2036406c80d9, -, 1), HEX_DBL(+, 1, e840be74e, -, 3),
4232 HEX_DBL(+, 1, a9334d525e1ec, -, 41) },
4233 { HEX_DBL(+, 1, ac5701ac5701a, -, 1), HEX_DBL(+, 1, 0790adbb, -, 2),
4234 HEX_DBL(+, 1, 8060bfb6a491, -, 41) },
4235 { HEX_DBL(+, 1, a6d01a6d01a6d, -, 1), HEX_DBL(+, 1, 1ac05b2918, -, 2),
4236 HEX_DBL(+, 1, c1c161471580a, -, 40) },
4237 { HEX_DBL(+, 1, a16d3f97a4b01, -, 1), HEX_DBL(+, 1, 2db10fc4d8, -, 2),
4238 HEX_DBL(+, 1, ab1aa62214581, -, 42) },
4239 { HEX_DBL(+, 1, 9c2d14ee4a101, -, 1), HEX_DBL(+, 1, 406463b1b, -, 2),
4240 HEX_DBL(+, 1, 12e95dbda6611, -, 44) },
4241 { HEX_DBL(+, 1, 970e4f80cb872, -, 1), HEX_DBL(+, 1, 52dbdfc4c8, -, 2),
4242 HEX_DBL(+, 1, 6b53fee511af, -, 42) },
4243 { HEX_DBL(+, 1, 920fb49d0e228, -, 1), HEX_DBL(+, 1, 6518fe467, -, 2),
4244 HEX_DBL(+, 1, eea7d7d7d1764, -, 40) },
4245 { HEX_DBL(+, 1, 8d3018d3018d3, -, 1), HEX_DBL(+, 1, 771d2ba7e8, -, 2),
4246 HEX_DBL(+, 1, ecefa8d4fab97, -, 40) },
4247 { HEX_DBL(+, 1, 886e5f0abb049, -, 1), HEX_DBL(+, 1, 88e9c72e08, -, 2),
4248 HEX_DBL(+, 1, 913ea3d33fd14, -, 41) },
4249 { HEX_DBL(+, 1, 83c977ab2bedd, -, 1), HEX_DBL(+, 1, 9a802391e, -, 2),
4250 HEX_DBL(+, 1, 197e845877c94, -, 41) },
4251 { HEX_DBL(+, 1, 7f405fd017f4, -, 1), HEX_DBL(+, 1, abe18797f, -, 2),
4252 HEX_DBL(+, 1, f4a52f8e8a81, -, 42) },
4253 { HEX_DBL(+, 1, 7ad2208e0ecc3, -, 1), HEX_DBL(+, 1, bd0f2e9e78, -, 2),
4254 HEX_DBL(+, 1, 031f4336644cc, -, 42) },
4255 { HEX_DBL(+, 1, 767dce434a9b1, -, 1), HEX_DBL(+, 1, ce0a4923a, -, 2),
4256 HEX_DBL(+, 1, 61f33c897020c, -, 40) },
4257 { HEX_DBL(+, 1, 724287f46debc, -, 1), HEX_DBL(+, 1, ded3fd442, -, 2),
4258 HEX_DBL(+, 1, b2632e830632, -, 41) },
4259 { HEX_DBL(+, 1, 6e1f76b4337c6, -, 1), HEX_DBL(+, 1, ef6d673288, -, 2),
4260 HEX_DBL(+, 1, 888ec245a0bf, -, 40) },
4261 { HEX_DBL(+, 1, 6a13cd153729, -, 1), HEX_DBL(+, 1, ffd799a838, -, 2),
4262 HEX_DBL(+, 1, fe6f3b2f5fc8e, -, 40) },
4263 { HEX_DBL(+, 1, 661ec6a5122f9, -, 1), HEX_DBL(+, 1, 0809cf27f4, -, 1),
4264 HEX_DBL(+, 1, 81eaa9ef284dd, -, 40) },
4265 { HEX_DBL(+, 1, 623fa7701623f, -, 1), HEX_DBL(+, 1, 10113b153c, -, 1),
4266 HEX_DBL(+, 1, 1d7b07d6b1143, -, 42) },
4267 { HEX_DBL(+, 1, 5e75bb8d015e7, -, 1), HEX_DBL(+, 1, 18028cf728, -, 1),
4268 HEX_DBL(+, 1, 76b100b1f6c6, -, 41) },
4269 { HEX_DBL(+, 1, 5ac056b015ac, -, 1), HEX_DBL(+, 1, 1fde3d30e8, -, 1),
4270 HEX_DBL(+, 1, 26faeb9870945, -, 45) },
4271 { HEX_DBL(+, 1, 571ed3c506b39, -, 1), HEX_DBL(+, 1, 27a4c0585c, -, 1),
4272 HEX_DBL(+, 1, 7f2c5344d762b, -, 42) }
4273 };
4274
4275 static double __loglTable2[64][3] = {
4276 { HEX_DBL(+, 1, 01fbe7f0a1be6, +, 0), HEX_DBL(-, 1, 6cf6ddd26112a, -, 7),
4277 HEX_DBL(+, 1, 0725e5755e314, -, 60) },
4278 { HEX_DBL(+, 1, 01eba93a97b12, +, 0), HEX_DBL(-, 1, 6155b1d99f603, -, 7),
4279 HEX_DBL(+, 1, 4bcea073117f4, -, 60) },
4280 { HEX_DBL(+, 1, 01db6c9029cd1, +, 0), HEX_DBL(-, 1, 55b54153137ff, -, 7),
4281 HEX_DBL(+, 1, 21e8faccad0ec, -, 61) },
4282 { HEX_DBL(+, 1, 01cb31f0f534c, +, 0), HEX_DBL(-, 1, 4a158c27245bd, -, 7),
4283 HEX_DBL(+, 1, 1a5b7bfbf35d3, -, 60) },
4284 { HEX_DBL(+, 1, 01baf95c9723c, +, 0), HEX_DBL(-, 1, 3e76923e3d678, -, 7),
4285 HEX_DBL(+, 1, eee400eb5fe34, -, 62) },
4286 { HEX_DBL(+, 1, 01aac2d2acee6, +, 0), HEX_DBL(-, 1, 32d85380ce776, -, 7),
4287 HEX_DBL(+, 1, cbf7a513937bd, -, 61) },
4288 { HEX_DBL(+, 1, 019a8e52d401e, +, 0), HEX_DBL(-, 1, 273acfd74be72, -, 7),
4289 HEX_DBL(+, 1, 5c64599efa5e6, -, 60) },
4290 { HEX_DBL(+, 1, 018a5bdca9e42, +, 0), HEX_DBL(-, 1, 1b9e072a2e65, -, 7),
4291 HEX_DBL(+, 1, 364180e0a5d37, -, 60) },
4292 { HEX_DBL(+, 1, 017a2b6fcc33e, +, 0), HEX_DBL(-, 1, 1001f961f3243, -, 7),
4293 HEX_DBL(+, 1, 63d795746f216, -, 60) },
4294 { HEX_DBL(+, 1, 0169fd0bd8a8a, +, 0), HEX_DBL(-, 1, 0466a6671bca4, -, 7),
4295 HEX_DBL(+, 1, 4c99ff1907435, -, 60) },
4296 { HEX_DBL(+, 1, 0159d0b06d129, +, 0), HEX_DBL(-, 1, f1981c445cd05, -, 8),
4297 HEX_DBL(+, 1, 4bfff6366b723, -, 62) },
4298 { HEX_DBL(+, 1, 0149a65d275a6, +, 0), HEX_DBL(-, 1, da6460f76ab8c, -, 8),
4299 HEX_DBL(+, 1, 9c5404f47589c, -, 61) },
4300 { HEX_DBL(+, 1, 01397e11a581b, +, 0), HEX_DBL(-, 1, c3321ab87f4ef, -, 8),
4301 HEX_DBL(+, 1, c0da537429cea, -, 61) },
4302 { HEX_DBL(+, 1, 012957cd85a28, +, 0), HEX_DBL(-, 1, ac014958c112c, -, 8),
4303 HEX_DBL(+, 1, 000c2a1b595e3, -, 64) },
4304 { HEX_DBL(+, 1, 0119339065ef7, +, 0), HEX_DBL(-, 1, 94d1eca95f67a, -, 8),
4305 HEX_DBL(+, 1, d8d20b0564d5, -, 61) },
4306 { HEX_DBL(+, 1, 01091159e4b3d, +, 0), HEX_DBL(-, 1, 7da4047b92b3e, -, 8),
4307 HEX_DBL(+, 1, 6194a5d68cf2, -, 66) },
4308 { HEX_DBL(+, 1, 00f8f129a0535, +, 0), HEX_DBL(-, 1, 667790a09bf77, -, 8),
4309 HEX_DBL(+, 1, ca230e0bea645, -, 61) },
4310 { HEX_DBL(+, 1, 00e8d2ff374a1, +, 0), HEX_DBL(-, 1, 4f4c90e9c4ead, -, 8),
4311 HEX_DBL(+, 1, 1de3e7f350c1, -, 61) },
4312 { HEX_DBL(+, 1, 00d8b6da482ce, +, 0), HEX_DBL(-, 1, 3823052860649, -, 8),
4313 HEX_DBL(+, 1, 5789b4c5891b8, -, 64) },
4314 { HEX_DBL(+, 1, 00c89cba71a8c, +, 0), HEX_DBL(-, 1, 20faed2dc9a9e, -, 8),
4315 HEX_DBL(+, 1, 9e7c40f9839fd, -, 62) },
4316 { HEX_DBL(+, 1, 00b8849f52834, +, 0), HEX_DBL(-, 1, 09d448cb65014, -, 8),
4317 HEX_DBL(+, 1, 387e3e9b6d02, -, 62) },
4318 { HEX_DBL(+, 1, 00a86e88899a4, +, 0), HEX_DBL(-, 1, e55e2fa53ebf1, -, 9),
4319 HEX_DBL(+, 1, cdaa71fddfddf, -, 62) },
4320 { HEX_DBL(+, 1, 00985a75b5e3f, +, 0), HEX_DBL(-, 1, b716b429dce0f, -, 9),
4321 HEX_DBL(+, 1, 2f2af081367bf, -, 63) },
4322 { HEX_DBL(+, 1, 00884866766ee, +, 0), HEX_DBL(-, 1, 88d21ec7a16d7, -, 9),
4323 HEX_DBL(+, 1, fb95c228d6f16, -, 62) },
4324 { HEX_DBL(+, 1, 0078385a6a61d, +, 0), HEX_DBL(-, 1, 5a906f219a9e8, -, 9),
4325 HEX_DBL(+, 1, 18aff10a89f29, -, 64) },
4326 { HEX_DBL(+, 1, 00682a5130fbe, +, 0), HEX_DBL(-, 1, 2c51a4dae87f1, -, 9),
4327 HEX_DBL(+, 1, bcc7e33ddde3, -, 63) },
4328 { HEX_DBL(+, 1, 00581e4a69944, +, 0), HEX_DBL(-, 1, fc2b7f2d782b1, -, 10),
4329 HEX_DBL(+, 1, fe3ef3300a9fa, -, 64) },
4330 { HEX_DBL(+, 1, 00481445b39a8, +, 0), HEX_DBL(-, 1, 9fb97df0b0b83, -, 10),
4331 HEX_DBL(+, 1, 0d9a601f2f324, -, 65) },
4332 { HEX_DBL(+, 1, 00380c42ae963, +, 0), HEX_DBL(-, 1, 434d4546227ae, -, 10),
4333 HEX_DBL(+, 1, 0b9b6a5868f33, -, 63) },
4334 { HEX_DBL(+, 1, 00280640fa271, +, 0), HEX_DBL(-, 1, cdcda8e930c19, -, 11),
4335 HEX_DBL(+, 1, 3d424ab39f789, -, 64) },
4336 { HEX_DBL(+, 1, 0018024036051, +, 0), HEX_DBL(-, 1, 150c558601261, -, 11),
4337 HEX_DBL(+, 1, 285bb90327a0f, -, 64) },
4338 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4339 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4340 { HEX_DBL(+, 1, ffa011fca0a1e, -, 1), HEX_DBL(+, 1, 14e5640c4197b, -, 10),
4341 HEX_DBL(+, 1, 95728136ae401, -, 63) },
4342 { HEX_DBL(+, 1, ff6031f064e07, -, 1), HEX_DBL(+, 1, cd61806bf532d, -, 10),
4343 HEX_DBL(+, 1, 568a4f35d8538, -, 63) },
4344 { HEX_DBL(+, 1, ff2061d532b9c, -, 1), HEX_DBL(+, 1, 42e34af550eda, -, 9),
4345 HEX_DBL(+, 1, 8f69cee55fec, -, 62) },
4346 { HEX_DBL(+, 1, fee0a1a513253, -, 1), HEX_DBL(+, 1, 9f0a5523902ea, -, 9),
4347 HEX_DBL(+, 1, daec734b11615, -, 63) },
4348 { HEX_DBL(+, 1, fea0f15a12139, -, 1), HEX_DBL(+, 1, fb25e19f11b26, -, 9),
4349 HEX_DBL(+, 1, 8bafca62941da, -, 62) },
4350 { HEX_DBL(+, 1, fe6150ee3e6d4, -, 1), HEX_DBL(+, 1, 2b9af9a28e282, -, 8),
4351 HEX_DBL(+, 1, 0fd3674e1dc5b, -, 61) },
4352 { HEX_DBL(+, 1, fe21c05baa109, -, 1), HEX_DBL(+, 1, 599d4678f24b9, -, 8),
4353 HEX_DBL(+, 1, dafce1f09937b, -, 61) },
4354 { HEX_DBL(+, 1, fde23f9c69cf9, -, 1), HEX_DBL(+, 1, 8799d8c046eb, -, 8),
4355 HEX_DBL(+, 1, ffa0ce0bdd217, -, 65) },
4356 { HEX_DBL(+, 1, fda2ceaa956e8, -, 1), HEX_DBL(+, 1, b590b1e5951ee, -, 8),
4357 HEX_DBL(+, 1, 645a769232446, -, 62) },
4358 { HEX_DBL(+, 1, fd636d8047a1f, -, 1), HEX_DBL(+, 1, e381d3555dbcf, -, 8),
4359 HEX_DBL(+, 1, 882320d368331, -, 61) },
4360 { HEX_DBL(+, 1, fd241c179e0cc, -, 1), HEX_DBL(+, 1, 08b69f3dccde, -, 7),
4361 HEX_DBL(+, 1, 01ad5065aba9e, -, 61) },
4362 { HEX_DBL(+, 1, fce4da6ab93e8, -, 1), HEX_DBL(+, 1, 1fa97a61dd298, -, 7),
4363 HEX_DBL(+, 1, 84cd1f931ae34, -, 60) },
4364 { HEX_DBL(+, 1, fca5a873bcb19, -, 1), HEX_DBL(+, 1, 36997bcc54a3f, -, 7),
4365 HEX_DBL(+, 1, 1485e97eaee03, -, 60) },
4366 { HEX_DBL(+, 1, fc66862ccec93, -, 1), HEX_DBL(+, 1, 4d86a43264a4f, -, 7),
4367 HEX_DBL(+, 1, c75e63370988b, -, 61) },
4368 { HEX_DBL(+, 1, fc27739018cfe, -, 1), HEX_DBL(+, 1, 6470f448fb09d, -, 7),
4369 HEX_DBL(+, 1, d7361eeaed0a1, -, 65) },
4370 { HEX_DBL(+, 1, fbe87097c6f5a, -, 1), HEX_DBL(+, 1, 7b586cc4c2523, -, 7),
4371 HEX_DBL(+, 1, b3df952cc473c, -, 61) },
4372 { HEX_DBL(+, 1, fba97d3e084dd, -, 1), HEX_DBL(+, 1, 923d0e5a21e06, -, 7),
4373 HEX_DBL(+, 1, cf56c7b64ae5d, -, 62) },
4374 { HEX_DBL(+, 1, fb6a997d0ecdc, -, 1), HEX_DBL(+, 1, a91ed9bd3df9a, -, 7),
4375 HEX_DBL(+, 1, b957bdcd89e43, -, 61) },
4376 { HEX_DBL(+, 1, fb2bc54f0f4ab, -, 1), HEX_DBL(+, 1, bffdcfa1f7fbb, -, 7),
4377 HEX_DBL(+, 1, ea8cad9a21771, -, 62) },
4378 { HEX_DBL(+, 1, faed00ae41783, -, 1), HEX_DBL(+, 1, d6d9f0bbee6f6, -, 7),
4379 HEX_DBL(+, 1, 5762a9af89c82, -, 60) },
4380 { HEX_DBL(+, 1, faae4b94dfe64, -, 1), HEX_DBL(+, 1, edb33dbe7d335, -, 7),
4381 HEX_DBL(+, 1, 21e24fc245697, -, 62) },
4382 { HEX_DBL(+, 1, fa6fa5fd27ff8, -, 1), HEX_DBL(+, 1, 0244dbae5ed05, -, 6),
4383 HEX_DBL(+, 1, 12ef51b967102, -, 60) },
4384 { HEX_DBL(+, 1, fa310fe15a078, -, 1), HEX_DBL(+, 1, 0daeaf24c3529, -, 6),
4385 HEX_DBL(+, 1, 10d3cfca60b45, -, 59) },
4386 { HEX_DBL(+, 1, f9f2893bb9192, -, 1), HEX_DBL(+, 1, 1917199bb66bc, -, 6),
4387 HEX_DBL(+, 1, 6cf6034c32e19, -, 60) },
4388 { HEX_DBL(+, 1, f9b412068b247, -, 1), HEX_DBL(+, 1, 247e1b6c615d5, -, 6),
4389 HEX_DBL(+, 1, 42f0fffa229f7, -, 61) },
4390 { HEX_DBL(+, 1, f975aa3c18ed6, -, 1), HEX_DBL(+, 1, 2fe3b4efcc5ad, -, 6),
4391 HEX_DBL(+, 1, 70106136a8919, -, 60) },
4392 { HEX_DBL(+, 1, f93751d6ae09b, -, 1), HEX_DBL(+, 1, 3b47e67edea93, -, 6),
4393 HEX_DBL(+, 1, 38dd5a4f6959a, -, 59) },
4394 { HEX_DBL(+, 1, f8f908d098df6, -, 1), HEX_DBL(+, 1, 46aab0725ea6c, -, 6),
4395 HEX_DBL(+, 1, 821fc1e799e01, -, 60) },
4396 { HEX_DBL(+, 1, f8bacf242aa2c, -, 1), HEX_DBL(+, 1, 520c1322f1e4e, -, 6),
4397 HEX_DBL(+, 1, 129dcda3ad563, -, 60) },
4398 { HEX_DBL(+, 1, f87ca4cbb755, -, 1), HEX_DBL(+, 1, 5d6c0ee91d2ab, -, 6),
4399 HEX_DBL(+, 1, c5b190c04606e, -, 62) },
4400 { HEX_DBL(+, 1, f83e89c195c25, -, 1), HEX_DBL(+, 1, 68caa41d448c3, -, 6),
4401 HEX_DBL(+, 1, 4723441195ac9, -, 59) }
4402 };
4403
4404 static double __loglTable3[8][3] = {
4405 { HEX_DBL(+, 1, 000e00c40ab89, +, 0), HEX_DBL(-, 1, 4332be0032168, -, 12),
4406 HEX_DBL(+, 1, a1003588d217a, -, 65) },
4407 { HEX_DBL(+, 1, 000a006403e82, +, 0), HEX_DBL(-, 1, cdb2987366fcc, -, 13),
4408 HEX_DBL(+, 1, 5c86001294bbc, -, 67) },
4409 { HEX_DBL(+, 1, 0006002400d8, +, 0), HEX_DBL(-, 1, 150297c90fa6f, -, 13),
4410 HEX_DBL(+, 1, 01fb4865fae32, -, 66) },
4411 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4412 { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4413 { HEX_DBL(+, 1, ffe8011ff280a, -, 1), HEX_DBL(+, 1, 14f8daf5e3d3b, -, 12),
4414 HEX_DBL(+, 1, 3c933b4b6b914, -, 68) },
4415 { HEX_DBL(+, 1, ffd8031fc184e, -, 1), HEX_DBL(+, 1, cd978c38042bb, -, 12),
4416 HEX_DBL(+, 1, 10f8e642e66fd, -, 65) },
4417 { HEX_DBL(+, 1, ffc8061f5492b, -, 1), HEX_DBL(+, 1, 43183c878274e, -, 11),
4418 HEX_DBL(+, 1, 5885dd1eb6582, -, 65) }
4419 };
4420
__log2_ep(double * hi,double * lo,double x)4421 static void __log2_ep(double *hi, double *lo, double x)
4422 {
4423 union {
4424 uint64_t i;
4425 double d;
4426 } uu;
4427
4428 int m;
4429 double f = reference_frexp(x, &m);
4430
4431 // bring f in [0.75, 1.5)
4432 if (f < 0.75)
4433 {
4434 f *= 2.0;
4435 m -= 1;
4436 }
4437
4438 // index first table .... brings down to [1-2^-7, 1+2^6)
4439 uu.d = f;
4440 int index =
4441 (int)(((uu.i + ((uint64_t)1 << 51)) & 0x000fc00000000000ULL) >> 46);
4442 double r1 = __loglTable1[index][0];
4443 double logr1hi = __loglTable1[index][1];
4444 double logr1lo = __loglTable1[index][2];
4445 // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since
4446 // |m| <= 1023 which needs 10bits at max, we can directly add m to log1hi
4447 // without spilling
4448 logr1hi += m;
4449
4450 // argument reduction needs to be in double-double since reduced argument
4451 // will form the leading term of polynomial approximation which sets the
4452 // precision we eventually achieve
4453 double zhi, zlo;
4454 MulD(&zhi, &zlo, r1, uu.d);
4455
4456 // second index table .... brings down to [1-2^-12, 1+2^-11)
4457 uu.d = zhi;
4458 index = (int)(((uu.i + ((uint64_t)1 << 46)) & 0x00007e0000000000ULL) >> 41);
4459 double r2 = __loglTable2[index][0];
4460 double logr2hi = __loglTable2[index][1];
4461 double logr2lo = __loglTable2[index][2];
4462
4463 // reduce argument
4464 MulDD(&zhi, &zlo, zhi, zlo, r2, 0.0);
4465
4466 // third index table .... brings down to [1-2^-14, 1+2^-13)
4467 // Actually reduction to 2^-11 would have been sufficient to calculate
4468 // second order term in polynomial in double rather than double-double, I
4469 // reduced it a bit more to make sure other systematic arithmetic errors
4470 // are guarded against .... also this allow lower order product of leading
4471 // polynomial term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather
4472 // than double-double ... hence only term that needs to be done in
4473 // double-double is Ao_hi*z_hi
4474 uu.d = zhi;
4475 index = (int)(((uu.i + ((uint64_t)1 << 41)) & 0x0000038000000000ULL) >> 39);
4476 double r3 = __loglTable3[index][0];
4477 double logr3hi = __loglTable3[index][1];
4478 double logr3lo = __loglTable3[index][2];
4479
4480 // log2(x) = m + log2(r1) + log2(r1) + log2(1 + (zh + zlo))
4481 // calculate sum of first three terms ... note that m has already
4482 // been added to log2(r1)_hi
4483 double log2hi, log2lo;
4484 AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo);
4485 AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo);
4486
4487 // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after
4488 // this
4489 MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0);
4490 // we dont need to do full double-double substract here. substracting 1.0
4491 // for higher term is exact
4492 zhi = zhi - 1.0;
4493 // normalize
4494 AddD(&zhi, &zlo, zhi, zlo);
4495
4496 // polynomail fitting to compute log2(1 + z) ... forth order polynomial fit
4497 // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in
4498 // [-2^-14, 2^-13] log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4
4499 // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5
4500 // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 +
4501 // A4*z^3) since we are looking for at least 64 digits of precision and z in
4502 // [-2^-14, 2^-13], final term can be done in double .... also Aolo*zhi +
4503 // Aohi*zlo can be done in double .... Aohi*zhi needs to be done in
4504 // double-double
4505
4506 double Aohi = HEX_DBL(+, 1, 71547652b82fe, +, 0);
4507 double Aolo = HEX_DBL(+, 1, 777c9cbb675c, -, 56);
4508 double y;
4509 y = HEX_DBL(+, 1, 276d2736fade7, -, 2);
4510 y = HEX_DBL(-, 1, 7154765782df1, -, 2) + y * zhi;
4511 y = HEX_DBL(+, 1, ec709dc3a0f67, -, 2) + y * zhi;
4512 y = HEX_DBL(-, 1, 71547652b82fe, -, 1) + y * zhi;
4513 double zhisq = zhi * zhi;
4514 y = y * zhisq;
4515 y = y + zhi * Aolo;
4516 y = y + zlo * Aohi;
4517
4518 MulD(&zhi, &zlo, Aohi, zhi);
4519 AddDD(&zhi, &zlo, zhi, zlo, y, 0.0);
4520 AddDD(&zhi, &zlo, zhi, zlo, log2hi, log2lo);
4521
4522 *hi = zhi;
4523 *lo = zlo;
4524 }
4525
reference_powl(long double x,long double y)4526 long double reference_powl(long double x, long double y)
4527 {
4528 // this will be used for testing doubles i.e. arguments will
4529 // be doubles so cast the input back to double ... returned
4530 // result will be long double though .... > 53 bits of precision
4531 // if platform allows.
4532 // ===========
4533 // New finding.
4534 // ===========
4535 // this function is getting used for computing reference cube root (cbrt)
4536 // as follows __powl( x, 1.0L/3.0L ) so if the y are assumed to
4537 // be double and is converted from long double to double, truncation
4538 // causes errors. So we need to tread y as long double and convert it
4539 // to hi, lo doubles when performing y*log2(x).
4540
4541 static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
4542
4543 // if x = 1, return x for any y, even NaN
4544 if (x == 1.0) return x;
4545
4546 // if y == 0, return 1 for any x, even NaN
4547 if (y == 0.0) return 1.0L;
4548
4549 // get NaNs out of the way
4550 if (x != x || y != y) return x + y;
4551
4552 // do the work required to sort out edge cases
4553 double fabsy = (double)reference_fabsl(y);
4554 double fabsx = (double)reference_fabsl(x);
4555 double iy = reference_rint(
4556 fabsy); // we do round to nearest here so that |fy| <= 0.5
4557 if (iy > fabsy) // convert nearbyint to floor
4558 iy -= 1.0;
4559 int isOddInt = 0;
4560 if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
4561 isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
4562
4563 /// test a few more edge cases
4564 // deal with x == 0 cases
4565 if (x == 0.0)
4566 {
4567 if (!isOddInt) x = 0.0;
4568
4569 if (y < 0) x = 1.0 / x;
4570
4571 return x;
4572 }
4573
4574 // x == +-Inf cases
4575 if (isinf(fabsx))
4576 {
4577 if (x < 0)
4578 {
4579 if (isOddInt)
4580 {
4581 if (y < 0)
4582 return -0.0;
4583 else
4584 return -INFINITY;
4585 }
4586 else
4587 {
4588 if (y < 0)
4589 return 0.0;
4590 else
4591 return INFINITY;
4592 }
4593 }
4594
4595 if (y < 0) return 0;
4596 return INFINITY;
4597 }
4598
4599 // y = +-inf cases
4600 if (isinf(fabsy))
4601 {
4602 if (x == -1) return 1;
4603
4604 if (y < 0)
4605 {
4606 if (fabsx < 1) return INFINITY;
4607 return 0;
4608 }
4609 if (fabsx < 1) return 0;
4610 return INFINITY;
4611 }
4612
4613 // x < 0 and y non integer case
4614 if (x < 0 && iy != fabsy)
4615 {
4616 // return nan;
4617 return cl_make_nan();
4618 }
4619
4620 // speedy resolution of sqrt and reciprocal sqrt
4621 if (fabsy == 0.5)
4622 {
4623 long double xl = sqrtl(x);
4624 if (y < 0) xl = 1.0 / xl;
4625 return xl;
4626 }
4627
4628 double log2x_hi, log2x_lo;
4629
4630 // extended precision log .... accurate to at least 64-bits + couple of
4631 // guard bits
4632 __log2_ep(&log2x_hi, &log2x_lo, fabsx);
4633
4634 double ylog2x_hi, ylog2x_lo;
4635
4636 double y_hi = (double)y;
4637 double y_lo = (double)(y - (long double)y_hi);
4638
4639 // compute product of y*log2(x)
4640 // scale to avoid overflow in double-double multiplication
4641 if (fabsy > HEX_DBL(+, 1, 0, +, 970))
4642 {
4643 y_hi = reference_ldexp(y_hi, -53);
4644 y_lo = reference_ldexp(y_lo, -53);
4645 }
4646 MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
4647 if (fabsy > HEX_DBL(+, 1, 0, +, 970))
4648 {
4649 ylog2x_hi = reference_ldexp(ylog2x_hi, 53);
4650 ylog2x_lo = reference_ldexp(ylog2x_lo, 53);
4651 }
4652
4653 long double powxy;
4654 if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
4655 {
4656 powxy =
4657 reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
4658 }
4659 else
4660 {
4661 // separate integer + fractional part
4662 long int m = lrint(ylog2x_hi);
4663 AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
4664
4665 // revert to long double arithemtic
4666 long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
4667 long double tmp = reference_exp2l(ylog2x);
4668 powxy = reference_scalblnl(tmp, m);
4669 }
4670
4671 // if y is odd integer and x is negative, reverse sign
4672 if (isOddInt & reference_signbit(x)) powxy = -powxy;
4673 return powxy;
4674 }
4675
reference_nextafter(double xx,double yy)4676 double reference_nextafter(double xx, double yy)
4677 {
4678 float x = (float)xx;
4679 float y = (float)yy;
4680
4681 // take care of nans
4682 if (x != x) return x;
4683
4684 if (y != y) return y;
4685
4686 if (x == y) return y;
4687
4688 int32f_t a, b;
4689
4690 a.f = x;
4691 b.f = y;
4692
4693 if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
4694 if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
4695
4696 a.i += (a.i < b.i) ? 1 : -1;
4697 a.i = (a.i < 0) ? (cl_int)0x80000000 - a.i : a.i;
4698
4699 return a.f;
4700 }
4701
4702
reference_nextafterl(long double xx,long double yy)4703 long double reference_nextafterl(long double xx, long double yy)
4704 {
4705 double x = (double)xx;
4706 double y = (double)yy;
4707
4708 // take care of nans
4709 if (x != x) return x;
4710
4711 if (y != y) return y;
4712
4713 int64d_t a, b;
4714
4715 a.d = x;
4716 b.d = y;
4717
4718 int64_t tmp = 0x8000000000000000LL;
4719
4720 if (a.l & tmp) a.l = tmp - a.l;
4721 if (b.l & tmp) b.l = tmp - b.l;
4722
4723 // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y =
4724 // 0.0f) test needs to be done using integer rep because subnormals may be
4725 // flushed to zero on some platforms
4726 if (a.l == b.l) return y;
4727
4728 a.l += (a.l < b.l) ? 1 : -1;
4729 a.l = (a.l < 0) ? tmp - a.l : a.l;
4730
4731 return a.d;
4732 }
4733
reference_fdim(double xx,double yy)4734 double reference_fdim(double xx, double yy)
4735 {
4736 float x = (float)xx;
4737 float y = (float)yy;
4738
4739 if (x != x) return x;
4740
4741 if (y != y) return y;
4742
4743 float r = (x > y) ? (float)reference_subtract(x, y) : 0.0f;
4744 return r;
4745 }
4746
4747
reference_fdiml(long double xx,long double yy)4748 long double reference_fdiml(long double xx, long double yy)
4749 {
4750 double x = (double)xx;
4751 double y = (double)yy;
4752
4753 if (x != x) return x;
4754
4755 if (y != y) return y;
4756
4757 double r = (x > y) ? (double)reference_subtractl(x, y) : 0.0;
4758 return r;
4759 }
4760
reference_remquo(double xd,double yd,int * n)4761 double reference_remquo(double xd, double yd, int *n)
4762 {
4763 float xx = (float)xd;
4764 float yy = (float)yd;
4765
4766 if (isnan(xx) || isnan(yy) || fabsf(xx) == INFINITY || yy == 0.0)
4767 {
4768 *n = 0;
4769 return cl_make_nan();
4770 }
4771
4772 if (fabsf(yy) == INFINITY || xx == 0.0f)
4773 {
4774 *n = 0;
4775 return xd;
4776 }
4777
4778 if (fabsf(xx) == fabsf(yy))
4779 {
4780 *n = (xx == yy) ? 1 : -1;
4781 return reference_signbit(xx) ? -0.0 : 0.0;
4782 }
4783
4784 int signx = reference_signbit(xx) ? -1 : 1;
4785 int signy = reference_signbit(yy) ? -1 : 1;
4786 int signn = (signx == signy) ? 1 : -1;
4787 float x = fabsf(xx);
4788 float y = fabsf(yy);
4789
4790 int ex, ey;
4791 ex = reference_ilogb(x);
4792 ey = reference_ilogb(y);
4793 float xr = x;
4794 float yr = y;
4795 uint32_t q = 0;
4796
4797 if (ex - ey >= -1)
4798 {
4799
4800 yr = (float)reference_ldexp(y, -ey);
4801 xr = (float)reference_ldexp(x, -ex);
4802
4803 if (ex - ey >= 0)
4804 {
4805 int i;
4806 for (i = ex - ey; i > 0; i--)
4807 {
4808 q <<= 1;
4809 if (xr >= yr)
4810 {
4811 xr -= yr;
4812 q += 1;
4813 }
4814 xr += xr;
4815 }
4816 q <<= 1;
4817 if (xr > yr)
4818 {
4819 xr -= yr;
4820 q += 1;
4821 }
4822 }
4823 else // ex-ey = -1
4824 xr = reference_ldexp(xr, ex - ey);
4825 }
4826
4827 if ((yr < 2.0f * xr) || ((yr == 2.0f * xr) && (q & 0x00000001)))
4828 {
4829 xr -= yr;
4830 q += 1;
4831 }
4832
4833 if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
4834
4835 int qout = q & 0x0000007f;
4836 if (signn < 0) qout = -qout;
4837 if (xx < 0.0) xr = -xr;
4838
4839 *n = qout;
4840
4841 return xr;
4842 }
4843
reference_remquol(long double xd,long double yd,int * n)4844 long double reference_remquol(long double xd, long double yd, int *n)
4845 {
4846 double xx = (double)xd;
4847 double yy = (double)yd;
4848
4849 if (isnan(xx) || isnan(yy) || fabs(xx) == INFINITY || yy == 0.0)
4850 {
4851 *n = 0;
4852 return cl_make_nan();
4853 }
4854
4855 if (reference_fabs(yy) == INFINITY || xx == 0.0)
4856 {
4857 *n = 0;
4858 return xd;
4859 }
4860
4861 if (reference_fabs(xx) == reference_fabs(yy))
4862 {
4863 *n = (xx == yy) ? 1 : -1;
4864 return reference_signbit(xx) ? -0.0 : 0.0;
4865 }
4866
4867 int signx = reference_signbit(xx) ? -1 : 1;
4868 int signy = reference_signbit(yy) ? -1 : 1;
4869 int signn = (signx == signy) ? 1 : -1;
4870 double x = reference_fabs(xx);
4871 double y = reference_fabs(yy);
4872
4873 int ex, ey;
4874 ex = reference_ilogbl(x);
4875 ey = reference_ilogbl(y);
4876 double xr = x;
4877 double yr = y;
4878 uint32_t q = 0;
4879
4880 if (ex - ey >= -1)
4881 {
4882 yr = reference_ldexp(y, -ey);
4883 xr = reference_ldexp(x, -ex);
4884 int i;
4885
4886 if (ex - ey >= 0)
4887 {
4888 for (i = ex - ey; i > 0; i--)
4889 {
4890 q <<= 1;
4891 if (xr >= yr)
4892 {
4893 xr -= yr;
4894 q += 1;
4895 }
4896 xr += xr;
4897 }
4898 q <<= 1;
4899 if (xr > yr)
4900 {
4901 xr -= yr;
4902 q += 1;
4903 }
4904 }
4905 else
4906 xr = reference_ldexp(xr, ex - ey);
4907 }
4908
4909 if ((yr < 2.0 * xr) || ((yr == 2.0 * xr) && (q & 0x00000001)))
4910 {
4911 xr -= yr;
4912 q += 1;
4913 }
4914
4915 if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
4916
4917 int qout = q & 0x0000007f;
4918 if (signn < 0) qout = -qout;
4919 if (xx < 0.0) xr = -xr;
4920
4921 *n = qout;
4922 return xr;
4923 }
4924
reference_scalbn(double x,int n)4925 static double reference_scalbn(double x, int n)
4926 {
4927 if (reference_isinf(x) || reference_isnan(x) || x == 0.0) return x;
4928
4929 int bias = 1023;
4930 union {
4931 double d;
4932 cl_long l;
4933 } u;
4934 u.d = (double)x;
4935 int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
4936 if (e == 0)
4937 {
4938 u.l |= ((cl_long)1023 << 52);
4939 u.d -= 1.0;
4940 e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
4941 }
4942 e += n;
4943 if (e >= 2047 || n >= 2098) return reference_copysign(INFINITY, x);
4944 if (e < -51 || n < -2097) return reference_copysign(0.0, x);
4945 if (e <= 0)
4946 {
4947 bias += (e - 1);
4948 e = 1;
4949 }
4950 u.l &= 0x800fffffffffffffLL;
4951 u.l |= ((cl_long)e << 52);
4952 x = u.d;
4953 u.l = ((cl_long)bias << 52);
4954 return x * u.d;
4955 }
4956
reference_scalblnl(long double x,long n)4957 static long double reference_scalblnl(long double x, long n)
4958 {
4959 #if defined(__i386__) || defined(__x86_64__) // INTEL
4960 union {
4961 long double d;
4962 struct
4963 {
4964 cl_ulong m;
4965 cl_ushort sexp;
4966 } u;
4967 } u;
4968 u.u.m = CL_LONG_MIN;
4969
4970 if (reference_isinf(x)) return x;
4971
4972 if (x == 0.0L || n < -2200) return reference_copysignl(0.0L, x);
4973
4974 if (n > 2200) return reference_copysignl(INFINITY, x);
4975
4976 if (n < 0)
4977 {
4978 u.u.sexp = 0x3fff - 1022;
4979 while (n <= -1022)
4980 {
4981 x *= u.d;
4982 n += 1022;
4983 }
4984 u.u.sexp = 0x3fff + n;
4985 x *= u.d;
4986 return x;
4987 }
4988
4989 if (n > 0)
4990 {
4991 u.u.sexp = 0x3fff + 1023;
4992 while (n >= 1023)
4993 {
4994 x *= u.d;
4995 n -= 1023;
4996 }
4997 u.u.sexp = 0x3fff + n;
4998 x *= u.d;
4999 return x;
5000 }
5001
5002 return x;
5003
5004 #elif defined(__arm__) // ARM .. sizeof(long double) == sizeof(double)
5005
5006 #if __DBL_MAX_EXP__ >= __LDBL_MAX_EXP__
5007 if (reference_isinfl(x) || reference_isnanl(x)) return x;
5008
5009 int bias = 1023;
5010 union {
5011 double d;
5012 cl_long l;
5013 } u;
5014 u.d = (double)x;
5015 int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
5016 if (e == 0)
5017 {
5018 u.l |= ((cl_long)1023 << 52);
5019 u.d -= 1.0;
5020 e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
5021 }
5022 e += n;
5023 if (e >= 2047) return reference_copysignl(INFINITY, x);
5024 if (e < -51) return reference_copysignl(0.0, x);
5025 if (e <= 0)
5026 {
5027 bias += (e - 1);
5028 e = 1;
5029 }
5030 u.l &= 0x800fffffffffffffLL;
5031 u.l |= ((cl_long)e << 52);
5032 x = u.d;
5033 u.l = ((cl_long)bias << 52);
5034 return x * u.d;
5035 #endif
5036
5037 #else // PPC
5038 return scalblnl(x, n);
5039 #endif
5040 }
5041
reference_relaxed_exp(double x)5042 double reference_relaxed_exp(double x) { return reference_exp(x); }
5043
reference_exp(double x)5044 double reference_exp(double x)
5045 {
5046 return reference_exp2(x * HEX_DBL(+, 1, 71547652b82fe, +, 0));
5047 }
5048
reference_expl(long double x)5049 long double reference_expl(long double x)
5050 {
5051 #if defined(__PPC__)
5052 long double scale, bias;
5053
5054 // The PPC double long version of expl fails to produce denorm results
5055 // and instead generates a 0.0. Compensate for this limitation by
5056 // computing expl as:
5057 // expl(x + 40) * expl(-40)
5058 // Likewise, overflows can prematurely produce an infinity, so we
5059 // compute expl as:
5060 // expl(x - 40) * expl(40)
5061 scale = 1.0L;
5062 bias = 0.0L;
5063 if (x < -708.0L)
5064 {
5065 bias = 40.0;
5066 scale = expl(-40.0L);
5067 }
5068 else if (x > 708.0L)
5069 {
5070 bias = -40.0L;
5071 scale = expl(40.0L);
5072 }
5073 return expl(x + bias) * scale;
5074 #else
5075 return expl(x);
5076 #endif
5077 }
5078
reference_sinh(double x)5079 double reference_sinh(double x) { return sinh(x); }
5080
reference_sinhl(long double x)5081 long double reference_sinhl(long double x) { return sinhl(x); }
5082
reference_fmod(double x,double y)5083 double reference_fmod(double x, double y)
5084 {
5085 if (x == 0.0 && fabs(y) > 0.0) return x;
5086
5087 if (fabs(x) == INFINITY || y == 0) return cl_make_nan();
5088
5089 if (fabs(y) == INFINITY) // we know x is finite from above
5090 return x;
5091 #if defined(_MSC_VER) && defined(_M_X64)
5092 return fmod(x, y);
5093 #else
5094 return fmodf((float)x, (float)y);
5095 #endif
5096 }
5097
reference_fmodl(long double x,long double y)5098 long double reference_fmodl(long double x, long double y)
5099 {
5100 if (x == 0.0L && fabsl(y) > 0.0L) return x;
5101
5102 if (fabsl(x) == INFINITY || y == 0.0L) return cl_make_nan();
5103
5104 if (fabsl(y) == INFINITY) // we know x is finite from above
5105 return x;
5106
5107 return fmod((double)x, (double)y);
5108 }
5109
reference_modf(double x,double * n)5110 double reference_modf(double x, double *n)
5111 {
5112 if (isnan(x))
5113 {
5114 *n = cl_make_nan();
5115 return cl_make_nan();
5116 }
5117 float nr;
5118 float yr = modff((float)x, &nr);
5119 *n = nr;
5120 return yr;
5121 }
5122
reference_modfl(long double x,long double * n)5123 long double reference_modfl(long double x, long double *n)
5124 {
5125 if (isnan(x))
5126 {
5127 *n = cl_make_nan();
5128 return cl_make_nan();
5129 }
5130 double nr;
5131 double yr = modf((double)x, &nr);
5132 *n = nr;
5133 return yr;
5134 }
5135
reference_fractl(long double x,long double * ip)5136 long double reference_fractl(long double x, long double *ip)
5137 {
5138 if (isnan(x))
5139 {
5140 *ip = cl_make_nan();
5141 return cl_make_nan();
5142 }
5143
5144 double i;
5145 double f = modf((double)x, &i);
5146 if (f < 0.0)
5147 {
5148 f = 1.0 + f;
5149 i -= 1.0;
5150 if (f == 1.0) f = HEX_DBL(+, 1, fffffffffffff, -, 1);
5151 }
5152 *ip = i;
5153 return f;
5154 }
5155
reference_fabsl(long double x)5156 long double reference_fabsl(long double x) { return fabsl(x); }
5157
reference_relaxed_log(double x)5158 double reference_relaxed_log(double x)
5159 {
5160 return (float)reference_log((float)x);
5161 }
5162
reference_log(double x)5163 double reference_log(double x)
5164 {
5165 if (x == 0.0) return -INFINITY;
5166
5167 if (x < 0.0) return cl_make_nan();
5168
5169 if (isinf(x)) return INFINITY;
5170
5171 double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
5172 double logxHi, logxLo;
5173 __log2_ep(&logxHi, &logxLo, x);
5174 return logxHi * log2Hi;
5175 }
5176
reference_logl(long double x)5177 long double reference_logl(long double x)
5178 {
5179 if (x == 0.0) return -INFINITY;
5180
5181 if (x < 0.0) return cl_make_nan();
5182
5183 if (isinf(x)) return INFINITY;
5184
5185 double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
5186 double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
5187 double logxHi, logxLo;
5188 __log2_ep(&logxHi, &logxLo, x);
5189
5190 long double lg2 = (long double)log2Hi + (long double)log2Lo;
5191 long double logx = (long double)logxHi + (long double)logxLo;
5192 return logx * lg2;
5193 }
5194
reference_relaxed_pow(double x,double y)5195 double reference_relaxed_pow(double x, double y)
5196 {
5197 return (float)reference_exp2(((float)y) * (float)reference_log2((float)x));
5198 }
5199
reference_pow(double x,double y)5200 double reference_pow(double x, double y)
5201 {
5202 static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
5203
5204 // if x = 1, return x for any y, even NaN
5205 if (x == 1.0) return x;
5206
5207 // if y == 0, return 1 for any x, even NaN
5208 if (y == 0.0) return 1.0;
5209
5210 // get NaNs out of the way
5211 if (x != x || y != y) return x + y;
5212
5213 // do the work required to sort out edge cases
5214 double fabsy = reference_fabs(y);
5215 double fabsx = reference_fabs(x);
5216 double iy = reference_rint(
5217 fabsy); // we do round to nearest here so that |fy| <= 0.5
5218 if (iy > fabsy) // convert nearbyint to floor
5219 iy -= 1.0;
5220 int isOddInt = 0;
5221 if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
5222 isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
5223
5224 /// test a few more edge cases
5225 // deal with x == 0 cases
5226 if (x == 0.0)
5227 {
5228 if (!isOddInt) x = 0.0;
5229
5230 if (y < 0) x = 1.0 / x;
5231
5232 return x;
5233 }
5234
5235 // x == +-Inf cases
5236 if (isinf(fabsx))
5237 {
5238 if (x < 0)
5239 {
5240 if (isOddInt)
5241 {
5242 if (y < 0)
5243 return -0.0;
5244 else
5245 return -INFINITY;
5246 }
5247 else
5248 {
5249 if (y < 0)
5250 return 0.0;
5251 else
5252 return INFINITY;
5253 }
5254 }
5255
5256 if (y < 0) return 0;
5257 return INFINITY;
5258 }
5259
5260 // y = +-inf cases
5261 if (isinf(fabsy))
5262 {
5263 if (x == -1) return 1;
5264
5265 if (y < 0)
5266 {
5267 if (fabsx < 1) return INFINITY;
5268 return 0;
5269 }
5270 if (fabsx < 1) return 0;
5271 return INFINITY;
5272 }
5273
5274 // x < 0 and y non integer case
5275 if (x < 0 && iy != fabsy)
5276 {
5277 // return nan;
5278 return cl_make_nan();
5279 }
5280
5281 // speedy resolution of sqrt and reciprocal sqrt
5282 if (fabsy == 0.5)
5283 {
5284 long double xl = reference_sqrt(x);
5285 if (y < 0) xl = 1.0 / xl;
5286 return xl;
5287 }
5288
5289 double hi, lo;
5290 __log2_ep(&hi, &lo, fabsx);
5291 double prod = y * hi;
5292 double result = reference_exp2(prod);
5293 return isOddInt ? reference_copysignd(result, x) : result;
5294 }
5295
reference_sqrt(double x)5296 double reference_sqrt(double x) { return sqrt(x); }
5297
reference_floor(double x)5298 double reference_floor(double x) { return floorf((float)x); }
5299
reference_ldexp(double value,int exponent)5300 double reference_ldexp(double value, int exponent)
5301 {
5302 #ifdef __MINGW32__
5303 /*
5304 * ====================================================
5305 * This function is from fdlibm: http://www.netlib.org
5306 * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
5307 *
5308 * Developed at SunSoft, a Sun Microsystems, Inc. business.
5309 * Permission to use, copy, modify, and distribute this
5310 * software is freely granted, provided that this notice
5311 * is preserved.
5312 * ====================================================
5313 */
5314 if (!finite(value) || value == 0.0) return value;
5315 return scalbn(value, exponent);
5316 #else
5317 return reference_scalbn(value, exponent);
5318 #endif
5319 }
5320
reference_ldexpl(long double x,int n)5321 long double reference_ldexpl(long double x, int n) { return ldexpl(x, n); }
5322
reference_coshl(long double x)5323 long double reference_coshl(long double x) { return coshl(x); }
5324
reference_ceil(double x)5325 double reference_ceil(double x) { return ceilf((float)x); }
5326
reference_ceill(long double x)5327 long double reference_ceill(long double x)
5328 {
5329 if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
5330
5331 long double absx = reference_fabsl(x);
5332 if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
5333
5334 if (absx < 1.0)
5335 {
5336 if (x < 0.0)
5337 return 0.0;
5338 else
5339 return 1.0;
5340 }
5341
5342 long double r = (long double)((cl_long)x);
5343
5344 if (x > 0.0 && r < x) r += 1.0;
5345
5346 return r;
5347 }
5348
5349
reference_acosl(long double x)5350 long double reference_acosl(long double x)
5351 {
5352 long double x2 = x * x;
5353 int i;
5354
5355 // Prepare a head + tail representation of PI in long double. A good
5356 // compiler should get rid of all of this work.
5357 static const cl_ulong pi_bits[2] = {
5358 0x3243F6A8885A308DULL, 0x313198A2E0370734ULL
5359 }; // first 126 bits of pi
5360 // http://www.super-computing.org/pi-hexa_current.html
5361 long double head, tail;
5362 #if __LDBL_MANT_DIG__ >= 64
5363 // long double has 64-bits of precision or greater
5364 long double temp = (long double)pi_bits[0] * 0x1.0p64L;
5365 head = temp + (long double)pi_bits[1];
5366 temp -= head; // rounding err rounding pi_bits[1] into head
5367 tail = (long double)pi_bits[1] + temp;
5368 head *= HEX_LDBL(+, 1, 0, -, 125);
5369 tail *= HEX_LDBL(+, 1, 0, -, 125);
5370 #else
5371 head = (long double)pi_bits[0];
5372 tail =
5373 (long double)((cl_long)pi_bits[0]
5374 - (cl_long)
5375 head); // residual part of pi_bits[0] after rounding
5376 tail = tail * HEX_LDBL(+, 1, 0, +, 64) + (long double)pi_bits[1];
5377 head *= HEX_LDBL(+, 1, 0, -, 61);
5378 tail *= HEX_LDBL(+, 1, 0, -, 125);
5379 #endif
5380
5381 // oversize values and NaNs go to NaN
5382 if (!(x2 <= 1.0)) return sqrtl(1.0L - x2);
5383
5384 //
5385 // deal with large |x|:
5386 // sqrt( 1 - x**2)
5387 // acos(|x| > sqrt(0.5)) = 2 * atan( z ); z = -------------------- ;
5388 // z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...]
5389 // 1 + x
5390 if (x2 > 0.5)
5391 {
5392 // we handle the x < 0 case as pi - acos(|x|)
5393
5394 long double sign = reference_copysignl(1.0L, x);
5395 long double fabsx = reference_fabsl(x);
5396 head -= head * sign; // x > 0 ? 0 : pi.hi
5397 tail -= tail * sign; // x > 0 ? 0 : pi.low
5398
5399 // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt(
5400 // (1-x)/(1+x) )
5401 long double z2 = (1.0L - fabsx) / (1.0L + fabsx); // z**2
5402 long double z = sign * sqrtl(z2);
5403
5404 // atan(sqrt(q))
5405 // Minimax fit p(x) = ---------------- - 1
5406 // sqrt(q)
5407 //
5408 // Define q = r*r, and solve for atan(r):
5409 //
5410 // atan(r) = (p(r) + 1) * r = rp(r) + r
5411 static long double atan_coeffs[] = {
5412 HEX_LDBL(-, b, 3f52e0c278293b3, -, 67),
5413 HEX_LDBL(-, a, aaaaaaaaaaa95b8, -, 5),
5414 HEX_LDBL(+, c, ccccccccc992407, -, 6),
5415 HEX_LDBL(-, 9, 24924923024398, -, 6),
5416 HEX_LDBL(+, e, 38e38d6f92c98f3, -, 7),
5417 HEX_LDBL(-, b, a2e89bfb8393ec6, -, 7),
5418 HEX_LDBL(+, 9, d89a9f574d412cb, -, 7),
5419 HEX_LDBL(-, 8, 88580517884c547, -, 7),
5420 HEX_LDBL(+, f, 0ab6756abdad408, -, 8),
5421 HEX_LDBL(-, d, 56a5b07a2f15b49, -, 8),
5422 HEX_LDBL(+, b, 72ab587e46d80b2, -, 8),
5423 HEX_LDBL(-, 8, 62ea24bb5b2e636, -, 8),
5424 HEX_LDBL(+, e, d67c16582123937, -, 10)
5425 }; // minimax fit over [ 0x1.0p-52, 0.18] Max error:
5426 // 0x1.67ea5c184e5d9p-64
5427
5428 // Calculate y = p(r)
5429 const size_t atan_coeff_count =
5430 sizeof(atan_coeffs) / sizeof(atan_coeffs[0]);
5431 long double y = atan_coeffs[atan_coeff_count - 1];
5432 for (i = (int)atan_coeff_count - 2; i >= 0; i--)
5433 y = atan_coeffs[i] + y * z2;
5434
5435 z *= 2.0L; // fold in 2.0 for 2.0 * atan(z)
5436 y *= z; // rp(r)
5437
5438 return head + ((y + tail) + z);
5439 }
5440
5441 // do |x| <= sqrt(0.5) here
5442 // acos( sqrt(z) ) -
5443 // PI/2
5444 // Piecewise minimax polynomial fits for p(z) = 1 +
5445 // ------------------------;
5446 // sqrt(z)
5447 //
5448 // Define z = x*x, and solve for acos(x) over x in x >= 0:
5449 //
5450 // acos( sqrt(z) ) = acos(x) = x*(p(z)-1) + PI/2 = xp(x**2) - x + PI/2
5451 //
5452 const long double coeffs[4][14] = {
5453 { HEX_LDBL(-, a, fa7382e1f347974, -, 10),
5454 HEX_LDBL(-, b, 4d5a992de1ac4da, -, 6),
5455 HEX_LDBL(-, a, c526184bd558c17, -, 7),
5456 HEX_LDBL(-, d, 9ed9b0346ec092a, -, 8),
5457 HEX_LDBL(-, 9, dca410c1f04b1f, -, 8),
5458 HEX_LDBL(-, f, 76e411ba9581ee5, -, 9),
5459 HEX_LDBL(-, c, c71b00479541d8e, -, 9),
5460 HEX_LDBL(-, a, f527a3f9745c9de, -, 9),
5461 HEX_LDBL(-, 9, a93060051f48d14, -, 9),
5462 HEX_LDBL(-, 8, b3d39ad70e06021, -, 9),
5463 HEX_LDBL(-, f, f2ab95ab84f79c, -, 10),
5464 HEX_LDBL(-, e, d1af5f5301ccfe4, -, 10),
5465 HEX_LDBL(-, e, 1b53ba562f0f74a, -, 10),
5466 HEX_LDBL(-, d, 6a3851330e15526, -,
5467 10) }, // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ]
5468 // Error: 0x1.97839bf07024p-76
5469
5470 { HEX_LDBL(-, 8, c2f1d638e4c1b48, -, 8),
5471 HEX_LDBL(-, c, d47ac903c311c2c, -, 6),
5472 HEX_LDBL(-, d, e020b2dabd5606a, -, 7),
5473 HEX_LDBL(-, a, 086fafac220f16b, -, 7),
5474 HEX_LDBL(-, 8, 55b5efaf6b86c3e, -, 7),
5475 HEX_LDBL(-, f, 05c9774fed2f571, -, 8),
5476 HEX_LDBL(-, e, 484a93f7f0fc772, -, 8),
5477 HEX_LDBL(-, e, 1a32baef01626e4, -, 8),
5478 HEX_LDBL(-, e, 528e525b5c9c73d, -, 8),
5479 HEX_LDBL(-, e, ddd5d27ad49b2c8, -, 8),
5480 HEX_LDBL(-, f, b3259e7ae10c6f, -, 8),
5481 HEX_LDBL(-, 8, 68998170d5b19b7, -, 7),
5482 HEX_LDBL(-, 9, 4468907f007727, -, 7),
5483 HEX_LDBL(-, a, 2ad5e4906a8e7b3, -,
5484 7) }, // x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ] Error:
5485 // 0x1.647af70073457p-73
5486
5487 { HEX_LDBL(-, f, a76585ad399e7ac, -, 8),
5488 HEX_LDBL(-, e, d665b7dd504ca7c, -, 6),
5489 HEX_LDBL(-, 9, 4c7c2402bd4bc33, -, 6),
5490 HEX_LDBL(-, f, ba76b69074ff71c, -, 7),
5491 HEX_LDBL(-, f, 58117784bdb6d5f, -, 7),
5492 HEX_LDBL(-, 8, 22ddd8eef53227d, -, 6),
5493 HEX_LDBL(-, 9, 1d1d3b57a63cdb4, -, 6),
5494 HEX_LDBL(-, a, 9c4bdc40cca848, -, 6),
5495 HEX_LDBL(-, c, b673b12794edb24, -, 6),
5496 HEX_LDBL(-, f, 9290a06e31575bf, -, 6),
5497 HEX_LDBL(-, 9, b4929c16aeb3d1f, -, 5),
5498 HEX_LDBL(-, c, 461e725765a7581, -, 5),
5499 HEX_LDBL(-, 8, 0a59654c98d9207, -, 4),
5500 HEX_LDBL(-, a, 6de6cbd96c80562, -,
5501 4) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ] Error:
5502 // 0x1.b0246c304ce1ap-70
5503
5504 { HEX_LDBL(-, b, dca8b0359f96342, -, 7),
5505 HEX_LDBL(-, 8, cd2522fcde9823, -, 5),
5506 HEX_LDBL(-, d, 2af9397b27ff74d, -, 6),
5507 HEX_LDBL(-, d, 723f2c2c2409811, -, 6),
5508 HEX_LDBL(-, f, ea8f8481ecc3cd1, -, 6),
5509 HEX_LDBL(-, a, 43fd8a7a646b0b2, -, 5),
5510 HEX_LDBL(-, e, 01b0bf63a4e8d76, -, 5),
5511 HEX_LDBL(-, 9, f0b7096a2a7b4d, -, 4),
5512 HEX_LDBL(-, e, 872e7c5a627ab4c, -, 4),
5513 HEX_LDBL(-, a, dbd760a1882da48, -, 3),
5514 HEX_LDBL(-, 8, 424e4dea31dd273, -, 2),
5515 HEX_LDBL(-, c, c05d7730963e793, -, 2),
5516 HEX_LDBL(-, a, 523d97197cd124a, -, 1),
5517 HEX_LDBL(-, 8, 307ba943978aaee, +,
5518 0) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ] Error:
5519 // 0x1.9ecff73da69c9p-66
5520 };
5521
5522 const long double offsets[4] = { 0.0625, 0.1875, 0.3125, 0.4375 };
5523 const size_t coeff_count = sizeof(coeffs[0]) / sizeof(coeffs[0][0]);
5524
5525 // reduce the incoming values a bit so that they are in the range
5526 // [-0x1.0p-4, 0x1.0p-4]
5527 const long double *c;
5528 i = x2 * 8.0L;
5529 c = coeffs[i];
5530 x2 -= offsets[i]; // exact
5531
5532 // calcualte p(x2)
5533 long double y = c[coeff_count - 1];
5534 for (i = (int)coeff_count - 2; i >= 0; i--) y = c[i] + y * x2;
5535
5536 // xp(x2)
5537 y *= x;
5538
5539 // return xp(x2) - x + PI/2
5540 return head + ((y + tail) - x);
5541 }
5542
reference_relaxed_acos(double x)5543 double reference_relaxed_acos(double x) { return reference_acos(x); }
5544
reference_log10(double x)5545 double reference_log10(double x)
5546 {
5547 if (x == 0.0) return -INFINITY;
5548
5549 if (x < 0.0) return cl_make_nan();
5550
5551 if (isinf(x)) return INFINITY;
5552
5553 double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
5554 double logxHi, logxLo;
5555 __log2_ep(&logxHi, &logxLo, x);
5556 return logxHi * log2Hi;
5557 }
5558
reference_relaxed_log10(double x)5559 double reference_relaxed_log10(double x) { return reference_log10(x); }
5560
reference_log10l(long double x)5561 long double reference_log10l(long double x)
5562 {
5563 if (x == 0.0) return -INFINITY;
5564
5565 if (x < 0.0) return cl_make_nan();
5566
5567 if (isinf(x)) return INFINITY;
5568
5569 double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
5570 double log2Lo = HEX_DBL(+, 1, e623e2566b02d, -, 55);
5571 double logxHi, logxLo;
5572 __log2_ep(&logxHi, &logxLo, x);
5573
5574 long double lg2 = (long double)log2Hi + (long double)log2Lo;
5575 long double logx = (long double)logxHi + (long double)logxLo;
5576 return logx * lg2;
5577 }
5578
reference_acos(double x)5579 double reference_acos(double x) { return acos(x); }
5580
reference_atan2(double x,double y)5581 double reference_atan2(double x, double y)
5582 {
5583 #if defined(_WIN32)
5584 // fix edge cases for Windows
5585 if (isinf(x) && isinf(y))
5586 {
5587 double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
5588 return (x > 0) ? retval : -retval;
5589 }
5590 #endif // _WIN32
5591 return atan2(x, y);
5592 }
5593
reference_atan2l(long double x,long double y)5594 long double reference_atan2l(long double x, long double y)
5595 {
5596 #if defined(_WIN32)
5597 // fix edge cases for Windows
5598 if (isinf(x) && isinf(y))
5599 {
5600 long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
5601 return (x > 0) ? retval : -retval;
5602 }
5603 #endif // _WIN32
5604 return atan2l(x, y);
5605 }
5606
reference_frexp(double a,int * exp)5607 double reference_frexp(double a, int *exp)
5608 {
5609 if (isnan(a) || isinf(a) || a == 0.0)
5610 {
5611 *exp = 0;
5612 return a;
5613 }
5614
5615 union {
5616 cl_double d;
5617 cl_ulong l;
5618 } u;
5619
5620 u.d = a;
5621
5622 // separate out sign
5623 cl_ulong s = u.l & 0x8000000000000000ULL;
5624 u.l &= 0x7fffffffffffffffULL;
5625 int bias = -1022;
5626
5627 if ((u.l & 0x7ff0000000000000ULL) == 0)
5628 {
5629 double d = u.l;
5630 u.d = d;
5631 bias -= 1074;
5632 }
5633
5634 int e = (int)((u.l & 0x7ff0000000000000ULL) >> 52);
5635 u.l &= 0x000fffffffffffffULL;
5636 e += bias;
5637 u.l |= ((cl_ulong)1022 << 52);
5638 u.l |= s;
5639
5640 *exp = e;
5641 return u.d;
5642 }
5643
reference_frexpl(long double a,int * exp)5644 long double reference_frexpl(long double a, int *exp)
5645 {
5646 if (isnan(a) || isinf(a) || a == 0.0)
5647 {
5648 *exp = 0;
5649 return a;
5650 }
5651
5652 if (sizeof(long double) == sizeof(double))
5653 {
5654 return reference_frexp(a, exp);
5655 }
5656 else
5657 {
5658 return frexpl(a, exp);
5659 }
5660 }
5661
5662
reference_atan(double x)5663 double reference_atan(double x) { return atan(x); }
5664
reference_atanl(long double x)5665 long double reference_atanl(long double x) { return atanl(x); }
5666
reference_asinl(long double x)5667 long double reference_asinl(long double x) { return asinl(x); }
5668
reference_asin(double x)5669 double reference_asin(double x) { return asin(x); }
5670
reference_relaxed_asin(double x)5671 double reference_relaxed_asin(double x) { return reference_asin(x); }
5672
reference_fabs(double x)5673 double reference_fabs(double x) { return fabs(x); }
5674
reference_cosh(double x)5675 double reference_cosh(double x) { return cosh(x); }
5676
reference_sqrtl(long double x)5677 long double reference_sqrtl(long double x)
5678 {
5679 #if defined(__SSE2__) \
5680 || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
5681 __m128d result128 = _mm_set_sd((double)x);
5682 result128 = _mm_sqrt_sd(result128, result128);
5683 return _mm_cvtsd_f64(result128);
5684 #else
5685 volatile double dx = x;
5686 return sqrt(dx);
5687 #endif
5688 }
5689
reference_tanhl(long double x)5690 long double reference_tanhl(long double x) { return tanhl(x); }
5691
reference_floorl(long double x)5692 long double reference_floorl(long double x)
5693 {
5694 if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
5695
5696 long double absx = reference_fabsl(x);
5697 if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
5698
5699 if (absx < 1.0)
5700 {
5701 if (x < 0.0)
5702 return -1.0;
5703 else
5704 return 0.0;
5705 }
5706
5707 long double r = (long double)((cl_long)x);
5708
5709 if (x < 0.0 && r > x) r -= 1.0;
5710
5711 return r;
5712 }
5713
5714
reference_tanh(double x)5715 double reference_tanh(double x) { return tanh(x); }
5716
reference_assignmentl(long double x)5717 long double reference_assignmentl(long double x) { return x; }
5718
reference_notl(long double x)5719 int reference_notl(long double x)
5720 {
5721 int r = !x;
5722 return r;
5723 }
5724