xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/math_brute_force/reference_math.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "reference_math.h"
18 #include "harness/compat.h"
19 
20 #include <climits>
21 
22 #if !defined(_WIN32)
23 #include <cstring>
24 #endif
25 
26 #include "utility.h"
27 
28 #if defined(__SSE__)                                                           \
29     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
30 #include <xmmintrin.h>
31 #endif
32 #if defined(__SSE2__)                                                          \
33     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
34 #include <emmintrin.h>
35 #endif
36 
37 #ifndef M_PI_4
38 #define M_PI_4 (M_PI / 4)
39 #endif
40 
41 #pragma STDC FP_CONTRACT OFF
42 static void __log2_ep(double *hi, double *lo, double x);
43 
44 union uint64d_t {
45     uint64_t i;
46     double d;
47 };
48 
49 static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
50 
51 #define cl_make_nan() _CL_NAN.d
52 
reduce1(double x)53 static double reduce1(double x)
54 {
55     if (fabs(x) >= HEX_DBL(+, 1, 0, +, 53))
56     {
57         if (fabs(x) == INFINITY) return cl_make_nan();
58 
59         return 0.0; // we patch up the sign for sinPi and cosPi later, since
60                     // they need different signs
61     }
62 
63     // Find the nearest multiple of 2
64     const double r = copysign(HEX_DBL(+, 1, 0, +, 53), x);
65     double z = x + r;
66     z -= r;
67 
68     // subtract it from x. Value is now in the range -1 <= x <= 1
69     return x - z;
70 }
71 
reference_acospi(double x)72 double reference_acospi(double x) { return reference_acos(x) / M_PI; }
reference_asinpi(double x)73 double reference_asinpi(double x) { return reference_asin(x) / M_PI; }
reference_atanpi(double x)74 double reference_atanpi(double x) { return reference_atan(x) / M_PI; }
reference_atan2pi(double y,double x)75 double reference_atan2pi(double y, double x)
76 {
77     return reference_atan2(y, x) / M_PI;
78 }
reference_cospi(double x)79 double reference_cospi(double x)
80 {
81     if (reference_fabs(x) >= HEX_DBL(+, 1, 0, +, 52))
82     {
83         if (reference_fabs(x) == INFINITY) return cl_make_nan();
84 
85         // Note this probably fails for odd values between 0x1.0p52 and
86         // 0x1.0p53. However, when starting with single precision inputs, there
87         // will be no odd values.
88 
89         return 1.0;
90     }
91 
92     x = reduce1(x + 0.5);
93 
94     // reduce to [-0.5, 0.5]
95     if (x < -0.5)
96         x = -1 - x;
97     else if (x > 0.5)
98         x = 1 - x;
99 
100     // cosPi zeros are all +0
101     if (x == 0.0) return 0.0;
102 
103     return reference_sin(x * M_PI);
104 }
105 
reference_relaxed_cospi(double x)106 double reference_relaxed_cospi(double x) { return reference_cospi(x); }
107 
reference_relaxed_divide(double x,double y)108 double reference_relaxed_divide(double x, double y)
109 {
110     return (float)(((float)x) / ((float)y));
111 }
112 
reference_divide(double x,double y)113 double reference_divide(double x, double y) { return x / y; }
114 
115 // Add a + b. If the result modulo overflowed, write 1 to *carry, otherwise 0
add_carry(cl_ulong a,cl_ulong b,cl_ulong * carry)116 static inline cl_ulong add_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
117 {
118     cl_ulong result = a + b;
119     *carry = result < a;
120     return result;
121 }
122 
123 // Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise
124 // 0
sub_carry(cl_ulong a,cl_ulong b,cl_ulong * carry)125 static inline cl_ulong sub_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
126 {
127     cl_ulong result = a - b;
128     *carry = result > a;
129     return result;
130 }
131 
fallback_frexpf(float x,int * iptr)132 static float fallback_frexpf(float x, int *iptr)
133 {
134     cl_uint u, v;
135     float fu, fv;
136 
137     memcpy(&u, &x, sizeof(u));
138 
139     cl_uint exponent = u & 0x7f800000U;
140     cl_uint mantissa = u & ~0x7f800000U;
141 
142     // add 1 to the exponent
143     exponent += 0x00800000U;
144 
145     if ((cl_int)exponent < (cl_int)0x01000000)
146     { // subnormal, NaN, Inf
147         mantissa |= 0x3f000000U;
148 
149         v = mantissa & 0xff800000U;
150         u = mantissa;
151         memcpy(&fv, &v, sizeof(v));
152         memcpy(&fu, &u, sizeof(u));
153 
154         fu -= fv;
155 
156         memcpy(&v, &fv, sizeof(v));
157         memcpy(&u, &fu, sizeof(u));
158 
159         exponent = u & 0x7f800000U;
160         mantissa = u & ~0x7f800000U;
161 
162         *iptr = (exponent >> 23) + (-126 + 1 - 126);
163         u = mantissa | 0x3f000000U;
164         memcpy(&fu, &u, sizeof(u));
165         return fu;
166     }
167 
168     *iptr = (exponent >> 23) - 127;
169     u = mantissa | 0x3f000000U;
170     memcpy(&fu, &u, sizeof(u));
171     return fu;
172 }
173 
extractf(float x,cl_uint * mant)174 static inline int extractf(float x, cl_uint *mant)
175 {
176     static float (*frexppf)(float, int *) = NULL;
177     int e;
178 
179     // verify that frexp works properly
180     if (NULL == frexppf)
181     {
182         if (0.5f == frexpf(HEX_FLT(+, 1, 0, -, 130), &e) && e == -129)
183             frexppf = frexpf;
184         else
185             frexppf = fallback_frexpf;
186     }
187 
188     *mant = (cl_uint)(HEX_FLT(+, 1, 0, +, 32) * fabsf(frexppf(x, &e)));
189     return e - 1;
190 }
191 
192 // Shift right by shift bits. Any bits lost on the right side are bitwise OR'd
193 // together and ORd into the LSB of the result
shift_right_sticky_64(cl_ulong * p,int shift)194 static inline void shift_right_sticky_64(cl_ulong *p, int shift)
195 {
196     cl_ulong sticky = 0;
197     cl_ulong r = *p;
198 
199     // C doesn't handle shifts greater than the size of the variable dependably
200     if (shift >= 64)
201     {
202         sticky |= (0 != r);
203         r = 0;
204     }
205     else
206     {
207         sticky |= (0 != (r << (64 - shift)));
208         r >>= shift;
209     }
210 
211     *p = r | sticky;
212 }
213 
214 // Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd
215 // into the LSB of the result
add64(cl_ulong * p,cl_ulong c,int * exponent)216 static inline void add64(cl_ulong *p, cl_ulong c, int *exponent)
217 {
218     cl_ulong carry;
219     c = add_carry(c, *p, &carry);
220     if (carry)
221     {
222         carry = c & 1; // set aside sticky bit
223         c >>= 1; // right shift to deal with overflow
224         c |= carry
225             | 0x8000000000000000ULL; // or in carry bit, and sticky bit. The
226                                      // latter is to prevent rounding from
227                                      // believing we are exact half way case
228         *exponent = *exponent + 1; // adjust exponent
229     }
230 
231     *p = c;
232 }
233 
234 // IEEE-754 round to nearest, ties to even rounding
round_to_nearest_even_float(cl_ulong p,int exponent)235 static float round_to_nearest_even_float(cl_ulong p, int exponent)
236 {
237     union {
238         cl_uint u;
239         cl_float d;
240     } u;
241 
242     // If mantissa is zero, return 0.0f
243     if (p == 0) return 0.0f;
244 
245     // edges
246     if (exponent > 127)
247     {
248         volatile float r = exponent * CL_FLT_MAX; // signal overflow
249 
250         // attempt to fool the compiler into not optimizing the above line away
251         if (r > CL_FLT_MAX) return INFINITY;
252 
253         return r;
254     }
255     if (exponent == -150 && p > 0x8000000000000000ULL)
256         return HEX_FLT(+, 1, 0, -, 149);
257     if (exponent <= -150) return 0.0f;
258 
259     // Figure out which bits go where
260     int shift = 8 + 32;
261     if (exponent < -126)
262     {
263         shift -= 126 + exponent; // subnormal: shift is not 52
264         exponent = -127; //            set exponent to 0
265     }
266     else
267         p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
268                                     // it.
269 
270     // Assemble the double (round toward zero)
271     u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
272 
273     // put a representation of the residual bits into hi
274     p <<= (64 - shift);
275 
276     // round to nearest, ties to even  based on the unused portion of p
277     if (p < 0x8000000000000000ULL) return u.d;
278     if (p == 0x8000000000000000ULL)
279         u.u += u.u & 1U;
280     else
281         u.u++;
282 
283     return u.d;
284 }
285 
round_to_nearest_even_float_ftz(cl_ulong p,int exponent)286 static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent)
287 {
288     extern int gCheckTininessBeforeRounding;
289 
290     union {
291         cl_uint u;
292         cl_float d;
293     } u;
294     int shift = 8 + 32;
295 
296     // If mantissa is zero, return 0.0f
297     if (p == 0) return 0.0f;
298 
299     // edges
300     if (exponent > 127)
301     {
302         volatile float r = exponent * CL_FLT_MAX; // signal overflow
303 
304         // attempt to fool the compiler into not optimizing the above line away
305         if (r > CL_FLT_MAX) return INFINITY;
306 
307         return r;
308     }
309 
310     // Deal with FTZ for gCheckTininessBeforeRounding
311     if (exponent < (gCheckTininessBeforeRounding - 127)) return 0.0f;
312 
313     if (exponent
314         == -127) // only happens for machines that check tininess after rounding
315         p = (p & 1) | (p >> 1);
316     else
317         p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
318                                     // it.
319 
320     cl_ulong q = p;
321 
322 
323     // Assemble the double (round toward zero)
324     u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
325 
326     // put a representation of the residual bits into hi
327     q <<= (64 - shift);
328 
329     // round to nearest, ties to even  based on the unused portion of p
330     if (q > 0x8000000000000000ULL)
331         u.u++;
332     else if (q == 0x8000000000000000ULL)
333         u.u += u.u & 1U;
334 
335     // Deal with FTZ for ! gCheckTininessBeforeRounding
336     if (0 == (u.u & 0x7f800000U)) return 0.0f;
337 
338     return u.d;
339 }
340 
341 
342 // IEEE-754 round toward zero.
round_toward_zero_float(cl_ulong p,int exponent)343 static float round_toward_zero_float(cl_ulong p, int exponent)
344 {
345     union {
346         cl_uint u;
347         cl_float d;
348     } u;
349 
350     // If mantissa is zero, return 0.0f
351     if (p == 0) return 0.0f;
352 
353     // edges
354     if (exponent > 127)
355     {
356         volatile float r = exponent * CL_FLT_MAX; // signal overflow
357 
358         // attempt to fool the compiler into not optimizing the above line away
359         if (r > CL_FLT_MAX) return CL_FLT_MAX;
360 
361         return r;
362     }
363 
364     if (exponent <= -149) return 0.0f;
365 
366     // Figure out which bits go where
367     int shift = 8 + 32;
368     if (exponent < -126)
369     {
370         shift -= 126 + exponent; // subnormal: shift is not 52
371         exponent = -127; //            set exponent to 0
372     }
373     else
374         p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
375                                     // it.
376 
377     // Assemble the double (round toward zero)
378     u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
379 
380     return u.d;
381 }
382 
round_toward_zero_float_ftz(cl_ulong p,int exponent)383 static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
384 {
385     union {
386         cl_uint u;
387         cl_float d;
388     } u;
389     int shift = 8 + 32;
390 
391     // If mantissa is zero, return 0.0f
392     if (p == 0) return 0.0f;
393 
394     // edges
395     if (exponent > 127)
396     {
397         volatile float r = exponent * CL_FLT_MAX; // signal overflow
398 
399         // attempt to fool the compiler into not optimizing the above line away
400         if (r > CL_FLT_MAX) return CL_FLT_MAX;
401 
402         return r;
403     }
404 
405     // Deal with FTZ for gCheckTininessBeforeRounding
406     if (exponent < -126) return 0.0f;
407 
408     cl_ulong q = p &=
409         0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
410 
411     // Assemble the double (round toward zero)
412     u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
413 
414     // put a representation of the residual bits into hi
415     q <<= (64 - shift);
416 
417     return u.d;
418 }
419 
420 // Subtract two significands.
sub64(cl_ulong * c,cl_ulong p,cl_uint * signC,int * expC)421 static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC)
422 {
423     cl_ulong carry;
424     p = sub_carry(*c, p, &carry);
425 
426     if (carry)
427     {
428         *signC ^= 0x80000000U;
429         p = -p;
430     }
431 
432     // normalize
433     if (p)
434     {
435         int shift = 32;
436         cl_ulong test = 1ULL << 32;
437         while (0 == (p & 0x8000000000000000ULL))
438         {
439             if (p < test)
440             {
441                 p <<= shift;
442                 *expC = *expC - shift;
443             }
444             shift >>= 1;
445             test <<= shift;
446         }
447     }
448     else
449     {
450         // zero result.
451         *expC = -200;
452         *signC =
453             0; // IEEE rules say a - a = +0 for all rounding modes except -inf
454     }
455 
456     *c = p;
457 }
458 
459 
reference_fma(float a,float b,float c,int shouldFlush)460 float reference_fma(float a, float b, float c, int shouldFlush)
461 {
462     static const cl_uint kMSB = 0x80000000U;
463 
464     // Make bits accessible
465     union {
466         cl_uint u;
467         cl_float d;
468     } ua;
469     ua.d = a;
470     union {
471         cl_uint u;
472         cl_float d;
473     } ub;
474     ub.d = b;
475     union {
476         cl_uint u;
477         cl_float d;
478     } uc;
479     uc.d = c;
480 
481     // deal with Nans, infinities and zeros
482     if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
483         || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
484         0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
485         0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
486     {
487         FPU_mode_type oldMode;
488         RoundingMode oldRoundMode = kRoundToNearestEven;
489         if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
490 
491         if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
492 
493         memset(&oldMode, 0, sizeof(oldMode));
494         if (shouldFlush) ForceFTZ(&oldMode);
495 
496         a = (float)reference_multiply(
497             a, b); // some risk that the compiler will insert a non-compliant
498                    // fma here on some platforms.
499         a = (float)reference_add(
500             a,
501             c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
502 
503         if (shouldFlush) RestoreFPState(&oldMode);
504 
505         if (gIsInRTZMode) set_round(oldRoundMode, kfloat);
506         return a;
507     }
508 
509     // extract exponent and mantissa
510     //   exponent is a standard unbiased signed integer
511     //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
512     cl_uint mantA, mantB, mantC;
513     int expA = extractf(a, &mantA);
514     int expB = extractf(b, &mantB);
515     int expC = extractf(c, &mantC);
516     cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide
517                                  // if we are adding or subtracting
518 
519     // exact product of A and B
520     int exponent = expA + expB;
521     cl_uint sign = (ua.u ^ ub.u) & kMSB;
522     cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
523 
524     // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
525     //  The MSB might not be set. If so, fix that. Otherwise, reflect the fact
526     //  that we got another power of two from the multiplication
527     if (0 == (0x8000000000000000ULL & product))
528         product <<= 1;
529     else
530         exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our
531                     // exponent increased.
532 
533     // infinite precision add
534     cl_ulong addend = (cl_ulong)mantC << 32;
535     if (exponent >= expC)
536     {
537         // Shift C relative to the product so that their exponents match
538         if (exponent > expC) shift_right_sticky_64(&addend, exponent - expC);
539 
540         // Add
541         if (sign ^ signC)
542             sub64(&product, addend, &sign, &exponent);
543         else
544             add64(&product, addend, &exponent);
545     }
546     else
547     {
548         // Shift the product relative to C so that their exponents match
549         shift_right_sticky_64(&product, expC - exponent);
550 
551         // add
552         if (sign ^ signC)
553             sub64(&addend, product, &signC, &expC);
554         else
555             add64(&addend, product, &expC);
556 
557         product = addend;
558         exponent = expC;
559         sign = signC;
560     }
561 
562     // round to IEEE result -- we do not do flushing to zero here. That part is
563     // handled manually in ternary.c.
564     if (gIsInRTZMode)
565     {
566         if (shouldFlush)
567             ua.d = round_toward_zero_float_ftz(product, exponent);
568         else
569             ua.d = round_toward_zero_float(product, exponent);
570     }
571     else
572     {
573         if (shouldFlush)
574             ua.d = round_to_nearest_even_float_ftz(product, exponent);
575         else
576             ua.d = round_to_nearest_even_float(product, exponent);
577     }
578 
579     // Set the sign
580     ua.u |= sign;
581 
582     return ua.d;
583 }
584 
reference_relaxed_exp10(double x)585 double reference_relaxed_exp10(double x) { return reference_exp10(x); }
586 
reference_exp10(double x)587 double reference_exp10(double x)
588 {
589     return reference_exp2(x * HEX_DBL(+, 1, a934f0979a371, +, 1));
590 }
591 
592 
reference_ilogb(double x)593 int reference_ilogb(double x)
594 {
595     extern int gDeviceILogb0, gDeviceILogbNaN;
596     union {
597         cl_double f;
598         cl_ulong u;
599     } u;
600 
601     u.f = (float)x;
602     cl_int exponent = (cl_int)(u.u >> 52) & 0x7ff;
603     if (exponent == 0x7ff)
604     {
605         if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
606 
607         return CL_INT_MAX;
608     }
609 
610     if (exponent == 0)
611     { // deal with denormals
612         u.f = x * HEX_DBL(+, 1, 0, +, 64);
613         exponent = (cl_int)(u.u >> 52) & 0x7ff;
614         if (exponent == 0) return gDeviceILogb0;
615 
616         return exponent - (1023 + 64);
617     }
618 
619     return exponent - 1023;
620 }
621 
reference_nan(cl_uint x)622 double reference_nan(cl_uint x)
623 {
624     union {
625         cl_uint u;
626         cl_float f;
627     } u;
628     u.u = x | 0x7fc00000U;
629     return (double)u.f;
630 }
631 
reference_maxmag(double x,double y)632 double reference_maxmag(double x, double y)
633 {
634     double fabsx = fabs(x);
635     double fabsy = fabs(y);
636 
637     if (fabsx < fabsy) return y;
638 
639     if (fabsy < fabsx) return x;
640 
641     return reference_fmax(x, y);
642 }
643 
reference_minmag(double x,double y)644 double reference_minmag(double x, double y)
645 {
646     double fabsx = fabs(x);
647     double fabsy = fabs(y);
648 
649     if (fabsx > fabsy) return y;
650 
651     if (fabsy > fabsx) return x;
652 
653     return reference_fmin(x, y);
654 }
655 
reference_relaxed_mad(double a,double b,double c)656 double reference_relaxed_mad(double a, double b, double c)
657 {
658     return ((float)a) * ((float)b) + (float)c;
659 }
660 
reference_mad(double a,double b,double c)661 double reference_mad(double a, double b, double c) { return a * b + c; }
662 
reference_recip(double x)663 double reference_recip(double x) { return 1.0 / x; }
reference_rootn(double x,int i)664 double reference_rootn(double x, int i)
665 {
666 
667     // rootn ( x, 0 )  returns a NaN.
668     if (0 == i) return cl_make_nan();
669 
670     // rootn ( x, n )  returns a NaN for x < 0 and n is even.
671     if (x < 0 && 0 == (i & 1)) return cl_make_nan();
672 
673     if (x == 0.0)
674     {
675         switch (i & 0x80000001)
676         {
677             // rootn ( +-0,  n ) is +0 for even n > 0.
678             case 0: return 0.0f;
679 
680             // rootn ( +-0,  n ) is +-0 for odd n > 0.
681             case 1: return x;
682 
683             // rootn ( +-0,  n ) is +inf for even n < 0.
684             case 0x80000000: return INFINITY;
685 
686             // rootn ( +-0,  n ) is +-inf for odd n < 0.
687             case 0x80000001: return copysign(INFINITY, x);
688         }
689     }
690 
691     double sign = x;
692     x = reference_fabs(x);
693     x = reference_exp2(reference_log2(x) / (double)i);
694     return reference_copysignd(x, sign);
695 }
696 
reference_rsqrt(double x)697 double reference_rsqrt(double x) { return 1.0 / reference_sqrt(x); }
698 
reference_sinpi(double x)699 double reference_sinpi(double x)
700 {
701     double r = reduce1(x);
702 
703     // reduce to [-0.5, 0.5]
704     if (r < -0.5)
705         r = -1 - r;
706     else if (r > 0.5)
707         r = 1 - r;
708 
709     // sinPi zeros have the same sign as x
710     if (r == 0.0) return reference_copysignd(0.0, x);
711 
712     return reference_sin(r * M_PI);
713 }
714 
reference_relaxed_sinpi(double x)715 double reference_relaxed_sinpi(double x) { return reference_sinpi(x); }
716 
reference_tanpi(double x)717 double reference_tanpi(double x)
718 {
719     // set aside the sign  (allows us to preserve sign of -0)
720     double sign = reference_copysignd(1.0, x);
721     double z = reference_fabs(x);
722 
723     // if big and even  -- caution: only works if x only has single precision
724     if (z >= HEX_DBL(+, 1, 0, +, 24))
725     {
726         if (z == INFINITY) return x - x; // nan
727 
728         return reference_copysignd(
729             0.0, x); // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
730     }
731 
732     // reduce to the range [ -0.5, 0.5 ]
733     double nearest = reference_rint(z); // round to nearest even places n + 0.5
734                                         // values in the right place for us
735     int i = (int)nearest; // test above against 0x1.0p24 avoids overflow here
736     z -= nearest;
737 
738     // correction for odd integer x for the right sign of zero
739     if ((i & 1) && z == 0.0) sign = -sign;
740 
741     // track changes to the sign
742     sign *= reference_copysignd(1.0, z); // really should just be an xor
743     z = reference_fabs(z); // remove the sign again
744 
745     // reduce once more
746     // If we don't do this, rounding error in z * M_PI will cause us not to
747     // return infinities properly
748     if (z > 0.25)
749     {
750         z = 0.5 - z;
751         return sign
752             / reference_tan(z * M_PI); // use system tan to get the right result
753     }
754 
755     //
756     return sign
757         * reference_tan(z * M_PI); // use system tan to get the right result
758 }
759 
reference_pown(double x,int i)760 double reference_pown(double x, int i) { return reference_pow(x, (double)i); }
reference_powr(double x,double y)761 double reference_powr(double x, double y)
762 {
763     // powr ( x, y ) returns NaN for x < 0.
764     if (x < 0.0) return cl_make_nan();
765 
766     // powr ( x, NaN ) returns the NaN for x >= 0.
767     // powr ( NaN, y ) returns the NaN.
768     if (isnan(x) || isnan(y))
769         return x + y; // Note: behavior different here than for pow(1,NaN),
770                       // pow(NaN, 0)
771 
772     if (x == 1.0)
773     {
774         // powr ( +1, +-inf ) returns NaN.
775         if (reference_fabs(y) == INFINITY) return cl_make_nan();
776 
777         // powr ( +1, y ) is 1 for finite y.    (NaN handled above)
778         return 1.0;
779     }
780 
781     if (y == 0.0)
782     {
783         // powr ( +inf, +-0 ) returns NaN.
784         // powr ( +-0, +-0 ) returns NaN.
785         if (x == 0.0 || x == INFINITY) return cl_make_nan();
786 
787         // powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already
788         // handled above)
789         return 1.0;
790     }
791 
792     if (x == 0.0)
793     {
794         // powr ( +-0, -inf) is +inf.
795         // powr ( +-0, y ) is +inf for finite y < 0.
796         if (y < 0.0) return INFINITY;
797 
798         // powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
799         return 0.0;
800     }
801 
802     // x = +inf
803     if (isinf(x))
804     {
805         if (y < 0) return 0;
806         return INFINITY;
807     }
808 
809     double fabsx = reference_fabs(x);
810     double fabsy = reference_fabs(y);
811 
812     // y = +-inf cases
813     if (isinf(fabsy))
814     {
815         if (y < 0)
816         {
817             if (fabsx < 1) return INFINITY;
818             return 0;
819         }
820         if (fabsx < 1) return 0;
821         return INFINITY;
822     }
823 
824     double hi, lo;
825     __log2_ep(&hi, &lo, x);
826     double prod = y * hi;
827     double result = reference_exp2(prod);
828 
829     return result;
830 }
831 
reference_fract(double x,double * ip)832 double reference_fract(double x, double *ip)
833 {
834     if (isnan(x))
835     {
836         *ip = cl_make_nan();
837         return cl_make_nan();
838     }
839 
840     float i;
841     float f = modff((float)x, &i);
842     if (f < 0.0)
843     {
844         f = 1.0f + f;
845         i -= 1.0f;
846         if (f == 1.0f) f = HEX_FLT(+, 1, fffffe, -, 1);
847     }
848     *ip = i;
849     return f;
850 }
851 
852 
reference_add(double x,double y)853 double reference_add(double x, double y)
854 {
855     volatile float a = (float)x;
856     volatile float b = (float)y;
857 
858 #if defined(__SSE__)                                                           \
859     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
860     // defeat x87
861     __m128 va = _mm_set_ss((float)a);
862     __m128 vb = _mm_set_ss((float)b);
863     va = _mm_add_ss(va, vb);
864     _mm_store_ss((float *)&a, va);
865 #elif defined(__PPC__)
866     // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
867     // denorm's to zero. As such, the reference add with FTZ must be emulated in
868     // sw.
869     if (fpu_control & _FPU_MASK_NI)
870     {
871         union {
872             cl_uint u;
873             cl_float d;
874         } ua;
875         ua.d = a;
876         union {
877             cl_uint u;
878             cl_float d;
879         } ub;
880         ub.d = b;
881         cl_uint mantA, mantB;
882         cl_ulong addendA, addendB, sum;
883         int expA = extractf(a, &mantA);
884         int expB = extractf(b, &mantB);
885         cl_uint signA = ua.u & 0x80000000U;
886         cl_uint signB = ub.u & 0x80000000U;
887 
888         // Force matching exponents if an operand is 0
889         if (a == 0.0f)
890         {
891             expA = expB;
892         }
893         else if (b == 0.0f)
894         {
895             expB = expA;
896         }
897 
898         addendA = (cl_ulong)mantA << 32;
899         addendB = (cl_ulong)mantB << 32;
900 
901         if (expA >= expB)
902         {
903             // Shift B relative to the A so that their exponents match
904             if (expA > expB) shift_right_sticky_64(&addendB, expA - expB);
905 
906             // add
907             if (signA ^ signB)
908                 sub64(&addendA, addendB, &signA, &expA);
909             else
910                 add64(&addendA, addendB, &expA);
911         }
912         else
913         {
914             // Shift the A relative to B so that their exponents match
915             shift_right_sticky_64(&addendA, expB - expA);
916 
917             // add
918             if (signA ^ signB)
919                 sub64(&addendB, addendA, &signB, &expB);
920             else
921                 add64(&addendB, addendA, &expB);
922 
923             addendA = addendB;
924             expA = expB;
925             signA = signB;
926         }
927 
928         // round to IEEE result
929         if (gIsInRTZMode)
930         {
931             ua.d = round_toward_zero_float_ftz(addendA, expA);
932         }
933         else
934         {
935             ua.d = round_to_nearest_even_float_ftz(addendA, expA);
936         }
937         // Set the sign
938         ua.u |= signA;
939         a = ua.d;
940     }
941     else
942     {
943         a += b;
944     }
945 #else
946     a += b;
947 #endif
948     return (double)a;
949 }
950 
951 
reference_subtract(double x,double y)952 double reference_subtract(double x, double y)
953 {
954     volatile float a = (float)x;
955     volatile float b = (float)y;
956 #if defined(__SSE__)                                                           \
957     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
958     // defeat x87
959     __m128 va = _mm_set_ss((float)a);
960     __m128 vb = _mm_set_ss((float)b);
961     va = _mm_sub_ss(va, vb);
962     _mm_store_ss((float *)&a, va);
963 #else
964     a -= b;
965 #endif
966     return a;
967 }
968 
reference_multiply(double x,double y)969 double reference_multiply(double x, double y)
970 {
971     volatile float a = (float)x;
972     volatile float b = (float)y;
973 #if defined(__SSE__)                                                           \
974     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
975     // defeat x87
976     __m128 va = _mm_set_ss((float)a);
977     __m128 vb = _mm_set_ss((float)b);
978     va = _mm_mul_ss(va, vb);
979     _mm_store_ss((float *)&a, va);
980 #elif defined(__PPC__)
981     // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
982     // denorm's to zero. As such, the reference multiply with FTZ must be
983     // emulated in sw.
984     if (fpu_control & _FPU_MASK_NI)
985     {
986         // extract exponent and mantissa
987         //   exponent is a standard unbiased signed integer
988         //   mantissa is a cl_uint, with leading non-zero bit positioned at the
989         //   MSB
990         union {
991             cl_uint u;
992             cl_float d;
993         } ua;
994         ua.d = a;
995         union {
996             cl_uint u;
997             cl_float d;
998         } ub;
999         ub.d = b;
1000         cl_uint mantA, mantB;
1001         int expA = extractf(a, &mantA);
1002         int expB = extractf(b, &mantB);
1003 
1004         // exact product of A and B
1005         int exponent = expA + expB;
1006         cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
1007         cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
1008 
1009         // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
1010         //  The MSB might not be set. If so, fix that. Otherwise, reflect the
1011         //  fact that we got another power of two from the multiplication
1012         if (0 == (0x8000000000000000ULL & product))
1013             product <<= 1;
1014         else
1015             exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then
1016                         // our exponent increased.
1017 
1018         // round to IEEE result -- we do not do flushing to zero here. That part
1019         // is handled manually in ternary.c.
1020         if (gIsInRTZMode)
1021         {
1022             ua.d = round_toward_zero_float_ftz(product, exponent);
1023         }
1024         else
1025         {
1026             ua.d = round_to_nearest_even_float_ftz(product, exponent);
1027         }
1028         // Set the sign
1029         ua.u |= sign;
1030         a = ua.d;
1031     }
1032     else
1033     {
1034         a *= b;
1035     }
1036 #else
1037     a *= b;
1038 #endif
1039     return a;
1040 }
1041 
reference_lgamma_r(double x,int * signp)1042 double reference_lgamma_r(double x, int *signp)
1043 {
1044     // This is not currently tested
1045     *signp = 0;
1046     return x;
1047 }
1048 
1049 
reference_isequal(double x,double y)1050 int reference_isequal(double x, double y) { return x == y; }
reference_isfinite(double x)1051 int reference_isfinite(double x) { return 0 != isfinite(x); }
reference_isgreater(double x,double y)1052 int reference_isgreater(double x, double y) { return x > y; }
reference_isgreaterequal(double x,double y)1053 int reference_isgreaterequal(double x, double y) { return x >= y; }
reference_isinf(double x)1054 int reference_isinf(double x) { return 0 != isinf(x); }
reference_isless(double x,double y)1055 int reference_isless(double x, double y) { return x < y; }
reference_islessequal(double x,double y)1056 int reference_islessequal(double x, double y) { return x <= y; }
reference_islessgreater(double x,double y)1057 int reference_islessgreater(double x, double y)
1058 {
1059     return 0 != islessgreater(x, y);
1060 }
reference_isnan(double x)1061 int reference_isnan(double x) { return 0 != isnan(x); }
reference_isnormal(double x)1062 int reference_isnormal(double x) { return 0 != isnormal((float)x); }
reference_isnotequal(double x,double y)1063 int reference_isnotequal(double x, double y) { return x != y; }
reference_isordered(double x,double y)1064 int reference_isordered(double x, double y) { return x == x && y == y; }
reference_isunordered(double x,double y)1065 int reference_isunordered(double x, double y) { return isnan(x) || isnan(y); }
reference_signbit(float x)1066 int reference_signbit(float x) { return 0 != signbit(x); }
1067 
1068 #if 1 // defined( _MSC_VER )
1069 
1070 // Missing functions for win32
1071 
1072 
reference_copysign(float x,float y)1073 float reference_copysign(float x, float y)
1074 {
1075     union {
1076         float f;
1077         cl_uint u;
1078     } ux, uy;
1079     ux.f = x;
1080     uy.f = y;
1081     ux.u &= 0x7fffffffU;
1082     ux.u |= uy.u & 0x80000000U;
1083     return ux.f;
1084 }
1085 
1086 
reference_copysignd(double x,double y)1087 double reference_copysignd(double x, double y)
1088 {
1089     union {
1090         double f;
1091         cl_ulong u;
1092     } ux, uy;
1093     ux.f = x;
1094     uy.f = y;
1095     ux.u &= 0x7fffffffffffffffULL;
1096     ux.u |= uy.u & 0x8000000000000000ULL;
1097     return ux.f;
1098 }
1099 
1100 
reference_round(double x)1101 double reference_round(double x)
1102 {
1103     double absx = reference_fabs(x);
1104     if (absx < 0.5) return reference_copysignd(0.0, x);
1105 
1106     if (absx < HEX_DBL(+, 1, 0, +, 53))
1107         x = reference_trunc(x + reference_copysignd(0.5, x));
1108 
1109     return x;
1110 }
1111 
reference_trunc(double x)1112 double reference_trunc(double x)
1113 {
1114     if (fabs(x) < HEX_DBL(+, 1, 0, +, 53))
1115     {
1116         cl_long l = (cl_long)x;
1117 
1118         return reference_copysignd((double)l, x);
1119     }
1120 
1121     return x;
1122 }
1123 
1124 #ifndef FP_ILOGB0
1125 #define FP_ILOGB0 INT_MIN
1126 #endif
1127 
1128 #ifndef FP_ILOGBNAN
1129 #define FP_ILOGBNAN INT_MAX
1130 #endif
1131 
1132 
reference_cbrt(double x)1133 double reference_cbrt(double x)
1134 {
1135     return reference_copysignd(reference_pow(reference_fabs(x), 1.0 / 3.0), x);
1136 }
1137 
reference_rint(double x)1138 double reference_rint(double x)
1139 {
1140     if (reference_fabs(x) < HEX_DBL(+, 1, 0, +, 52))
1141     {
1142         double magic = reference_copysignd(HEX_DBL(+, 1, 0, +, 52), x);
1143         double rounded = (x + magic) - magic;
1144         x = reference_copysignd(rounded, x);
1145     }
1146 
1147     return x;
1148 }
1149 
reference_acosh(double x)1150 double reference_acosh(double x)
1151 { // not full precision. Sufficient precision to cover float
1152     if (isnan(x)) return x + x;
1153 
1154     if (x < 1.0) return cl_make_nan();
1155 
1156     return reference_log(x + reference_sqrt(x + 1) * reference_sqrt(x - 1));
1157 }
1158 
reference_asinh(double x)1159 double reference_asinh(double x)
1160 {
1161     /*
1162      * ====================================================
1163      * This function is from fdlibm: http://www.netlib.org
1164      *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1165      *
1166      * Developed at SunSoft, a Sun Microsystems, Inc. business.
1167      * Permission to use, copy, modify, and distribute this
1168      * software is freely granted, provided that this notice
1169      * is preserved.
1170      * ====================================================
1171      */
1172     if (isnan(x) || isinf(x)) return x + x;
1173 
1174     double absx = reference_fabs(x);
1175     if (absx < HEX_DBL(+, 1, 0, -, 28)) return x;
1176 
1177     double sign = reference_copysignd(1.0, x);
1178 
1179     if (absx > HEX_DBL(+, 1, 0, +, 28))
1180         return sign
1181             * (reference_log(absx)
1182                + 0.693147180559945309417232121458176568); // log(2)
1183 
1184     if (absx > 2.0)
1185         return sign
1186             * reference_log(2.0 * absx
1187                             + 1.0 / (reference_sqrt(x * x + 1.0) + absx));
1188 
1189     return sign
1190         * reference_log1p(absx + x * x / (1.0 + reference_sqrt(1.0 + x * x)));
1191 }
1192 
1193 
reference_atanh(double x)1194 double reference_atanh(double x)
1195 {
1196     /*
1197      * ====================================================
1198      * This function is from fdlibm: http://www.netlib.org
1199      *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1200      *
1201      * Developed at SunSoft, a Sun Microsystems, Inc. business.
1202      * Permission to use, copy, modify, and distribute this
1203      * software is freely granted, provided that this notice
1204      * is preserved.
1205      * ====================================================
1206      */
1207     if (isnan(x)) return x + x;
1208 
1209     double signed_half = reference_copysignd(0.5, x);
1210     x = reference_fabs(x);
1211     if (x > 1.0) return cl_make_nan();
1212 
1213     if (x < 0.5)
1214         return signed_half * reference_log1p(2.0 * (x + x * x / (1 - x)));
1215 
1216     return signed_half * reference_log1p(2.0 * x / (1 - x));
1217 }
1218 
reference_relaxed_atan(double x)1219 double reference_relaxed_atan(double x) { return reference_atan(x); }
1220 
reference_relaxed_exp2(double x)1221 double reference_relaxed_exp2(double x) { return reference_exp2(x); }
1222 
reference_exp2(double x)1223 double reference_exp2(double x)
1224 { // Note: only suitable for verifying single precision. Doesn't have range of a
1225   // full double exp2 implementation.
1226     if (x == 0.0) return 1.0;
1227 
1228     // separate x into fractional and integer parts
1229     double i = reference_rint(x); // round to nearest integer
1230 
1231     if (i < -150) return 0.0;
1232 
1233     if (i > 129) return INFINITY;
1234 
1235     double f = x - i; // -0.5 <= f <= 0.5
1236 
1237     // find exp2(f)
1238     // calculate as p(f) = (exp2(f)-1)/f
1239     //              exp2(f) = f * p(f) + 1
1240     // p(f) is a minimax polynomial with error within 0x1.c1fd80f0d1ab7p-50
1241 
1242     double p = 0.693147180560184539289
1243         + (0.240226506955902863183
1244            + (0.055504108656833424373
1245               + (0.009618129212846484796
1246                  + (0.001333355902958566035
1247                     + (0.000154034191902497930
1248                        + (0.000015252317761038105
1249                           + (0.000001326283129417092
1250                              + 0.000000102593187638680 * f)
1251                               * f)
1252                            * f)
1253                         * f)
1254                      * f)
1255                   * f)
1256                * f)
1257             * f;
1258     f *= p;
1259     f += 1.0;
1260 
1261     // scale by 2 ** i
1262     union {
1263         cl_ulong u;
1264         double d;
1265     } u;
1266     int exponent = (int)i + 1023;
1267     u.u = (cl_ulong)exponent << 52;
1268 
1269     return f * u.d;
1270 }
1271 
1272 
reference_expm1(double x)1273 double reference_expm1(double x)
1274 { // Note: only suitable for verifying single precision. Doesn't have range of a
1275   // full double expm1 implementation. It is only accurate to 47 bits or less.
1276 
1277     // early out for small numbers and NaNs
1278     if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 24))) return x;
1279 
1280     // early out for large negative numbers
1281     if (x < -130.0) return -1.0;
1282 
1283     // early out for large positive numbers
1284     if (x > 100.0) return INFINITY;
1285 
1286     // separate x into fractional and integer parts
1287     double i = reference_rint(x); // round to nearest integer
1288     double f = x - i; // -0.5 <= f <= 0.5
1289 
1290     // reduce f to the range -0.0625 .. f.. 0.0625
1291     int index = (int)(f * 16.0) + 8; // 0...16
1292 
1293     static const double reduction[17] = { -0.5,  -0.4375, -0.375, -0.3125,
1294                                           -0.25, -0.1875, -0.125, -0.0625,
1295                                           0.0,   +0.0625, +0.125, +0.1875,
1296                                           +0.25, +0.3125, +0.375, +0.4375,
1297                                           +0.5 };
1298 
1299 
1300     // exponentials[i] = expm1(reduction[i])
1301     static const double exponentials[17] = {
1302         HEX_DBL(-, 1, 92e9a0720d3ec, -, 2),
1303         HEX_DBL(-, 1, 6adb1cd9205ee, -, 2),
1304         HEX_DBL(-, 1, 40373d42ce2e3, -, 2),
1305         HEX_DBL(-, 1, 12d35a41ba104, -, 2),
1306         HEX_DBL(-, 1, c5041854df7d4, -, 3),
1307         HEX_DBL(-, 1, 5e25fb4fde211, -, 3),
1308         HEX_DBL(-, 1, e14aed893eef4, -, 4),
1309         HEX_DBL(-, 1, f0540438fd5c3, -, 5),
1310         HEX_DBL(+, 0, 0, +, 0),
1311         HEX_DBL(+, 1, 082b577d34ed8, -, 4),
1312         HEX_DBL(+, 1, 10b022db7ae68, -, 3),
1313         HEX_DBL(+, 1, a65c0b85ac1a9, -, 3),
1314         HEX_DBL(+, 1, 22d78f0fa061a, -, 2),
1315         HEX_DBL(+, 1, 77a45d8117fd5, -, 2),
1316         HEX_DBL(+, 1, d1e944f6fbdaa, -, 2),
1317         HEX_DBL(+, 1, 190048ef6002, -, 1),
1318         HEX_DBL(+, 1, 4c2531c3c0d38, -, 1),
1319     };
1320 
1321 
1322     f -= reduction[index];
1323 
1324     // find expm1(f)
1325     // calculate as p(f) = (exp(f)-1)/f
1326     //              expm1(f) = f * p(f)
1327     // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over
1328     // the range +- 0.0625
1329     double p = 0.999999999999998001599
1330         + (0.499999999999839628284
1331            + (0.166666666672817459505
1332               + (0.041666666612283048687
1333                  + (0.008333330214567431435
1334                     + (0.001389005319303770070 + 0.000198833381525156667 * f)
1335                         * f)
1336                      * f)
1337                   * f)
1338                * f)
1339             * f;
1340     f *= p; // expm1( reduced f )
1341 
1342     // expm1(f) = (exmp1( reduced_f) + 1.0) * ( exponentials[index] + 1 ) - 1
1343     //          =  exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) +
1344     //          exponentials[index] + 1 -1 =  exmp1( reduced_f) *
1345     //          exponentials[index] + exmp1( reduced_f) + exponentials[index]
1346     f += exponentials[index] + f * exponentials[index];
1347 
1348     // scale by e ** i
1349     int exponent = (int)i;
1350     if (0 == exponent) return f; // precise answer for x near 1
1351 
1352     // table of e**(i-150)
1353     static const double exp_table[128 + 150 + 1] = {
1354         HEX_DBL(+, 1, 82e16284f5ec5, -, 217),
1355         HEX_DBL(+, 1, 06e9996332ba1, -, 215),
1356         HEX_DBL(+, 1, 6555cb289e44b, -, 214),
1357         HEX_DBL(+, 1, e5ab364643354, -, 213),
1358         HEX_DBL(+, 1, 4a0bd18e64df7, -, 211),
1359         HEX_DBL(+, 1, c094499cc578e, -, 210),
1360         HEX_DBL(+, 1, 30d759323998c, -, 208),
1361         HEX_DBL(+, 1, 9e5278ab1d4cf, -, 207),
1362         HEX_DBL(+, 1, 198fa3f30be25, -, 205),
1363         HEX_DBL(+, 1, 7eae636d6144e, -, 204),
1364         HEX_DBL(+, 1, 040f1036f4863, -, 202),
1365         HEX_DBL(+, 1, 6174e477a895f, -, 201),
1366         HEX_DBL(+, 1, e065b82dd95a, -, 200),
1367         HEX_DBL(+, 1, 4676be491d129, -, 198),
1368         HEX_DBL(+, 1, bbb5da5f7c823, -, 197),
1369         HEX_DBL(+, 1, 2d884eef5fdcb, -, 195),
1370         HEX_DBL(+, 1, 99d3397ab8371, -, 194),
1371         HEX_DBL(+, 1, 1681497ed15b3, -, 192),
1372         HEX_DBL(+, 1, 7a870f597fdbd, -, 191),
1373         HEX_DBL(+, 1, 013c74edba307, -, 189),
1374         HEX_DBL(+, 1, 5d9ec4ada7938, -, 188),
1375         HEX_DBL(+, 1, db2edfd20fa7c, -, 187),
1376         HEX_DBL(+, 1, 42eb9f39afb0b, -, 185),
1377         HEX_DBL(+, 1, b6e4f282b43f4, -, 184),
1378         HEX_DBL(+, 1, 2a42764857b19, -, 182),
1379         HEX_DBL(+, 1, 9560792d19314, -, 181),
1380         HEX_DBL(+, 1, 137b6ce8e052c, -, 179),
1381         HEX_DBL(+, 1, 766b45dd84f18, -, 178),
1382         HEX_DBL(+, 1, fce362fe6e7d, -, 177),
1383         HEX_DBL(+, 1, 59d34dd8a5473, -, 175),
1384         HEX_DBL(+, 1, d606847fc727a, -, 174),
1385         HEX_DBL(+, 1, 3f6a58b795de3, -, 172),
1386         HEX_DBL(+, 1, b2216c6efdac1, -, 171),
1387         HEX_DBL(+, 1, 2705b5b153fb8, -, 169),
1388         HEX_DBL(+, 1, 90fa1509bd50d, -, 168),
1389         HEX_DBL(+, 1, 107df698da211, -, 166),
1390         HEX_DBL(+, 1, 725ae6e7b9d35, -, 165),
1391         HEX_DBL(+, 1, f75d6040aeff6, -, 164),
1392         HEX_DBL(+, 1, 56126259e093c, -, 162),
1393         HEX_DBL(+, 1, d0ec7df4f7bd4, -, 161),
1394         HEX_DBL(+, 1, 3bf2cf6722e46, -, 159),
1395         HEX_DBL(+, 1, ad6b22f55db42, -, 158),
1396         HEX_DBL(+, 1, 23d1f3e5834a, -, 156),
1397         HEX_DBL(+, 1, 8c9feab89b876, -, 155),
1398         HEX_DBL(+, 1, 0d88cf37f00dd, -, 153),
1399         HEX_DBL(+, 1, 6e55d2bf838a7, -, 152),
1400         HEX_DBL(+, 1, f1e6b68529e33, -, 151),
1401         HEX_DBL(+, 1, 525be4e4e601d, -, 149),
1402         HEX_DBL(+, 1, cbe0a45f75eb1, -, 148),
1403         HEX_DBL(+, 1, 3884e838aea68, -, 146),
1404         HEX_DBL(+, 1, a8c1f14e2af5d, -, 145),
1405         HEX_DBL(+, 1, 20a717e64a9bd, -, 143),
1406         HEX_DBL(+, 1, 8851d84118908, -, 142),
1407         HEX_DBL(+, 1, 0a9bdfb02d24, -, 140),
1408         HEX_DBL(+, 1, 6a5bea046b42e, -, 139),
1409         HEX_DBL(+, 1, ec7f3b269efa8, -, 138),
1410         HEX_DBL(+, 1, 4eafb87eab0f2, -, 136),
1411         HEX_DBL(+, 1, c6e2d05bbc, -, 135),
1412         HEX_DBL(+, 1, 35208867c2683, -, 133),
1413         HEX_DBL(+, 1, a425b317eeacd, -, 132),
1414         HEX_DBL(+, 1, 1d8508fa8246a, -, 130),
1415         HEX_DBL(+, 1, 840fbc08fdc8a, -, 129),
1416         HEX_DBL(+, 1, 07b7112bc1ffe, -, 127),
1417         HEX_DBL(+, 1, 666d0dad2961d, -, 126),
1418         HEX_DBL(+, 1, e726c3f64d0fe, -, 125),
1419         HEX_DBL(+, 1, 4b0dc07cabf98, -, 123),
1420         HEX_DBL(+, 1, c1f2daf3b6a46, -, 122),
1421         HEX_DBL(+, 1, 31c5957a47de2, -, 120),
1422         HEX_DBL(+, 1, 9f96445648b9f, -, 119),
1423         HEX_DBL(+, 1, 1a6baeadb4fd1, -, 117),
1424         HEX_DBL(+, 1, 7fd974d372e45, -, 116),
1425         HEX_DBL(+, 1, 04da4d1452919, -, 114),
1426         HEX_DBL(+, 1, 62891f06b345, -, 113),
1427         HEX_DBL(+, 1, e1dd273aa8a4a, -, 112),
1428         HEX_DBL(+, 1, 4775e0840bfdd, -, 110),
1429         HEX_DBL(+, 1, bd109d9d94bda, -, 109),
1430         HEX_DBL(+, 1, 2e73f53fba844, -, 107),
1431         HEX_DBL(+, 1, 9b138170d6bfe, -, 106),
1432         HEX_DBL(+, 1, 175af0cf60ec5, -, 104),
1433         HEX_DBL(+, 1, 7baee1bffa80b, -, 103),
1434         HEX_DBL(+, 1, 02057d1245ceb, -, 101),
1435         HEX_DBL(+, 1, 5eafffb34ba31, -, 100),
1436         HEX_DBL(+, 1, dca23bae16424, -, 99),
1437         HEX_DBL(+, 1, 43e7fc88b8056, -, 97),
1438         HEX_DBL(+, 1, b83bf23a9a9eb, -, 96),
1439         HEX_DBL(+, 1, 2b2b8dd05b318, -, 94),
1440         HEX_DBL(+, 1, 969d47321e4cc, -, 93),
1441         HEX_DBL(+, 1, 1452b7723aed2, -, 91),
1442         HEX_DBL(+, 1, 778fe2497184c, -, 90),
1443         HEX_DBL(+, 1, fe7116182e9cc, -, 89),
1444         HEX_DBL(+, 1, 5ae191a99585a, -, 87),
1445         HEX_DBL(+, 1, d775d87da854d, -, 86),
1446         HEX_DBL(+, 1, 4063f8cc8bb98, -, 84),
1447         HEX_DBL(+, 1, b374b315f87c1, -, 83),
1448         HEX_DBL(+, 1, 27ec458c65e3c, -, 81),
1449         HEX_DBL(+, 1, 923372c67a074, -, 80),
1450         HEX_DBL(+, 1, 1152eaeb73c08, -, 78),
1451         HEX_DBL(+, 1, 737c5645114b5, -, 77),
1452         HEX_DBL(+, 1, f8e6c24b5592e, -, 76),
1453         HEX_DBL(+, 1, 571db733a9d61, -, 74),
1454         HEX_DBL(+, 1, d257d547e083f, -, 73),
1455         HEX_DBL(+, 1, 3ce9b9de78f85, -, 71),
1456         HEX_DBL(+, 1, aebabae3a41b5, -, 70),
1457         HEX_DBL(+, 1, 24b6031b49bda, -, 68),
1458         HEX_DBL(+, 1, 8dd5e1bb09d7e, -, 67),
1459         HEX_DBL(+, 1, 0e5b73d1ff53d, -, 65),
1460         HEX_DBL(+, 1, 6f741de1748ec, -, 64),
1461         HEX_DBL(+, 1, f36bd37f42f3e, -, 63),
1462         HEX_DBL(+, 1, 536452ee2f75c, -, 61),
1463         HEX_DBL(+, 1, cd480a1b7482, -, 60),
1464         HEX_DBL(+, 1, 39792499b1a24, -, 58),
1465         HEX_DBL(+, 1, aa0de4bf35b38, -, 57),
1466         HEX_DBL(+, 1, 2188ad6ae3303, -, 55),
1467         HEX_DBL(+, 1, 898471fca6055, -, 54),
1468         HEX_DBL(+, 1, 0b6c3afdde064, -, 52),
1469         HEX_DBL(+, 1, 6b7719a59f0e, -, 51),
1470         HEX_DBL(+, 1, ee001eed62aa, -, 50),
1471         HEX_DBL(+, 1, 4fb547c775da8, -, 48),
1472         HEX_DBL(+, 1, c8464f7616468, -, 47),
1473         HEX_DBL(+, 1, 36121e24d3bba, -, 45),
1474         HEX_DBL(+, 1, a56e0c2ac7f75, -, 44),
1475         HEX_DBL(+, 1, 1e642baeb84a, -, 42),
1476         HEX_DBL(+, 1, 853f01d6d53ba, -, 41),
1477         HEX_DBL(+, 1, 0885298767e9a, -, 39),
1478         HEX_DBL(+, 1, 67852a7007e42, -, 38),
1479         HEX_DBL(+, 1, e8a37a45fc32e, -, 37),
1480         HEX_DBL(+, 1, 4c1078fe9228a, -, 35),
1481         HEX_DBL(+, 1, c3527e433fab1, -, 34),
1482         HEX_DBL(+, 1, 32b48bf117da2, -, 32),
1483         HEX_DBL(+, 1, a0db0d0ddb3ec, -, 31),
1484         HEX_DBL(+, 1, 1b48655f37267, -, 29),
1485         HEX_DBL(+, 1, 81056ff2c5772, -, 28),
1486         HEX_DBL(+, 1, 05a628c699fa1, -, 26),
1487         HEX_DBL(+, 1, 639e3175a689d, -, 25),
1488         HEX_DBL(+, 1, e355bbaee85cb, -, 24),
1489         HEX_DBL(+, 1, 4875ca227ec38, -, 22),
1490         HEX_DBL(+, 1, be6c6fdb01612, -, 21),
1491         HEX_DBL(+, 1, 2f6053b981d98, -, 19),
1492         HEX_DBL(+, 1, 9c54c3b43bc8b, -, 18),
1493         HEX_DBL(+, 1, 18354238f6764, -, 16),
1494         HEX_DBL(+, 1, 7cd79b5647c9b, -, 15),
1495         HEX_DBL(+, 1, 02cf22526545a, -, 13),
1496         HEX_DBL(+, 1, 5fc21041027ad, -, 12),
1497         HEX_DBL(+, 1, de16b9c24a98f, -, 11),
1498         HEX_DBL(+, 1, 44e51f113d4d6, -, 9),
1499         HEX_DBL(+, 1, b993fe00d5376, -, 8),
1500         HEX_DBL(+, 1, 2c155b8213cf4, -, 6),
1501         HEX_DBL(+, 1, 97db0ccceb0af, -, 5),
1502         HEX_DBL(+, 1, 152aaa3bf81cc, -, 3),
1503         HEX_DBL(+, 1, 78b56362cef38, -, 2),
1504         HEX_DBL(+, 1, 0, +, 0),
1505         HEX_DBL(+, 1, 5bf0a8b145769, +, 1),
1506         HEX_DBL(+, 1, d8e64b8d4ddae, +, 2),
1507         HEX_DBL(+, 1, 415e5bf6fb106, +, 4),
1508         HEX_DBL(+, 1, b4c902e273a58, +, 5),
1509         HEX_DBL(+, 1, 28d389970338f, +, 7),
1510         HEX_DBL(+, 1, 936dc5690c08f, +, 8),
1511         HEX_DBL(+, 1, 122885aaeddaa, +, 10),
1512         HEX_DBL(+, 1, 749ea7d470c6e, +, 11),
1513         HEX_DBL(+, 1, fa7157c470f82, +, 12),
1514         HEX_DBL(+, 1, 5829dcf95056, +, 14),
1515         HEX_DBL(+, 1, d3c4488ee4f7f, +, 15),
1516         HEX_DBL(+, 1, 3de1654d37c9a, +, 17),
1517         HEX_DBL(+, 1, b00b5916ac955, +, 18),
1518         HEX_DBL(+, 1, 259ac48bf05d7, +, 20),
1519         HEX_DBL(+, 1, 8f0ccafad2a87, +, 21),
1520         HEX_DBL(+, 1, 0f2ebd0a8002, +, 23),
1521         HEX_DBL(+, 1, 709348c0ea4f9, +, 24),
1522         HEX_DBL(+, 1, f4f22091940bd, +, 25),
1523         HEX_DBL(+, 1, 546d8f9ed26e1, +, 27),
1524         HEX_DBL(+, 1, ceb088b68e804, +, 28),
1525         HEX_DBL(+, 1, 3a6e1fd9eecfd, +, 30),
1526         HEX_DBL(+, 1, ab5adb9c436, +, 31),
1527         HEX_DBL(+, 1, 226af33b1fdc1, +, 33),
1528         HEX_DBL(+, 1, 8ab7fb5475fb7, +, 34),
1529         HEX_DBL(+, 1, 0c3d3920962c9, +, 36),
1530         HEX_DBL(+, 1, 6c932696a6b5d, +, 37),
1531         HEX_DBL(+, 1, ef822f7f6731d, +, 38),
1532         HEX_DBL(+, 1, 50bba3796379a, +, 40),
1533         HEX_DBL(+, 1, c9aae4631c056, +, 41),
1534         HEX_DBL(+, 1, 370470aec28ed, +, 43),
1535         HEX_DBL(+, 1, a6b765d8cdf6d, +, 44),
1536         HEX_DBL(+, 1, 1f43fcc4b662c, +, 46),
1537         HEX_DBL(+, 1, 866f34a725782, +, 47),
1538         HEX_DBL(+, 1, 0953e2f3a1ef7, +, 49),
1539         HEX_DBL(+, 1, 689e221bc8d5b, +, 50),
1540         HEX_DBL(+, 1, ea215a1d20d76, +, 51),
1541         HEX_DBL(+, 1, 4d13fbb1a001a, +, 53),
1542         HEX_DBL(+, 1, c4b334617cc67, +, 54),
1543         HEX_DBL(+, 1, 33a43d282a519, +, 56),
1544         HEX_DBL(+, 1, a220d397972eb, +, 57),
1545         HEX_DBL(+, 1, 1c25c88df6862, +, 59),
1546         HEX_DBL(+, 1, 8232558201159, +, 60),
1547         HEX_DBL(+, 1, 0672a3c9eb871, +, 62),
1548         HEX_DBL(+, 1, 64b41c6d37832, +, 63),
1549         HEX_DBL(+, 1, e4cf766fe49be, +, 64),
1550         HEX_DBL(+, 1, 49767bc0483e3, +, 66),
1551         HEX_DBL(+, 1, bfc951eb8bb76, +, 67),
1552         HEX_DBL(+, 1, 304d6aeca254b, +, 69),
1553         HEX_DBL(+, 1, 9d97010884251, +, 70),
1554         HEX_DBL(+, 1, 19103e4080b45, +, 72),
1555         HEX_DBL(+, 1, 7e013cd114461, +, 73),
1556         HEX_DBL(+, 1, 03996528e074c, +, 75),
1557         HEX_DBL(+, 1, 60d4f6fdac731, +, 76),
1558         HEX_DBL(+, 1, df8c5af17ba3b, +, 77),
1559         HEX_DBL(+, 1, 45e3076d61699, +, 79),
1560         HEX_DBL(+, 1, baed16a6e0da7, +, 80),
1561         HEX_DBL(+, 1, 2cffdfebde1a1, +, 82),
1562         HEX_DBL(+, 1, 9919cabefcb69, +, 83),
1563         HEX_DBL(+, 1, 160345c9953e3, +, 85),
1564         HEX_DBL(+, 1, 79dbc9dc53c66, +, 86),
1565         HEX_DBL(+, 1, 00c810d464097, +, 88),
1566         HEX_DBL(+, 1, 5d009394c5c27, +, 89),
1567         HEX_DBL(+, 1, da57de8f107a8, +, 90),
1568         HEX_DBL(+, 1, 425982cf597cd, +, 92),
1569         HEX_DBL(+, 1, b61e5ca3a5e31, +, 93),
1570         HEX_DBL(+, 1, 29bb825dfcf87, +, 95),
1571         HEX_DBL(+, 1, 94a90db0d6fe2, +, 96),
1572         HEX_DBL(+, 1, 12fec759586fd, +, 98),
1573         HEX_DBL(+, 1, 75c1dc469e3af, +, 99),
1574         HEX_DBL(+, 1, fbfd219c43b04, +, 100),
1575         HEX_DBL(+, 1, 5936d44e1a146, +, 102),
1576         HEX_DBL(+, 1, d531d8a7ee79c, +, 103),
1577         HEX_DBL(+, 1, 3ed9d24a2d51b, +, 105),
1578         HEX_DBL(+, 1, b15cfe5b6e17b, +, 106),
1579         HEX_DBL(+, 1, 268038c2c0e, +, 108),
1580         HEX_DBL(+, 1, 9044a73545d48, +, 109),
1581         HEX_DBL(+, 1, 1002ab6218b38, +, 111),
1582         HEX_DBL(+, 1, 71b3540cbf921, +, 112),
1583         HEX_DBL(+, 1, f6799ea9c414a, +, 113),
1584         HEX_DBL(+, 1, 55779b984f3eb, +, 115),
1585         HEX_DBL(+, 1, d01a210c44aa4, +, 116),
1586         HEX_DBL(+, 1, 3b63da8e9121, +, 118),
1587         HEX_DBL(+, 1, aca8d6b0116b8, +, 119),
1588         HEX_DBL(+, 1, 234de9e0c74e9, +, 121),
1589         HEX_DBL(+, 1, 8bec7503ca477, +, 122),
1590         HEX_DBL(+, 1, 0d0eda9796b9, +, 124),
1591         HEX_DBL(+, 1, 6db0118477245, +, 125),
1592         HEX_DBL(+, 1, f1056dc7bf22d, +, 126),
1593         HEX_DBL(+, 1, 51c2cc3433801, +, 128),
1594         HEX_DBL(+, 1, cb108ffbec164, +, 129),
1595         HEX_DBL(+, 1, 37f780991b584, +, 131),
1596         HEX_DBL(+, 1, a801c0ea8ac4d, +, 132),
1597         HEX_DBL(+, 1, 20247cc4c46c1, +, 134),
1598         HEX_DBL(+, 1, 87a0553328015, +, 135),
1599         HEX_DBL(+, 1, 0a233dee4f9bb, +, 137),
1600         HEX_DBL(+, 1, 69b7f55b808ba, +, 138),
1601         HEX_DBL(+, 1, eba064644060a, +, 139),
1602         HEX_DBL(+, 1, 4e184933d9364, +, 141),
1603         HEX_DBL(+, 1, c614fe2531841, +, 142),
1604         HEX_DBL(+, 1, 3494a9b171bf5, +, 144),
1605         HEX_DBL(+, 1, a36798b9d969b, +, 145),
1606         HEX_DBL(+, 1, 1d03d8c0c04af, +, 147),
1607         HEX_DBL(+, 1, 836026385c974, +, 148),
1608         HEX_DBL(+, 1, 073fbe9ac901d, +, 150),
1609         HEX_DBL(+, 1, 65cae0969f286, +, 151),
1610         HEX_DBL(+, 1, e64a58639cae8, +, 152),
1611         HEX_DBL(+, 1, 4a77f5f9b50f9, +, 154),
1612         HEX_DBL(+, 1, c12744a3a28e3, +, 155),
1613         HEX_DBL(+, 1, 313b3b6978e85, +, 157),
1614         HEX_DBL(+, 1, 9eda3a31e587e, +, 158),
1615         HEX_DBL(+, 1, 19ebe56b56453, +, 160),
1616         HEX_DBL(+, 1, 7f2bc6e599b7e, +, 161),
1617         HEX_DBL(+, 1, 04644610df2ff, +, 163),
1618         HEX_DBL(+, 1, 61e8b490ac4e6, +, 164),
1619         HEX_DBL(+, 1, e103201f299b3, +, 165),
1620         HEX_DBL(+, 1, 46e1b637beaf5, +, 167),
1621         HEX_DBL(+, 1, bc473cfede104, +, 168),
1622         HEX_DBL(+, 1, 2deb1b9c85e2d, +, 170),
1623         HEX_DBL(+, 1, 9a5981ca67d1, +, 171),
1624         HEX_DBL(+, 1, 16dc8a9ef670b, +, 173),
1625         HEX_DBL(+, 1, 7b03166942309, +, 174),
1626         HEX_DBL(+, 1, 0190be03150a7, +, 176),
1627         HEX_DBL(+, 1, 5e1152f9a8119, +, 177),
1628         HEX_DBL(+, 1, dbca9263f8487, +, 178),
1629         HEX_DBL(+, 1, 43556dee93bee, +, 180),
1630         HEX_DBL(+, 1, b774c12967dfa, +, 181),
1631         HEX_DBL(+, 1, 2aa4306e922c2, +, 183),
1632         HEX_DBL(+, 1, 95e54c5dd4217, +, 184)
1633     };
1634 
1635     // scale by e**i --  (expm1(f) + 1)*e**i - 1  = expm1(f) * e**i + e**i - 1 =
1636     // e**i
1637     return exp_table[exponent + 150] + (f * exp_table[exponent + 150] - 1.0);
1638 }
1639 
1640 
reference_fmax(double x,double y)1641 double reference_fmax(double x, double y)
1642 {
1643     if (isnan(y)) return x;
1644 
1645     return x >= y ? x : y;
1646 }
1647 
reference_fmin(double x,double y)1648 double reference_fmin(double x, double y)
1649 {
1650     if (isnan(y)) return x;
1651 
1652     return x <= y ? x : y;
1653 }
1654 
reference_hypot(double x,double y)1655 double reference_hypot(double x, double y)
1656 {
1657     // Since the inputs are actually floats, we don't have to worry about range
1658     // here
1659     if (isinf(x) || isinf(y)) return INFINITY;
1660 
1661     return sqrt(x * x + y * y);
1662 }
1663 
reference_ilogbl(long double x)1664 int reference_ilogbl(long double x)
1665 {
1666     extern int gDeviceILogb0, gDeviceILogbNaN;
1667 
1668     // Since we are just using this to verify double precision, we can
1669     // use the double precision ilogb here
1670     union {
1671         double f;
1672         cl_ulong u;
1673     } u;
1674     u.f = (double)x;
1675 
1676     int exponent = (int)(u.u >> 52) & 0x7ff;
1677     if (exponent == 0x7ff)
1678     {
1679         if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
1680 
1681         return CL_INT_MAX;
1682     }
1683 
1684     if (exponent == 0)
1685     { // deal with denormals
1686         u.f = x * HEX_DBL(+, 1, 0, +, 64);
1687         exponent = (cl_uint)(u.u >> 52) & 0x7ff;
1688         if (exponent == 0) return gDeviceILogb0;
1689 
1690         exponent -= 1023 + 64;
1691         return exponent;
1692     }
1693 
1694     return exponent - 1023;
1695 }
1696 
reference_relaxed_log2(double x)1697 double reference_relaxed_log2(double x) { return reference_log2(x); }
1698 
reference_log2(double x)1699 double reference_log2(double x)
1700 {
1701     if (isnan(x) || x < 0.0 || x == -INFINITY) return cl_make_nan();
1702 
1703     if (x == 0.0f) return -INFINITY;
1704 
1705     if (x == INFINITY) return INFINITY;
1706 
1707     double hi, lo;
1708     __log2_ep(&hi, &lo, x);
1709     return hi;
1710 }
1711 
reference_log1p(double x)1712 double reference_log1p(double x)
1713 { // This function is suitable only for verifying log1pf(). It produces several
1714   // double precision ulps of error.
1715 
1716     // Handle small and NaN
1717     if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 53))) return x;
1718 
1719     // deal with special values
1720     if (x <= -1.0)
1721     {
1722         if (x < -1.0) return cl_make_nan();
1723         return -INFINITY;
1724     }
1725 
1726     // infinity
1727     if (x == INFINITY) return INFINITY;
1728 
1729     // High precision result for when near 0, to avoid problems with the
1730     // reference result falling in the wrong binade.
1731     if (reference_fabs(x) < HEX_DBL(+, 1, 0, -, 28)) return (1.0 - 0.5 * x) * x;
1732 
1733     // Our polynomial is only good in the region +-2**-4.
1734     // If we aren't in that range then we need to reduce to be in that range
1735     double correctionLo =
1736         -0.0; // correction down stream to compensate for the reduction, if any
1737     double correctionHi =
1738         -0.0; // correction down stream to compensate for the exponent, if any
1739     if (reference_fabs(x) > HEX_DBL(+, 1, 0, -, 4))
1740     {
1741         x += 1.0; // double should cover any loss of precision here
1742 
1743         // separate x into (1+f) * 2**i
1744         union {
1745             double d;
1746             cl_ulong u;
1747         } u;
1748         u.d = x;
1749         int i = (int)((u.u >> 52) & 0x7ff) - 1023;
1750         u.u &= 0x000fffffffffffffULL;
1751         int index = (int)(u.u >> 48);
1752         u.u |= 0x3ff0000000000000ULL;
1753         double f = u.d;
1754 
1755         // further reduce f to be within 1/16 of 1.0
1756         static const double scale_table[16] = {
1757             1.0,
1758             HEX_DBL(+, 1, d2d2d2d6e3f79, -, 1),
1759             HEX_DBL(+, 1, b8e38e42737a1, -, 1),
1760             HEX_DBL(+, 1, a1af28711adf3, -, 1),
1761             HEX_DBL(+, 1, 8cccccd88dd65, -, 1),
1762             HEX_DBL(+, 1, 79e79e810ec8f, -, 1),
1763             HEX_DBL(+, 1, 68ba2e94df404, -, 1),
1764             HEX_DBL(+, 1, 590b216defb29, -, 1),
1765             HEX_DBL(+, 1, 4aaaaab1500ed, -, 1),
1766             HEX_DBL(+, 1, 3d70a3e0d6f73, -, 1),
1767             HEX_DBL(+, 1, 313b13bb39f4f, -, 1),
1768             HEX_DBL(+, 1, 25ed09823f1cc, -, 1),
1769             HEX_DBL(+, 1, 1b6db6e77457b, -, 1),
1770             HEX_DBL(+, 1, 11a7b96a3a34f, -, 1),
1771             HEX_DBL(+, 1, 0888888e46fea, -, 1),
1772             HEX_DBL(+, 1, 00000038e9862, -, 1)
1773         };
1774 
1775         // correction_table[i] = -log( scale_table[i] )
1776         // All entries have >= 64 bits of precision (rather than the expected
1777         // 53)
1778         static const double correction_table[16] = {
1779             -0.0,
1780             HEX_DBL(+, 1, 7a5c722c16058, -, 4),
1781             HEX_DBL(+, 1, 323db16c89ab1, -, 3),
1782             HEX_DBL(+, 1, a0f87d180629, -, 3),
1783             HEX_DBL(+, 1, 050279324e17c, -, 2),
1784             HEX_DBL(+, 1, 36f885bb270b0, -, 2),
1785             HEX_DBL(+, 1, 669b771b5cc69, -, 2),
1786             HEX_DBL(+, 1, 94203a6292a05, -, 2),
1787             HEX_DBL(+, 1, bfb4f9cb333a4, -, 2),
1788             HEX_DBL(+, 1, e982376ddb80e, -, 2),
1789             HEX_DBL(+, 1, 08d5d8769b2b2, -, 1),
1790             HEX_DBL(+, 1, 1c288bc00e0cf, -, 1),
1791             HEX_DBL(+, 1, 2ec7535b31ecb, -, 1),
1792             HEX_DBL(+, 1, 40bed0adc63fb, -, 1),
1793             HEX_DBL(+, 1, 521a5c0330615, -, 1),
1794             HEX_DBL(+, 1, 62e42f7dd092c, -, 1)
1795         };
1796 
1797         f *= scale_table[index];
1798         correctionLo = correction_table[index];
1799 
1800         // log( 2**(i) ) = i * log(2)
1801         correctionHi = (double)i * 0.693147180559945309417232121458176568;
1802 
1803         x = f - 1.0;
1804     }
1805 
1806 
1807     // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x =
1808     // [-1/16, 1/16]
1809     //          max error HEX_DBL( +, 1, 048f61f9a5eca, -, 52 )
1810     double p = HEX_DBL(-, 1, cc33de97a9d7b, -, 46)
1811         + (HEX_DBL(-, 1, fffffffff3eb7, -, 2)
1812            + (HEX_DBL(+, 1, 5555555633ef7, -, 2)
1813               + (HEX_DBL(-, 1, 00000062c78, -, 2)
1814                  + (HEX_DBL(+, 1, 9999958a3321, -, 3)
1815                     + (HEX_DBL(-, 1, 55534ce65c347, -, 3)
1816                        + (HEX_DBL(+, 1, 24957208391a5, -, 3)
1817                           + (HEX_DBL(-, 1, 02287b9a5b4a1, -, 3)
1818                              + HEX_DBL(+, 1, c757d922180ed, -, 4) * x)
1819                               * x)
1820                            * x)
1821                         * x)
1822                      * x)
1823                   * x)
1824                * x)
1825             * x;
1826 
1827     // log(x+1) = x * p(x) + x
1828     x += x * p;
1829 
1830     return correctionHi + (correctionLo + x);
1831 }
1832 
reference_logb(double x)1833 double reference_logb(double x)
1834 {
1835     union {
1836         float f;
1837         cl_uint u;
1838     } u;
1839     u.f = (float)x;
1840 
1841     cl_int exponent = (u.u >> 23) & 0xff;
1842     if (exponent == 0xff) return x * x;
1843 
1844     if (exponent == 0)
1845     { // deal with denormals
1846         u.u = (u.u & 0x007fffff) | 0x3f800000;
1847         u.f -= 1.0f;
1848         exponent = (u.u >> 23) & 0xff;
1849         if (exponent == 0) return -INFINITY;
1850 
1851         return exponent - (127 + 126);
1852     }
1853 
1854     return exponent - 127;
1855 }
1856 
reference_relaxed_reciprocal(double x)1857 double reference_relaxed_reciprocal(double x) { return 1.0f / ((float)x); }
1858 
reference_reciprocal(double x)1859 double reference_reciprocal(double x) { return 1.0 / x; }
1860 
reference_remainder(double x,double y)1861 double reference_remainder(double x, double y)
1862 {
1863     int i;
1864     return reference_remquo(x, y, &i);
1865 }
1866 
reference_lgamma(double x)1867 double reference_lgamma(double x)
1868 {
1869     /*
1870      * ====================================================
1871      * This function is from fdlibm. http://www.netlib.org
1872      * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1873      *
1874      * Developed at SunSoft, a Sun Microsystems, Inc. business.
1875      * Permission to use, copy, modify, and distribute this
1876      * software is freely granted, provided that this notice
1877      * is preserved.
1878      * ====================================================
1879      *
1880      */
1881 
1882     static const double // two52 = 4.50359962737049600000e+15, /* 0x43300000,
1883                         // 0x00000000 */
1884         half = 5.00000000000000000000e-01, /* 0x3FE00000,
1885                                               0x00000000 */
1886         one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
1887         pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
1888         a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
1889         a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
1890         a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
1891         a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
1892         a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
1893         a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
1894         a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
1895         a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
1896         a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
1897         a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
1898         a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
1899         a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
1900         tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
1901         tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
1902         /* tt = -(tail of tf) */
1903         tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
1904         t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
1905         t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
1906         t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
1907         t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
1908         t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
1909         t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
1910         t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
1911         t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
1912         t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
1913         t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
1914         t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
1915         t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
1916         t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
1917         t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
1918         t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
1919         u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
1920         u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
1921         u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
1922         u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
1923         u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
1924         u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
1925         v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
1926         v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
1927         v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
1928         v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
1929         v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
1930         s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
1931         s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
1932         s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
1933         s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
1934         s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
1935         s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
1936         s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
1937         r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
1938         r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
1939         r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
1940         r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
1941         r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
1942         r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
1943         w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
1944         w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
1945         w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
1946         w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
1947         w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
1948         w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
1949         w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
1950 
1951     static const double zero = 0.00000000000000000000e+00;
1952     double nadj = zero;
1953     double t, y, z, p, p1, p2, p3, q, r, w;
1954     cl_int i, hx, lx, ix;
1955 
1956     union {
1957         double d;
1958         cl_ulong u;
1959     } u;
1960     u.d = x;
1961 
1962     hx = (cl_int)(u.u >> 32);
1963     lx = (cl_int)(u.u & 0xffffffffULL);
1964 
1965     /* purge off +-inf, NaN, +-0, and negative arguments */
1966     //    *signgamp = 1;
1967     ix = hx & 0x7fffffff;
1968     if (ix >= 0x7ff00000) return x * x;
1969     if ((ix | lx) == 0) return INFINITY;
1970     if (ix < 0x3b900000)
1971     { /* |x|<2**-70, return -log(|x|) */
1972         if (hx < 0)
1973         {
1974             //            *signgamp = -1;
1975             return -reference_log(-x);
1976         }
1977         else
1978             return -reference_log(x);
1979     }
1980     if (hx < 0)
1981     {
1982         if (ix >= 0x43300000) /* |x|>=2**52, must be -integer */
1983             return INFINITY;
1984         t = reference_sinpi(x);
1985         if (t == zero) return INFINITY; /* -integer */
1986         nadj = reference_log(pi / reference_fabs(t * x));
1987         //        if(t<zero) *signgamp = -1;
1988         x = -x;
1989     }
1990 
1991     /* purge off 1 and 2 */
1992     if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0))
1993         r = 0;
1994     /* for x < 2.0 */
1995     else if (ix < 0x40000000)
1996     {
1997         if (ix <= 0x3feccccc)
1998         { /* lgamma(x) = lgamma(x+1)-log(x) */
1999             r = -reference_log(x);
2000             if (ix >= 0x3FE76944)
2001             {
2002                 y = 1.0 - x;
2003                 i = 0;
2004             }
2005             else if (ix >= 0x3FCDA661)
2006             {
2007                 y = x - (tc - one);
2008                 i = 1;
2009             }
2010             else
2011             {
2012                 y = x;
2013                 i = 2;
2014             }
2015         }
2016         else
2017         {
2018             r = zero;
2019             if (ix >= 0x3FFBB4C3)
2020             {
2021                 y = 2.0 - x;
2022                 i = 0;
2023             } /* [1.7316,2] */
2024             else if (ix >= 0x3FF3B4C4)
2025             {
2026                 y = x - tc;
2027                 i = 1;
2028             } /* [1.23,1.73] */
2029             else
2030             {
2031                 y = x - one;
2032                 i = 2;
2033             }
2034         }
2035         switch (i)
2036         {
2037             case 0:
2038                 z = y * y;
2039                 p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
2040                 p2 = z
2041                     * (a1
2042                        + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
2043                 p = y * p1 + p2;
2044                 r += (p - 0.5 * y);
2045                 break;
2046             case 1:
2047                 z = y * y;
2048                 w = z * y;
2049                 p1 = t0
2050                     + w
2051                         * (t3
2052                            + w * (t6 + w * (t9 + w * t12))); /* parallel comp */
2053                 p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
2054                 p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
2055                 p = z * p1 - (tt - w * (p2 + y * p3));
2056                 r += (tf + p);
2057                 break;
2058             case 2:
2059                 p1 = y
2060                     * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
2061                 p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
2062                 r += (-0.5 * y + p1 / p2);
2063         }
2064     }
2065     else if (ix < 0x40200000)
2066     { /* x < 8.0 */
2067         i = (int)x;
2068         t = zero;
2069         y = x - (double)i;
2070         p = y
2071             * (s0
2072                + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
2073         q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
2074         r = half * y + p / q;
2075         z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
2076         switch (i)
2077         {
2078             case 7: z *= (y + 6.0); /* FALLTHRU */
2079             case 6: z *= (y + 5.0); /* FALLTHRU */
2080             case 5: z *= (y + 4.0); /* FALLTHRU */
2081             case 4: z *= (y + 3.0); /* FALLTHRU */
2082             case 3:
2083                 z *= (y + 2.0); /* FALLTHRU */
2084                 r += reference_log(z);
2085                 break;
2086         }
2087         /* 8.0 <= x < 2**58 */
2088     }
2089     else if (ix < 0x43900000)
2090     {
2091         t = reference_log(x);
2092         z = one / x;
2093         y = z * z;
2094         w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
2095         r = (x - half) * (t - one) + w;
2096     }
2097     else
2098         /* 2**58 <= x <= inf */
2099         r = x * (reference_log(x) - one);
2100     if (hx < 0) r = nadj - r;
2101     return r;
2102 }
2103 
2104 #endif // _MSC_VER
2105 
reference_assignment(double x)2106 double reference_assignment(double x) { return x; }
2107 
reference_not(double x)2108 int reference_not(double x)
2109 {
2110     int r = !x;
2111     return r;
2112 }
2113 
2114 #pragma mark -
2115 #pragma mark Double testing
2116 
2117 #ifndef M_PIL
2118 #define M_PIL                                                                  \
2119     3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
2120 #endif
2121 
2122 static long double reduce1l(long double x);
2123 
2124 #ifdef __PPC__
2125 // Since long double on PPC is really extended precision double arithmetic
2126 // consisting of two doubles (a high and low). This form of long double has
2127 // the potential of representing a number with more than LDBL_MANT_DIG digits
2128 // such that reduction algorithm used for other architectures will not work.
2129 // Instead and alternate reduction method is used.
2130 
reduce1l(long double x)2131 static long double reduce1l(long double x)
2132 {
2133     union {
2134         long double ld;
2135         double d[2];
2136     } u;
2137 
2138     // Reduce the high and low halfs separately.
2139     u.ld = x;
2140     return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
2141 }
2142 
2143 #else // !__PPC__
2144 
reduce1l(long double x)2145 static long double reduce1l(long double x)
2146 {
2147     static long double unit_exp = 0;
2148     if (0.0L == unit_exp) unit_exp = scalbnl(1.0L, LDBL_MANT_DIG);
2149 
2150     if (reference_fabsl(x) >= unit_exp)
2151     {
2152         if (reference_fabsl(x) == INFINITY) return cl_make_nan();
2153 
2154         return 0.0L; // we patch up the sign for sinPi and cosPi later, since
2155                      // they need different signs
2156     }
2157 
2158     // Find the nearest multiple of 2
2159     const long double r = reference_copysignl(unit_exp, x);
2160     long double z = x + r;
2161     z -= r;
2162 
2163     // subtract it from x. Value is now in the range -1 <= x <= 1
2164     return x - z;
2165 }
2166 #endif // __PPC__
2167 
reference_acospil(long double x)2168 long double reference_acospil(long double x)
2169 {
2170     return reference_acosl(x) / M_PIL;
2171 }
reference_asinpil(long double x)2172 long double reference_asinpil(long double x)
2173 {
2174     return reference_asinl(x) / M_PIL;
2175 }
reference_atanpil(long double x)2176 long double reference_atanpil(long double x)
2177 {
2178     return reference_atanl(x) / M_PIL;
2179 }
reference_atan2pil(long double y,long double x)2180 long double reference_atan2pil(long double y, long double x)
2181 {
2182     return reference_atan2l(y, x) / M_PIL;
2183 }
reference_cospil(long double x)2184 long double reference_cospil(long double x)
2185 {
2186     if (reference_fabsl(x) >= HEX_LDBL(+, 1, 0, +, 54))
2187     {
2188         if (reference_fabsl(x) == INFINITY) return cl_make_nan();
2189 
2190         // Note this probably fails for odd values between 0x1.0p52 and
2191         // 0x1.0p53. However, when starting with single precision inputs, there
2192         // will be no odd values.
2193 
2194         return 1.0L;
2195     }
2196 
2197     x = reduce1l(x);
2198 
2199 #if DBL_MANT_DIG >= LDBL_MANT_DIG
2200 
2201     // phase adjust
2202     double xhi = 0.0;
2203     double xlo = 0.0;
2204     xhi = (double)x + 0.5;
2205 
2206     if (reference_fabsl(x) > 0.5L)
2207     {
2208         xlo = xhi - x;
2209         xlo = 0.5 - xlo;
2210     }
2211     else
2212     {
2213         xlo = xhi - 0.5;
2214         xlo = x - xlo;
2215     }
2216 
2217     // reduce to [-0.5, 0.5]
2218     if (xhi < -0.5)
2219     {
2220         xhi = -1.0 - xhi;
2221         xlo = -xlo;
2222     }
2223     else if (xhi > 0.5)
2224     {
2225         xhi = 1.0 - xhi;
2226         xlo = -xlo;
2227     }
2228 
2229     // cosPi zeros are all +0
2230     if (xhi == 0.0 && xlo == 0.0) return 0.0;
2231 
2232     xhi *= M_PI;
2233     xlo *= M_PI;
2234 
2235     xhi += xlo;
2236 
2237     return reference_sinl(xhi);
2238 
2239 #else
2240     // phase adjust
2241     x += 0.5L;
2242 
2243     // reduce to [-0.5, 0.5]
2244     if (x < -0.5L)
2245         x = -1.0L - x;
2246     else if (x > 0.5L)
2247         x = 1.0L - x;
2248 
2249     // cosPi zeros are all +0
2250     if (x == 0.0L) return 0.0L;
2251 
2252     return reference_sinl(x * M_PIL);
2253 #endif
2254 }
2255 
reference_dividel(long double x,long double y)2256 long double reference_dividel(long double x, long double y)
2257 {
2258     double dx = x;
2259     double dy = y;
2260     return dx / dy;
2261 }
2262 
2263 struct double_double
2264 {
2265     double hi, lo;
2266 };
2267 
2268 // Split doubles_double into a series of consecutive 26-bit precise doubles and
2269 // a remainder. Note for later -- for multiplication, it might be better to
2270 // split each double into a power of two and two 26 bit portions
2271 //                      multiplication of a double double by a known power of
2272 //                      two is cheap. The current approach causes some inexact
2273 //                      arithmetic in mul_dd.
split_dd(double_double x,double_double * hi,double_double * lo)2274 static inline void split_dd(double_double x, double_double *hi,
2275                             double_double *lo)
2276 {
2277     union {
2278         double d;
2279         cl_ulong u;
2280     } u;
2281     u.d = x.hi;
2282     u.u &= 0xFFFFFFFFF8000000ULL;
2283     hi->hi = u.d;
2284     x.hi -= u.d;
2285 
2286     u.d = x.hi;
2287     u.u &= 0xFFFFFFFFF8000000ULL;
2288     hi->lo = u.d;
2289     x.hi -= u.d;
2290 
2291     double temp = x.hi;
2292     x.hi += x.lo;
2293     x.lo -= x.hi - temp;
2294     u.d = x.hi;
2295     u.u &= 0xFFFFFFFFF8000000ULL;
2296     lo->hi = u.d;
2297     x.hi -= u.d;
2298 
2299     lo->lo = x.hi + x.lo;
2300 }
2301 
accum_d(double_double a,double b)2302 static inline double_double accum_d(double_double a, double b)
2303 {
2304     double temp;
2305     if (fabs(b) > fabs(a.hi))
2306     {
2307         temp = a.hi;
2308         a.hi += b;
2309         a.lo += temp - (a.hi - b);
2310     }
2311     else
2312     {
2313         temp = a.hi;
2314         a.hi += b;
2315         a.lo += b - (a.hi - temp);
2316     }
2317 
2318     if (isnan(a.lo)) a.lo = 0.0;
2319 
2320     return a;
2321 }
2322 
add_dd(double_double a,double_double b)2323 static inline double_double add_dd(double_double a, double_double b)
2324 {
2325     double_double r = { -0.0, -0.0 };
2326 
2327     if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi
2328         || 0.0 == b.hi)
2329     {
2330         r.hi = a.hi + b.hi;
2331         r.lo = a.lo + b.lo;
2332         if (isnan(r.lo)) r.lo = 0.0;
2333         return r;
2334     }
2335 
2336     // merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|,
2337     // |b.hi| > |b.lo|, so we don't have to do the first merge pass
2338     double terms[4] = { a.hi, b.hi, a.lo, b.lo };
2339     double temp;
2340 
2341     // Sort hi terms
2342     if (fabs(terms[0]) < fabs(terms[1]))
2343     {
2344         temp = terms[0];
2345         terms[0] = terms[1];
2346         terms[1] = temp;
2347     }
2348     // sort lo terms
2349     if (fabs(terms[2]) < fabs(terms[3]))
2350     {
2351         temp = terms[2];
2352         terms[2] = terms[3];
2353         terms[3] = temp;
2354     }
2355     // Fix case where small high term is less than large low term
2356     if (fabs(terms[1]) < fabs(terms[2]))
2357     {
2358         temp = terms[1];
2359         terms[1] = terms[2];
2360         terms[2] = temp;
2361     }
2362 
2363     // accumulate the results
2364     r.hi = terms[2] + terms[3];
2365     r.lo = terms[3] - (r.hi - terms[2]);
2366 
2367     temp = r.hi;
2368     r.hi += terms[1];
2369     r.lo += temp - (r.hi - terms[1]);
2370 
2371     temp = r.hi;
2372     r.hi += terms[0];
2373     r.lo += temp - (r.hi - terms[0]);
2374 
2375     // canonicalize the result
2376     temp = r.hi;
2377     r.hi += r.lo;
2378     r.lo = r.lo - (r.hi - temp);
2379     if (isnan(r.lo)) r.lo = 0.0;
2380 
2381     return r;
2382 }
2383 
mul_dd(double_double a,double_double b)2384 static inline double_double mul_dd(double_double a, double_double b)
2385 {
2386     double_double result = { -0.0, -0.0 };
2387 
2388     // Inf, nan and 0
2389     if (isnan(a.hi) || isnan(b.hi) || isinf(a.hi) || isinf(b.hi) || 0.0 == a.hi
2390         || 0.0 == b.hi)
2391     {
2392         result.hi = a.hi * b.hi;
2393         return result;
2394     }
2395 
2396     double_double ah, al, bh, bl;
2397     split_dd(a, &ah, &al);
2398     split_dd(b, &bh, &bl);
2399 
2400     double p0 = ah.hi * bh.hi; // exact    (52 bits in product) 0
2401     double p1 = ah.hi * bh.lo; // exact    (52 bits in product) 26
2402     double p2 = ah.lo * bh.hi; // exact    (52 bits in product) 26
2403     double p3 = ah.lo * bh.lo; // exact    (52 bits in product) 52
2404     double p4 = al.hi * bh.hi; // exact    (52 bits in product) 52
2405     double p5 = al.hi * bh.lo; // exact    (52 bits in product) 78
2406     double p6 = al.lo * bh.hi; // inexact  (54 bits in product) 78
2407     double p7 = al.lo * bh.lo; // inexact  (54 bits in product) 104
2408     double p8 = ah.hi * bl.hi; // exact    (52 bits in product) 52
2409     double p9 = ah.hi * bl.lo; // inexact  (54 bits in product) 78
2410     double pA = ah.lo * bl.hi; // exact    (52 bits in product) 78
2411     double pB = ah.lo * bl.lo; // inexact  (54 bits in product) 104
2412     double pC = al.hi * bl.hi; // exact    (52 bits in product) 104
2413     // the last 3 terms are two low to appear in the result
2414 
2415 
2416     // take advantage of the known relative magnitudes of the partial products
2417     // to avoid some sorting Combine 2**-78 and 2**-104 terms. Here we are a bit
2418     // sloppy about canonicalizing the double_doubles
2419     double_double t0 = { pA, pC };
2420     double_double t1 = { p9, pB };
2421     double_double t2 = { p6, p7 };
2422     double temp0, temp1, temp2;
2423 
2424     t0 = accum_d(t0, p5); // there is an extra 2**-78 term to deal with
2425 
2426     // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the
2427     // double_doubles
2428     temp0 = t0.hi;
2429     temp1 = t1.hi;
2430     temp2 = t2.hi;
2431     t0.hi += p3;
2432     t1.hi += p4;
2433     t2.hi += p8;
2434     temp0 -= t0.hi - p3;
2435     temp1 -= t1.hi - p4;
2436     temp2 -= t2.hi - p8;
2437     t0.lo += temp0;
2438     t1.lo += temp1;
2439     t2.lo += temp2;
2440 
2441     // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the
2442     // double_doubles
2443     temp1 = t1.hi;
2444     temp2 = t2.hi;
2445     t1.hi += p1;
2446     t2.hi += p2;
2447     temp1 -= t1.hi - p1;
2448     temp2 -= t2.hi - p2;
2449     t1.lo += temp1;
2450     t2.lo += temp2;
2451 
2452     // Combine accumulators to get the low bits of result
2453     t1 = add_dd(t1, add_dd(t2, t0));
2454 
2455     // Add in MSB's, and round to precision
2456     return accum_d(t1, p0); // canonicalizes
2457 }
2458 
2459 
reference_exp10l(long double z)2460 long double reference_exp10l(long double z)
2461 {
2462     const double_double log2_10 = { HEX_DBL(+, 1, a934f0979a371, +, 1),
2463                                     HEX_DBL(+, 1, 7f2495fb7fa6d, -, 53) };
2464     double_double x;
2465     int j;
2466 
2467     // Handle NaNs
2468     if (isnan(z)) return z;
2469 
2470     // init x
2471     x.hi = z;
2472     x.lo = z - x.hi;
2473 
2474 
2475     // 10**x = exp2( x * log2(10) )
2476 
2477     x = mul_dd(x, log2_10); // x * log2(10)
2478 
2479     // Deal with overflow and underflow for exp2(x) stage next
2480     if (x.hi >= 1025) return INFINITY;
2481 
2482     if (x.hi < -1075 - 24) return +0.0;
2483 
2484     // find nearest integer to x
2485     int i = (int)rint(x.hi);
2486 
2487     // x now holds fractional part.  The result would be then 2**i  * exp2( x )
2488     x.hi -= i;
2489 
2490     // We could attempt to find a minimax polynomial for exp2(x) over the range
2491     // x = [-0.5, 0.5]. However, this would converge very slowly near the
2492     // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
2493     // requiring something like a 20th order polynomial to get 53 + 24 bits of
2494     // precision. Instead we further reduce the range to [-1/32, 1/32] by
2495     // observing that
2496     //
2497     //  2**(a+b) = 2**a * 2**b
2498     //
2499     // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
2500     // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
2501     // value of n/16 from x.
2502     const double_double corrections[17] = {
2503         { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
2504           HEX_DBL(-, 1, bdd3413b26456, -, 55) },
2505         { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
2506           HEX_DBL(-, 1, 41577ee04992f, -, 56) },
2507         { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
2508           HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
2509         { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
2510           HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
2511         { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
2512           HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
2513         { HEX_DBL(+, 1, c199bdd85529c, -, 1),
2514           HEX_DBL(+, 1, 11065895048dd, -, 56) },
2515         { HEX_DBL(+, 1, d5818dcfba487, -, 1),
2516           HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
2517         { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
2518           HEX_DBL(-, 1, e9c23179c2893, -, 55) },
2519         { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
2520         { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
2521           HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
2522         { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
2523           HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
2524         { HEX_DBL(+, 1, 2387a6e756238, +, 0),
2525           HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
2526         { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
2527           HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
2528         { HEX_DBL(+, 1, 3dea64c123422, +, 0),
2529           HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
2530         { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
2531           HEX_DBL(+, 1, d4397afec42e2, -, 56) },
2532         { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
2533           HEX_DBL(+, 1, 6324c054647ad, -, 54) },
2534         { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
2535           HEX_DBL(-, 1, bdd3413b26456, -, 54) }
2536 
2537     };
2538     int index = (int)rint(x.hi * 16.0);
2539     x.hi -= (double)index * 0.0625;
2540 
2541     // canonicalize x
2542     double temp = x.hi;
2543     x.hi += x.lo;
2544     x.lo -= x.hi - temp;
2545 
2546     // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max
2547     // Error: 2 * 0x1.e112p-87
2548     const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
2549                                   HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
2550                                 { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
2551                                   HEX_DBL(-, 1, 5e4987a631846, -, 57) },
2552                                 { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
2553                                   HEX_DBL(-, 1, d323200a05713, -, 59) },
2554                                 { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
2555                                   HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
2556                                 { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
2557                                   HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
2558                                 { HEX_DBL(+, 1, 430912f7e8373, -, 13),
2559                                   HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
2560                                 { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
2561                                   HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
2562                                 { HEX_DBL(+, 1, 62c150eb16465, -, 20),
2563                                   HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
2564                                 { HEX_DBL(+, 1, b52502b5e953, -, 24),
2565                                   HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
2566     size_t count = sizeof(c) / sizeof(c[0]);
2567 
2568     // Do polynomial
2569     double_double r = c[count - 1];
2570     for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
2571 
2572     // unwind approximation
2573     r = mul_dd(r, x); // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
2574 
2575     // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
2576     //  exp2(x) = (r + 1) * correction = r * correction + correction
2577     r = mul_dd(r, corrections[index + 8]);
2578     r = add_dd(r, corrections[index + 8]);
2579 
2580     // Format result for output:
2581 
2582     // Get mantissa
2583     long double m = ((long double)r.hi + (long double)r.lo);
2584 
2585     // Handle a pesky overflow cases when long double = double
2586     if (i > 512)
2587     {
2588         m *= HEX_DBL(+, 1, 0, +, 512);
2589         i -= 512;
2590     }
2591     else if (i < -512)
2592     {
2593         m *= HEX_DBL(+, 1, 0, -, 512);
2594         i += 512;
2595     }
2596 
2597     return m * ldexpl(1.0L, i);
2598 }
2599 
2600 
fallback_frexp(double x,int * iptr)2601 static double fallback_frexp(double x, int *iptr)
2602 {
2603     cl_ulong u, v;
2604     double fu, fv;
2605 
2606     memcpy(&u, &x, sizeof(u));
2607 
2608     cl_ulong exponent = u & 0x7ff0000000000000ULL;
2609     cl_ulong mantissa = u & ~0x7ff0000000000000ULL;
2610 
2611     // add 1 to the exponent
2612     exponent += 0x0010000000000000ULL;
2613 
2614     if ((cl_long)exponent < (cl_long)0x0020000000000000LL)
2615     { // subnormal, NaN, Inf
2616         mantissa |= 0x3fe0000000000000ULL;
2617 
2618         v = mantissa & 0xfff0000000000000ULL;
2619         u = mantissa;
2620         memcpy(&fv, &v, sizeof(v));
2621         memcpy(&fu, &u, sizeof(u));
2622 
2623         fu -= fv;
2624 
2625         memcpy(&v, &fv, sizeof(v));
2626         memcpy(&u, &fu, sizeof(u));
2627 
2628         exponent = u & 0x7ff0000000000000ULL;
2629         mantissa = u & ~0x7ff0000000000000ULL;
2630 
2631         *iptr = (exponent >> 52) + (-1022 + 1 - 1022);
2632         u = mantissa | 0x3fe0000000000000ULL;
2633         memcpy(&fu, &u, sizeof(u));
2634         return fu;
2635     }
2636 
2637     *iptr = (exponent >> 52) - 1023;
2638     u = mantissa | 0x3fe0000000000000ULL;
2639     memcpy(&fu, &u, sizeof(u));
2640     return fu;
2641 }
2642 
2643 // Assumes zeros, infinities and NaNs handed elsewhere
extract(double x,cl_ulong * mant)2644 static inline int extract(double x, cl_ulong *mant)
2645 {
2646     static double (*frexpp)(double, int *) = NULL;
2647     int e;
2648 
2649     // verify that frexp works properly
2650     if (NULL == frexpp)
2651     {
2652         if (0.5 == frexp(HEX_DBL(+, 1, 0, -, 1030), &e) && e == -1029)
2653             frexpp = frexp;
2654         else
2655             frexpp = fallback_frexp;
2656     }
2657 
2658     *mant = (cl_ulong)(HEX_DBL(+, 1, 0, +, 64) * fabs(frexpp(x, &e)));
2659     return e - 1;
2660 }
2661 
2662 // Return 128-bit product of a*b  as (hi << 64) + lo
mul128(cl_ulong a,cl_ulong b,cl_ulong * hi,cl_ulong * lo)2663 static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo)
2664 {
2665     cl_ulong alo = a & 0xffffffffULL;
2666     cl_ulong ahi = a >> 32;
2667     cl_ulong blo = b & 0xffffffffULL;
2668     cl_ulong bhi = b >> 32;
2669     cl_ulong aloblo = alo * blo;
2670     cl_ulong alobhi = alo * bhi;
2671     cl_ulong ahiblo = ahi * blo;
2672     cl_ulong ahibhi = ahi * bhi;
2673 
2674     alobhi += (aloblo >> 32)
2675         + (ahiblo
2676            & 0xffffffffULL); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   =
2677                              // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
2678     *hi = ahibhi + (alobhi >> 32)
2679         + (ahiblo >> 32); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   =
2680                           // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
2681     *lo = (aloblo & 0xffffffffULL) | (alobhi << 32);
2682 }
2683 
round_to_nearest_even_double(cl_ulong hi,cl_ulong lo,int exponent)2684 static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
2685                                            int exponent)
2686 {
2687     union {
2688         cl_ulong u;
2689         cl_double d;
2690     } u;
2691 
2692     // edges
2693     if (exponent > 1023) return INFINITY;
2694     if (exponent == -1075 && (hi | (lo != 0)) > 0x8000000000000000ULL)
2695         return HEX_DBL(+, 1, 0, -, 1074);
2696     if (exponent <= -1075) return 0.0;
2697 
2698     // Figure out which bits go where
2699     int shift = 11;
2700     if (exponent < -1022)
2701     {
2702         shift -= 1022 + exponent; // subnormal: shift is not 52
2703         exponent = -1023; //              set exponent to 0
2704     }
2705     else
2706         hi &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
2707                                      // it.
2708 
2709     // Assemble the double (round toward zero)
2710     u.u = (hi >> shift) | ((cl_ulong)(exponent + 1023) << 52);
2711 
2712     // put a representation of the residual bits into hi
2713     hi <<= (64 - shift);
2714     hi |= lo >> shift;
2715     lo <<= (64 - shift);
2716     hi |= lo != 0;
2717 
2718     // round to nearest, ties to even
2719     if (hi < 0x8000000000000000ULL) return u.d;
2720     if (hi == 0x8000000000000000ULL)
2721         u.u += u.u & 1ULL;
2722     else
2723         u.u++;
2724 
2725     return u.d;
2726 }
2727 
2728 // Shift right.  Bits lost on the right will be OR'd together and OR'd with the
2729 // LSB
shift_right_sticky_128(cl_ulong * hi,cl_ulong * lo,int shift)2730 static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift)
2731 {
2732     cl_ulong sticky = 0;
2733     cl_ulong h = *hi;
2734     cl_ulong l = *lo;
2735 
2736     if (shift >= 64)
2737     {
2738         shift -= 64;
2739         sticky = 0 != lo;
2740         l = h;
2741         h = 0;
2742         if (shift >= 64)
2743         {
2744             sticky |= (0 != l);
2745             l = 0;
2746         }
2747         else
2748         {
2749             sticky |= (0 != (l << (64 - shift)));
2750             l >>= shift;
2751         }
2752     }
2753     else
2754     {
2755         sticky |= (0 != (l << (64 - shift)));
2756         l >>= shift;
2757         l |= h << (64 - shift);
2758         h >>= shift;
2759     }
2760 
2761     *lo = l | sticky;
2762     *hi = h;
2763 }
2764 
2765 // 128-bit add  of ((*hi << 64) + *lo) + ((chi << 64) + clo)
2766 // If the 129 bit result doesn't fit, bits lost off the right end will be OR'd
2767 // with the LSB
add128(cl_ulong * hi,cl_ulong * lo,cl_ulong chi,cl_ulong clo,int * exponent)2768 static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
2769                           cl_ulong clo, int *exponent)
2770 {
2771     cl_ulong carry, carry2;
2772     // extended precision add
2773     clo = add_carry(*lo, clo, &carry);
2774     chi = add_carry(*hi, chi, &carry2);
2775     chi = add_carry(chi, carry, &carry);
2776 
2777     // If we overflowed the 128 bit result
2778     if (carry || carry2)
2779     {
2780         carry = clo & 1; // set aside low bit
2781         clo >>= 1; // right shift low 1
2782         clo |= carry; // or back in the low bit, so we don't come to believe
2783                       // this is an exact half way case for rounding
2784         clo |= chi << 63; // move lowest high bit into highest bit of lo
2785         chi >>= 1; // right shift hi
2786         chi |= 0x8000000000000000ULL; // move the carry bit into hi.
2787         *exponent = *exponent + 1;
2788     }
2789 
2790     *hi = chi;
2791     *lo = clo;
2792 }
2793 
2794 // 128-bit subtract  of ((chi << 64) + clo)  - ((*hi << 64) + *lo)
sub128(cl_ulong * chi,cl_ulong * clo,cl_ulong hi,cl_ulong lo,cl_ulong * signC,int * expC)2795 static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
2796                           cl_ulong lo, cl_ulong *signC, int *expC)
2797 {
2798     cl_ulong rHi = *chi;
2799     cl_ulong rLo = *clo;
2800     cl_ulong carry, carry2;
2801 
2802     // extended precision subtract
2803     rLo = sub_carry(rLo, lo, &carry);
2804     rHi = sub_carry(rHi, hi, &carry2);
2805     rHi = sub_carry(rHi, carry, &carry);
2806 
2807     // Check for sign flip
2808     if (carry || carry2)
2809     {
2810         *signC ^= 0x8000000000000000ULL;
2811 
2812         // negate rLo, rHi:   -x = (x ^ -1) + 1
2813         rLo ^= -1ULL;
2814         rHi ^= -1ULL;
2815         rLo++;
2816         rHi += 0 == rLo;
2817     }
2818 
2819     // normalize -- move the most significant non-zero bit to the MSB, and
2820     // adjust exponent accordingly
2821     if (rHi == 0)
2822     {
2823         rHi = rLo;
2824         *expC = *expC - 64;
2825         rLo = 0;
2826     }
2827 
2828     if (rHi)
2829     {
2830         int shift = 32;
2831         cl_ulong test = 1ULL << 32;
2832         while (0 == (rHi & 0x8000000000000000ULL))
2833         {
2834             if (rHi < test)
2835             {
2836                 rHi <<= shift;
2837                 rHi |= rLo >> (64 - shift);
2838                 rLo <<= shift;
2839                 *expC = *expC - shift;
2840             }
2841             shift >>= 1;
2842             test <<= shift;
2843         }
2844     }
2845     else
2846     {
2847         // zero
2848         *expC = INT_MIN;
2849         *signC = 0;
2850     }
2851 
2852 
2853     *chi = rHi;
2854     *clo = rLo;
2855 }
2856 
reference_fmal(long double x,long double y,long double z)2857 long double reference_fmal(long double x, long double y, long double z)
2858 {
2859     static const cl_ulong kMSB = 0x8000000000000000ULL;
2860 
2861     // cast values back to double. This is an exact function, so
2862     double a = x;
2863     double b = y;
2864     double c = z;
2865 
2866     // Make bits accessible
2867     union {
2868         cl_ulong u;
2869         cl_double d;
2870     } ua;
2871     ua.d = a;
2872     union {
2873         cl_ulong u;
2874         cl_double d;
2875     } ub;
2876     ub.d = b;
2877     union {
2878         cl_ulong u;
2879         cl_double d;
2880     } uc;
2881     uc.d = c;
2882 
2883     // deal with Nans, infinities and zeros
2884     if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
2885         || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
2886         0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
2887         0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
2888     {
2889         if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
2890 
2891         a = (double)reference_multiplyl(
2892             a, b); // some risk that the compiler will insert a non-compliant
2893                    // fma here on some platforms.
2894         return reference_addl(
2895             a,
2896             c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
2897     }
2898 
2899     // extract exponent and mantissa
2900     //   exponent is a standard unbiased signed integer
2901     //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
2902     cl_ulong mantA, mantB, mantC;
2903     int expA = extract(a, &mantA);
2904     int expB = extract(b, &mantB);
2905     int expC = extract(c, &mantC);
2906     cl_ulong signC = uc.u & kMSB; // We'll need the sign bit of C later to
2907                                   // decide if we are adding or subtracting
2908 
2909     // exact product of A and B
2910     int exponent = expA + expB;
2911     cl_ulong sign = (ua.u ^ ub.u) & kMSB;
2912     cl_ulong hi, lo;
2913     mul128(mantA, mantB, &hi, &lo);
2914 
2915     // renormalize
2916     if (0 == (kMSB & hi))
2917     {
2918         hi <<= 1;
2919         hi |= lo >> 63;
2920         lo <<= 1;
2921     }
2922     else
2923         exponent++; // 2**63 * 2**63 gives 2**126. If the MSB was set, then our
2924                     // exponent increased.
2925 
2926     // infinite precision add
2927     cl_ulong chi = mantC;
2928     cl_ulong clo = 0;
2929 
2930     if (exponent >= expC)
2931     {
2932         // Normalize C relative to the product
2933         if (exponent > expC)
2934             shift_right_sticky_128(&chi, &clo, exponent - expC);
2935 
2936         // Add
2937         if (sign ^ signC)
2938             sub128(&hi, &lo, chi, clo, &sign, &exponent);
2939         else
2940             add128(&hi, &lo, chi, clo, &exponent);
2941     }
2942     else
2943     {
2944         // Shift the product relative to C so that their exponents match
2945         shift_right_sticky_128(&hi, &lo, expC - exponent);
2946 
2947         // add
2948         if (sign ^ signC)
2949             sub128(&chi, &clo, hi, lo, &signC, &expC);
2950         else
2951             add128(&chi, &clo, hi, lo, &expC);
2952 
2953         hi = chi;
2954         lo = clo;
2955         exponent = expC;
2956         sign = signC;
2957     }
2958 
2959     // round
2960     ua.d = round_to_nearest_even_double(hi, lo, exponent);
2961 
2962     // Set the sign
2963     ua.u |= sign;
2964 
2965     return ua.d;
2966 }
2967 
2968 
reference_madl(long double a,long double b,long double c)2969 long double reference_madl(long double a, long double b, long double c)
2970 {
2971     return a * b + c;
2972 }
2973 
reference_recipl(long double x)2974 long double reference_recipl(long double x) { return 1.0L / x; }
2975 
reference_rootnl(long double x,int i)2976 long double reference_rootnl(long double x, int i)
2977 {
2978     // rootn ( x, 0 )  returns a NaN.
2979     if (0 == i) return cl_make_nan();
2980 
2981     // rootn ( x, n )  returns a NaN for x < 0 and n is even.
2982     if (x < 0.0L && 0 == (i & 1)) return cl_make_nan();
2983 
2984     if (isinf(x))
2985     {
2986         if (i < 0) return reference_copysignl(0.0L, x);
2987 
2988         return x;
2989     }
2990 
2991     if (x == 0.0)
2992     {
2993         switch (i & 0x80000001)
2994         {
2995             // rootn ( +-0,  n ) is +0 for even n > 0.
2996             case 0: return 0.0L;
2997 
2998             // rootn ( +-0,  n ) is +-0 for odd n > 0.
2999             case 1: return x;
3000 
3001             // rootn ( +-0,  n ) is +inf for even n < 0.
3002             case 0x80000000: return INFINITY;
3003 
3004             // rootn ( +-0,  n ) is +-inf for odd n < 0.
3005             case 0x80000001: return copysign(INFINITY, x);
3006         }
3007     }
3008 
3009     if (i == 1) return x;
3010 
3011     if (i == -1) return 1.0 / x;
3012 
3013     long double sign = x;
3014     x = reference_fabsl(x);
3015     double iHi, iLo;
3016     DivideDD(&iHi, &iLo, 1.0, i);
3017     x = reference_powl(x, iHi) * reference_powl(x, iLo);
3018 
3019     return reference_copysignl(x, sign);
3020 }
3021 
reference_rsqrtl(long double x)3022 long double reference_rsqrtl(long double x) { return 1.0L / sqrtl(x); }
3023 
reference_sinpil(long double x)3024 long double reference_sinpil(long double x)
3025 {
3026     double r = reduce1l(x);
3027 
3028     // reduce to [-0.5, 0.5]
3029     if (r < -0.5L)
3030         r = -1.0L - r;
3031     else if (r > 0.5L)
3032         r = 1.0L - r;
3033 
3034     // sinPi zeros have the same sign as x
3035     if (r == 0.0L) return reference_copysignl(0.0L, x);
3036 
3037     return reference_sinl(r * M_PIL);
3038 }
3039 
reference_tanpil(long double x)3040 long double reference_tanpil(long double x)
3041 {
3042     // set aside the sign  (allows us to preserve sign of -0)
3043     long double sign = reference_copysignl(1.0L, x);
3044     long double z = reference_fabsl(x);
3045 
3046     // if big and even  -- caution: only works if x only has single precision
3047     if (z >= HEX_LDBL(+, 1, 0, +, 53))
3048     {
3049         if (z == INFINITY) return x - x; // nan
3050 
3051         return reference_copysignl(
3052             0.0L, x); // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
3053     }
3054 
3055     // reduce to the range [ -0.5, 0.5 ]
3056     long double nearest =
3057         reference_rintl(z); // round to nearest even places n + 0.5 values in
3058                             // the right place for us
3059     int64_t i =
3060         (int64_t)nearest; // test above against 0x1.0p53 avoids overflow here
3061     z -= nearest;
3062 
3063     // correction for odd integer x for the right sign of zero
3064     if ((i & 1) && z == 0.0L) sign = -sign;
3065 
3066     // track changes to the sign
3067     sign *= reference_copysignl(1.0L, z); // really should just be an xor
3068     z = reference_fabsl(z); // remove the sign again
3069 
3070     // reduce once more
3071     // If we don't do this, rounding error in z * M_PI will cause us not to
3072     // return infinities properly
3073     if (z > 0.25L)
3074     {
3075         z = 0.5L - z;
3076         return sign
3077             / reference_tanl(z
3078                              * M_PIL); // use system tan to get the right result
3079     }
3080 
3081     //
3082     return sign
3083         * reference_tanl(z * M_PIL); // use system tan to get the right result
3084 }
3085 
reference_pownl(long double x,int i)3086 long double reference_pownl(long double x, int i)
3087 {
3088     return reference_powl(x, (long double)i);
3089 }
3090 
reference_powrl(long double x,long double y)3091 long double reference_powrl(long double x, long double y)
3092 {
3093     // powr ( x, y ) returns NaN for x < 0.
3094     if (x < 0.0L) return cl_make_nan();
3095 
3096     // powr ( x, NaN ) returns the NaN for x >= 0.
3097     // powr ( NaN, y ) returns the NaN.
3098     if (isnan(x) || isnan(y))
3099         return x + y; // Note: behavior different here than for pow(1,NaN),
3100                       // pow(NaN, 0)
3101 
3102     if (x == 1.0L)
3103     {
3104         // powr ( +1, +-inf ) returns NaN.
3105         if (reference_fabsl(y) == INFINITY) return cl_make_nan();
3106 
3107         // powr ( +1, y ) is 1 for finite y.    (NaN handled above)
3108         return 1.0L;
3109     }
3110 
3111     if (y == 0.0L)
3112     {
3113         // powr ( +inf, +-0 ) returns NaN.
3114         // powr ( +-0, +-0 ) returns NaN.
3115         if (x == 0.0L || x == INFINITY) return cl_make_nan();
3116 
3117         // powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already
3118         // handled above)
3119         return 1.0L;
3120     }
3121 
3122     if (x == 0.0L)
3123     {
3124         // powr ( +-0, -inf) is +inf.
3125         // powr ( +-0, y ) is +inf for finite y < 0.
3126         if (y < 0.0L) return INFINITY;
3127 
3128         // powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
3129         return 0.0L;
3130     }
3131 
3132     return reference_powl(x, y);
3133 }
3134 
reference_addl(long double x,long double y)3135 long double reference_addl(long double x, long double y)
3136 {
3137     volatile double a = (double)x;
3138     volatile double b = (double)y;
3139 
3140 #if defined(__SSE2__)
3141     // defeat x87
3142     __m128d va = _mm_set_sd((double)a);
3143     __m128d vb = _mm_set_sd((double)b);
3144     va = _mm_add_sd(va, vb);
3145     _mm_store_sd((double *)&a, va);
3146 #else
3147     a += b;
3148 #endif
3149     return (long double)a;
3150 }
3151 
reference_subtractl(long double x,long double y)3152 long double reference_subtractl(long double x, long double y)
3153 {
3154     volatile double a = (double)x;
3155     volatile double b = (double)y;
3156 
3157 #if defined(__SSE2__)
3158     // defeat x87
3159     __m128d va = _mm_set_sd((double)a);
3160     __m128d vb = _mm_set_sd((double)b);
3161     va = _mm_sub_sd(va, vb);
3162     _mm_store_sd((double *)&a, va);
3163 #else
3164     a -= b;
3165 #endif
3166     return (long double)a;
3167 }
3168 
reference_multiplyl(long double x,long double y)3169 long double reference_multiplyl(long double x, long double y)
3170 {
3171     volatile double a = (double)x;
3172     volatile double b = (double)y;
3173 
3174 #if defined(__SSE2__)
3175     // defeat x87
3176     __m128d va = _mm_set_sd((double)a);
3177     __m128d vb = _mm_set_sd((double)b);
3178     va = _mm_mul_sd(va, vb);
3179     _mm_store_sd((double *)&a, va);
3180 #else
3181     a *= b;
3182 #endif
3183     return (long double)a;
3184 }
3185 
reference_lgamma_rl(long double x,int * signp)3186 long double reference_lgamma_rl(long double x, int *signp)
3187 {
3188     *signp = 0;
3189     return x;
3190 }
3191 
reference_isequall(long double x,long double y)3192 int reference_isequall(long double x, long double y) { return x == y; }
reference_isfinitel(long double x)3193 int reference_isfinitel(long double x) { return 0 != isfinite(x); }
reference_isgreaterl(long double x,long double y)3194 int reference_isgreaterl(long double x, long double y) { return x > y; }
reference_isgreaterequall(long double x,long double y)3195 int reference_isgreaterequall(long double x, long double y) { return x >= y; }
reference_isinfl(long double x)3196 int reference_isinfl(long double x) { return 0 != isinf(x); }
reference_islessl(long double x,long double y)3197 int reference_islessl(long double x, long double y) { return x < y; }
reference_islessequall(long double x,long double y)3198 int reference_islessequall(long double x, long double y) { return x <= y; }
3199 #if defined(__INTEL_COMPILER)
reference_islessgreaterl(long double x,long double y)3200 int reference_islessgreaterl(long double x, long double y)
3201 {
3202     return 0 != islessgreaterl(x, y);
3203 }
3204 #else
reference_islessgreaterl(long double x,long double y)3205 int reference_islessgreaterl(long double x, long double y)
3206 {
3207     return 0 != islessgreater(x, y);
3208 }
3209 #endif
reference_isnanl(long double x)3210 int reference_isnanl(long double x) { return 0 != isnan(x); }
reference_isnormall(long double x)3211 int reference_isnormall(long double x) { return 0 != isnormal((double)x); }
reference_isnotequall(long double x,long double y)3212 int reference_isnotequall(long double x, long double y) { return x != y; }
reference_isorderedl(long double x,long double y)3213 int reference_isorderedl(long double x, long double y)
3214 {
3215     return x == x && y == y;
3216 }
reference_isunorderedl(long double x,long double y)3217 int reference_isunorderedl(long double x, long double y)
3218 {
3219     return isnan(x) || isnan(y);
3220 }
3221 #if defined(__INTEL_COMPILER)
reference_signbitl(long double x)3222 int reference_signbitl(long double x) { return 0 != signbitl(x); }
3223 #else
reference_signbitl(long double x)3224 int reference_signbitl(long double x) { return 0 != signbit(x); }
3225 #endif
3226 long double reference_copysignl(long double x, long double y);
3227 long double reference_roundl(long double x);
3228 long double reference_cbrtl(long double x);
3229 
reference_copysignl(long double x,long double y)3230 long double reference_copysignl(long double x, long double y)
3231 {
3232     // We hope that the long double to double conversion proceeds with sign
3233     // fidelity, even for zeros and NaNs
3234     union {
3235         double d;
3236         cl_ulong u;
3237     } u;
3238     u.d = (double)y;
3239 
3240     x = reference_fabsl(x);
3241     if (u.u >> 63) x = -x;
3242 
3243     return x;
3244 }
3245 
reference_roundl(long double x)3246 long double reference_roundl(long double x)
3247 {
3248     // Since we are just using this to verify double precision, we can
3249     // use the double precision copysign here
3250 
3251 #if defined(__MINGW32__) && defined(__x86_64__)
3252     long double absx = reference_fabsl(x);
3253     if (absx < 0.5L) return reference_copysignl(0.0L, x);
3254 #endif
3255     return round((double)x);
3256 }
3257 
reference_truncl(long double x)3258 long double reference_truncl(long double x)
3259 {
3260     // Since we are just using this to verify double precision, we can
3261     // use the double precision copysign here
3262     return trunc((double)x);
3263 }
3264 
3265 static long double reference_scalblnl(long double x, long n);
3266 
reference_cbrtl(long double x)3267 long double reference_cbrtl(long double x)
3268 {
3269     double yhi = HEX_DBL(+, 1, 5555555555555, -, 2);
3270     double ylo = HEX_DBL(+, 1, 558, -, 56);
3271 
3272     double fabsx = reference_fabs(x);
3273 
3274     if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x;
3275 
3276     double log2x_hi, log2x_lo;
3277 
3278     // extended precision log .... accurate to at least 64-bits + couple of
3279     // guard bits
3280     __log2_ep(&log2x_hi, &log2x_lo, fabsx);
3281 
3282     double ylog2x_hi, ylog2x_lo;
3283 
3284     double y_hi = yhi;
3285     double y_lo = ylo;
3286 
3287     // compute product of y*log2(x)
3288     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
3289 
3290     long double powxy;
3291     if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
3292     {
3293         powxy =
3294             reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
3295     }
3296     else
3297     {
3298         // separate integer + fractional part
3299         long int m = lrint(ylog2x_hi);
3300         AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
3301 
3302         // revert to long double arithemtic
3303         long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
3304         powxy = reference_exp2l(ylog2x);
3305         powxy = reference_scalblnl(powxy, m);
3306     }
3307 
3308     return reference_copysignl(powxy, x);
3309 }
3310 
reference_rintl(long double x)3311 long double reference_rintl(long double x)
3312 {
3313 #if defined(__PPC__)
3314     // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
3315     // mantissa can represent more than LDBL_MANT_DIG binary digits.
3316     x = rintl(x);
3317 #else
3318     static long double magic[2] = { 0.0L, 0.0L };
3319 
3320     if (0.0L == magic[0])
3321     {
3322         magic[0] = scalbnl(0.5L, LDBL_MANT_DIG);
3323         magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG);
3324     }
3325 
3326     if (reference_fabsl(x) < magic[0] && x != 0.0L)
3327     {
3328         long double m = magic[x < 0];
3329         x += m;
3330         x -= m;
3331     }
3332 #endif // __PPC__
3333     return x;
3334 }
3335 
3336 // extended precision sqrt using newton iteration on 1/sqrt(x).
3337 // Final result is computed as x * 1/sqrt(x)
__sqrt_ep(double * rhi,double * rlo,double xhi,double xlo)3338 static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo)
3339 {
3340     // approximate reciprocal sqrt
3341     double thi = 1.0 / sqrt(xhi);
3342     double tlo = 0.0;
3343 
3344     // One newton iteration in double-double
3345     double yhi, ylo;
3346     MulDD(&yhi, &ylo, thi, tlo, thi, tlo);
3347     MulDD(&yhi, &ylo, yhi, ylo, xhi, xlo);
3348     AddDD(&yhi, &ylo, -yhi, -ylo, 3.0, 0.0);
3349     MulDD(&yhi, &ylo, yhi, ylo, thi, tlo);
3350     MulDD(&yhi, &ylo, yhi, ylo, 0.5, 0.0);
3351 
3352     MulDD(rhi, rlo, yhi, ylo, xhi, xlo);
3353 }
3354 
reference_acoshl(long double x)3355 long double reference_acoshl(long double x)
3356 {
3357     /*
3358      * ====================================================
3359      * This function derived from fdlibm http://www.netlib.org
3360      * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
3361      *
3362      * Developed at SunSoft, a Sun Microsystems, Inc. business.
3363      * Permission to use, copy, modify, and distribute this
3364      * software is freely granted, provided that this notice
3365      * is preserved.
3366      * ====================================================
3367      *
3368      */
3369     if (isnan(x) || isinf(x)) return x + fabsl(x);
3370 
3371     if (x < 1.0L) return cl_make_nan();
3372 
3373     if (x == 1.0L) return 0.0L;
3374 
3375     if (x > HEX_LDBL(+, 1, 0, +, 60))
3376         return reference_logl(x) + 0.693147180559945309417232121458176568L;
3377 
3378     if (x > 2.0L)
3379         return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x * x - 1.0L)));
3380 
3381     double hi, lo;
3382     MulD(&hi, &lo, x, x);
3383     AddDD(&hi, &lo, hi, lo, -1.0, 0.0);
3384     __sqrt_ep(&hi, &lo, hi, lo);
3385     AddDD(&hi, &lo, hi, lo, x, 0.0);
3386     double correction = lo / hi;
3387     __log2_ep(&hi, &lo, hi);
3388     double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
3389     double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
3390     MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo);
3391     AddDD(&hi, &lo, hi, lo, correction, 0.0);
3392 
3393     return hi + lo;
3394 }
3395 
reference_asinhl(long double x)3396 long double reference_asinhl(long double x)
3397 {
3398     long double cutoff = 0.0L;
3399     const long double ln2 = HEX_LDBL(+, b, 17217f7d1cf79ab, -, 4);
3400 
3401     if (cutoff == 0.0L) cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG);
3402 
3403     if (isnan(x) || isinf(x)) return x + x;
3404 
3405     long double absx = reference_fabsl(x);
3406     if (absx < cutoff) return x;
3407 
3408     long double sign = reference_copysignl(1.0L, x);
3409 
3410     if (absx <= 4.0 / 3.0)
3411     {
3412         return sign
3413             * reference_log1pl(absx + x * x / (1.0 + sqrtl(1.0 + x * x)));
3414     }
3415     else if (absx <= HEX_LDBL(+, 1, 0, +, 27))
3416     {
3417         return sign
3418             * reference_logl(2.0L * absx + 1.0L / (sqrtl(x * x + 1.0) + absx));
3419     }
3420     else
3421     {
3422         return sign * (reference_logl(absx) + ln2);
3423     }
3424 }
3425 
reference_atanhl(long double x)3426 long double reference_atanhl(long double x)
3427 {
3428     /*
3429      * ====================================================
3430      * This function is from fdlibm: http://www.netlib.org
3431      *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
3432      *
3433      * Developed at SunSoft, a Sun Microsystems, Inc. business.
3434      * Permission to use, copy, modify, and distribute this
3435      * software is freely granted, provided that this notice
3436      * is preserved.
3437      * ====================================================
3438      */
3439     if (isnan(x)) return x + x;
3440 
3441     long double signed_half = reference_copysignl(0.5L, x);
3442     x = reference_fabsl(x);
3443     if (x > 1.0L) return cl_make_nan();
3444 
3445     if (x < 0.5L)
3446         return signed_half * reference_log1pl(2.0L * (x + x * x / (1 - x)));
3447 
3448     return signed_half * reference_log1pl(2.0L * x / (1 - x));
3449 }
3450 
reference_exp2l(long double z)3451 long double reference_exp2l(long double z)
3452 {
3453     double_double x;
3454     int j;
3455 
3456     // Handle NaNs
3457     if (isnan(z)) return z;
3458 
3459     // init x
3460     x.hi = z;
3461     x.lo = z - x.hi;
3462 
3463     // Deal with overflow and underflow for exp2(x) stage next
3464     if (x.hi >= 1025) return INFINITY;
3465 
3466     if (x.hi < -1075 - 24) return +0.0;
3467 
3468     // find nearest integer to x
3469     int i = (int)rint(x.hi);
3470 
3471     // x now holds fractional part.  The result would be then 2**i  * exp2( x )
3472     x.hi -= i;
3473 
3474     // We could attempt to find a minimax polynomial for exp2(x) over the range
3475     // x = [-0.5, 0.5]. However, this would converge very slowly near the
3476     // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
3477     // requiring something like a 20th order polynomial to get 53 + 24 bits of
3478     // precision. Instead we further reduce the range to [-1/32, 1/32] by
3479     // observing that
3480     //
3481     //  2**(a+b) = 2**a * 2**b
3482     //
3483     // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
3484     // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
3485     // value of n/16 from x.
3486     const double_double corrections[17] = {
3487         { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
3488           HEX_DBL(-, 1, bdd3413b26456, -, 55) },
3489         { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
3490           HEX_DBL(-, 1, 41577ee04992f, -, 56) },
3491         { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
3492           HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
3493         { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
3494           HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
3495         { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
3496           HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
3497         { HEX_DBL(+, 1, c199bdd85529c, -, 1),
3498           HEX_DBL(+, 1, 11065895048dd, -, 56) },
3499         { HEX_DBL(+, 1, d5818dcfba487, -, 1),
3500           HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
3501         { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
3502           HEX_DBL(-, 1, e9c23179c2893, -, 55) },
3503         { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
3504         { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
3505           HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
3506         { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
3507           HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
3508         { HEX_DBL(+, 1, 2387a6e756238, +, 0),
3509           HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
3510         { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
3511           HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
3512         { HEX_DBL(+, 1, 3dea64c123422, +, 0),
3513           HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
3514         { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
3515           HEX_DBL(+, 1, d4397afec42e2, -, 56) },
3516         { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
3517           HEX_DBL(+, 1, 6324c054647ad, -, 54) },
3518         { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
3519           HEX_DBL(-, 1, bdd3413b26456, -, 54) }
3520     };
3521     int index = (int)rint(x.hi * 16.0);
3522     x.hi -= (double)index * 0.0625;
3523 
3524     // canonicalize x
3525     double temp = x.hi;
3526     x.hi += x.lo;
3527     x.lo -= x.hi - temp;
3528 
3529     // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max
3530     // Error: 2 * 0x1.e112p-87
3531     const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
3532                                   HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
3533                                 { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
3534                                   HEX_DBL(-, 1, 5e4987a631846, -, 57) },
3535                                 { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
3536                                   HEX_DBL(-, 1, d323200a05713, -, 59) },
3537                                 { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
3538                                   HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
3539                                 { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
3540                                   HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
3541                                 { HEX_DBL(+, 1, 430912f7e8373, -, 13),
3542                                   HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
3543                                 { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
3544                                   HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
3545                                 { HEX_DBL(+, 1, 62c150eb16465, -, 20),
3546                                   HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
3547                                 { HEX_DBL(+, 1, b52502b5e953, -, 24),
3548                                   HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
3549     size_t count = sizeof(c) / sizeof(c[0]);
3550 
3551     // Do polynomial
3552     double_double r = c[count - 1];
3553     for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
3554 
3555     // unwind approximation
3556     r = mul_dd(r, x); // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
3557 
3558     // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
3559     //  exp2(x) = (r + 1) * correction = r * correction + correction
3560     r = mul_dd(r, corrections[index + 8]);
3561     r = add_dd(r, corrections[index + 8]);
3562 
3563     // Format result for output:
3564 
3565     // Get mantissa
3566     long double m = ((long double)r.hi + (long double)r.lo);
3567 
3568     // Handle a pesky overflow cases when long double = double
3569     if (i > 512)
3570     {
3571         m *= HEX_DBL(+, 1, 0, +, 512);
3572         i -= 512;
3573     }
3574     else if (i < -512)
3575     {
3576         m *= HEX_DBL(+, 1, 0, -, 512);
3577         i += 512;
3578     }
3579 
3580     return m * ldexpl(1.0L, i);
3581 }
3582 
reference_expm1l(long double x)3583 long double reference_expm1l(long double x)
3584 {
3585 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
3586     // unimplemented
3587     return x;
3588 #else
3589     if (reference_isnanl(x)) return x;
3590 
3591     if (x > 710) return INFINITY;
3592 
3593     long double y = expm1l(x);
3594 
3595     // Range of expm1l is -1.0L to +inf. Negative inf
3596     // on a few Linux platforms is clearly the wrong sign.
3597     if (reference_isinfl(y)) y = INFINITY;
3598 
3599     return y;
3600 #endif
3601 }
3602 
reference_fmaxl(long double x,long double y)3603 long double reference_fmaxl(long double x, long double y)
3604 {
3605     if (isnan(y)) return x;
3606 
3607     return x >= y ? x : y;
3608 }
3609 
reference_fminl(long double x,long double y)3610 long double reference_fminl(long double x, long double y)
3611 {
3612     if (isnan(y)) return x;
3613 
3614     return x <= y ? x : y;
3615 }
3616 
reference_hypotl(long double x,long double y)3617 long double reference_hypotl(long double x, long double y)
3618 {
3619     static const double tobig = HEX_DBL(+, 1, 0, +, 511);
3620     static const double big = HEX_DBL(+, 1, 0, +, 513);
3621     static const double rbig = HEX_DBL(+, 1, 0, -, 513);
3622     static const double tosmall = HEX_DBL(+, 1, 0, -, 511);
3623     static const double smalll = HEX_DBL(+, 1, 0, -, 607);
3624     static const double rsmall = HEX_DBL(+, 1, 0, +, 607);
3625 
3626     long double max, min;
3627 
3628     if (isinf(x) || isinf(y)) return INFINITY;
3629 
3630     if (isnan(x) || isnan(y)) return x + y;
3631 
3632     x = reference_fabsl(x);
3633     y = reference_fabsl(y);
3634 
3635     max = reference_fmaxl(x, y);
3636     min = reference_fminl(x, y);
3637 
3638     if (max > tobig)
3639     {
3640         max *= rbig;
3641         min *= rbig;
3642         return big * sqrtl(max * max + min * min);
3643     }
3644 
3645     if (max < tosmall)
3646     {
3647         max *= rsmall;
3648         min *= rsmall;
3649         return smalll * sqrtl(max * max + min * min);
3650     }
3651     return sqrtl(x * x + y * y);
3652 }
3653 
reference_log2l(long double x)3654 long double reference_log2l(long double x)
3655 {
3656     if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN;
3657 
3658     if (x == 0.0f) return -INFINITY;
3659 
3660     if (x == INFINITY) return INFINITY;
3661 
3662     double hi, lo;
3663     __log2_ep(&hi, &lo, x);
3664 
3665     return (long double)hi + (long double)lo;
3666 }
3667 
reference_log1pl(long double x)3668 long double reference_log1pl(long double x)
3669 {
3670 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
3671     // unimplemented
3672     return x;
3673 #elif defined(__PPC__)
3674     // log1pl on PPC inadvertantly returns NaN for very large values. Work
3675     // around this limitation by returning logl for large values.
3676     return ((x > (long double)(0x1.0p+1022)) ? logl(x) : log1pl(x));
3677 #else
3678     return log1pl(x);
3679 #endif
3680 }
3681 
reference_logbl(long double x)3682 long double reference_logbl(long double x)
3683 {
3684     // Since we are just using this to verify double precision, we can
3685     // use the double precision copysign here
3686     union {
3687         double f;
3688         cl_ulong u;
3689     } u;
3690     u.f = (double)x;
3691 
3692     cl_int exponent = (cl_uint)(u.u >> 52) & 0x7ff;
3693     if (exponent == 0x7ff) return x * x;
3694 
3695     if (exponent == 0)
3696     { // deal with denormals
3697         u.f = x * HEX_DBL(+, 1, 0, +, 64);
3698         exponent = (cl_int)(u.u >> 52) & 0x7ff;
3699         if (exponent == 0) return -INFINITY;
3700 
3701         return exponent - (1023 + 64);
3702     }
3703 
3704     return exponent - 1023;
3705 }
3706 
reference_maxmagl(long double x,long double y)3707 long double reference_maxmagl(long double x, long double y)
3708 {
3709     long double fabsx = fabsl(x);
3710     long double fabsy = fabsl(y);
3711 
3712     if (fabsx < fabsy) return y;
3713 
3714     if (fabsy < fabsx) return x;
3715 
3716     return reference_fmaxl(x, y);
3717 }
3718 
reference_minmagl(long double x,long double y)3719 long double reference_minmagl(long double x, long double y)
3720 {
3721     long double fabsx = fabsl(x);
3722     long double fabsy = fabsl(y);
3723 
3724     if (fabsx > fabsy) return y;
3725 
3726     if (fabsy > fabsx) return x;
3727 
3728     return reference_fminl(x, y);
3729 }
3730 
reference_nanl(cl_ulong x)3731 long double reference_nanl(cl_ulong x)
3732 {
3733     union {
3734         cl_ulong u;
3735         cl_double f;
3736     } u;
3737     u.u = x | 0x7ff8000000000000ULL;
3738     return (long double)u.f;
3739 }
3740 
3741 
reference_reciprocall(long double x)3742 long double reference_reciprocall(long double x) { return 1.0L / x; }
3743 
reference_remainderl(long double x,long double y)3744 long double reference_remainderl(long double x, long double y)
3745 {
3746     int i;
3747     return reference_remquol(x, y, &i);
3748 }
3749 
reference_lgammal(long double x)3750 long double reference_lgammal(long double x)
3751 {
3752     // lgamma is currently not tested
3753     return reference_lgamma(x);
3754 }
3755 
3756 static uint32_t two_over_pi[] = {
3757     0x0,        0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5,
3758     0x1993c439, 0x107f945,  0x23abdebb, 0x31586dc9, 0x6e3a424,  0x374b8019,
3759     0x92eea09,  0x3464873f, 0x21deb1cb, 0x4a69cfb,  0x288235f5, 0xbaed121,
3760     0xe99c702,  0x1ad17df9, 0x13991d6,  0xe60d4ce,  0x1f49c845, 0x3e2ef7e4,
3761     0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d,  0xd9fb3c9,
3762     0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7,
3763     0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, 0x3046fc7b, 0x1aeafc33,
3764     0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102,
3765     0xdffd880,  0x135cc9cc, 0x10606155
3766 };
3767 
3768 static uint32_t pi_over_two[] = { 0x1,        0x2487ed51, 0x42d1846,
3769                                   0x26263314, 0x1701b839, 0x28948127 };
3770 
3771 union d_ui64_t {
3772     uint64_t u;
3773     double d;
3774 };
3775 
3776 // radix or base of representation
3777 #define RADIX (30)
3778 #define DIGITS 6
3779 
3780 d_ui64_t two_pow_pradix = { (uint64_t)(1023 + RADIX) << 52 };
3781 d_ui64_t two_pow_mradix = { (uint64_t)(1023 - RADIX) << 52 };
3782 d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 };
3783 
3784 #define tp_pradix two_pow_pradix.d
3785 #define tp_mradix two_pow_mradix.d
3786 
3787 // extended fixed point representation of double precision
3788 // floating point number.
3789 // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ]
3790 struct eprep_t
3791 {
3792     uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in
3793                    // base_30
3794     int index; // exponent bias
3795     int sign; // sign of double
3796 };
3797 
double_to_eprep(double x)3798 static eprep_t double_to_eprep(double x)
3799 {
3800     eprep_t result;
3801 
3802     result.sign = (signbit(x) == 0) ? 1 : -1;
3803     x = fabs(x);
3804 
3805     int index = 0;
3806     while (x > tp_pradix)
3807     {
3808         index++;
3809         x *= tp_mradix;
3810     }
3811     while (x < 1)
3812     {
3813         index--;
3814         x *= tp_pradix;
3815     }
3816 
3817     result.index = index;
3818     int i = 0;
3819     result.X[0] = result.X[1] = result.X[2] = 0;
3820     while (x != 0.0)
3821     {
3822         result.X[i] = (uint32_t)x;
3823         x = (x - (double)result.X[i]) * tp_pradix;
3824         i++;
3825     }
3826     return result;
3827 }
3828 
eprep_to_double(eprep_t epx)3829 static double eprep_to_double(eprep_t epx)
3830 {
3831     double res = 0.0;
3832 
3833     res += ldexp((double)epx.X[0], (epx.index - 0) * RADIX);
3834     res += ldexp((double)epx.X[1], (epx.index - 1) * RADIX);
3835     res += ldexp((double)epx.X[2], (epx.index - 2) * RADIX);
3836 
3837     return copysign(res, epx.sign);
3838 }
3839 
payne_hanek(double * y,int * exception)3840 static int payne_hanek(double *y, int *exception)
3841 {
3842     double x = *y;
3843 
3844     // exception cases .. no reduction required
3845     if (isnan(x) || isinf(x) || (fabs(x) <= M_PI_4))
3846     {
3847         *exception = 1;
3848         return 0;
3849     }
3850 
3851     *exception = 0;
3852 
3853     // After computation result[0] contains integer part while
3854     // result[1]....result[DIGITS-1] contain fractional part. So we are doing
3855     // computation with (DIGITS-1)*RADIX precision. Default DIGITS=6 and
3856     // RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm shows
3857     // that a double precision x, closest to pi/2 is 6381956970095103 x 2^797
3858     // which can cause 61 digits of cancellation in computation of f = x*2/pi -
3859     // floor(x*2/pi) ... thus we need at least 114 bits (61 leading zeros + 53
3860     // bits of mentissa of f) of precision to accurately compute f in double
3861     // precision. Since we are using 150 bits (still an overkill), we should be
3862     // safe. Extra bits can act as guard bits for correct rounding.
3863     uint64_t result[DIGITS + 2];
3864 
3865     // compute extended precision representation of x
3866     eprep_t epx = double_to_eprep(x);
3867     int index = epx.index;
3868     int i, j;
3869     // extended precision multiplication of 2/pi*x .... we will loose at max two
3870     // RADIX=30 bit digits in the worst case
3871     for (i = 0; i < (DIGITS + 2); i++)
3872     {
3873         result[i] = 0;
3874         result[i] += ((index + i - 0) >= 0)
3875             ? ((uint64_t)two_over_pi[index + i - 0] * (uint64_t)epx.X[0])
3876             : 0;
3877         result[i] += ((index + i - 1) >= 0)
3878             ? ((uint64_t)two_over_pi[index + i - 1] * (uint64_t)epx.X[1])
3879             : 0;
3880         result[i] += ((index + i - 2) >= 0)
3881             ? ((uint64_t)two_over_pi[index + i - 2] * (uint64_t)epx.X[2])
3882             : 0;
3883     }
3884 
3885     // Carry propagation.
3886     uint64_t tmp;
3887     for (i = DIGITS + 2 - 1; i > 0; i--)
3888     {
3889         tmp = result[i] >> RADIX;
3890         result[i - 1] += tmp;
3891         result[i] -= (tmp << RADIX);
3892     }
3893 
3894     // we dont ned to normalize the integer part since only last two bits of
3895     // this will be used subsequently algorithm which remain unaltered by this
3896     // normalization. tmp = result[0] >> RADIX; result[0] -= (tmp << RADIX);
3897     unsigned int N = (unsigned int)result[0];
3898 
3899     // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that
3900     // testing if the final x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to
3901     // testing, at this stage, if r[1] (the first fractional digit) is greater
3902     // than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to
3903     // mentioned range is equivalent to substracting fractional part at this
3904     // stage from one and changing the sign.
3905     int sign = 1;
3906     if (result[1] > (uint64_t)(1 << (RADIX - 1)))
3907     {
3908         for (i = 1; i < (DIGITS + 2); i++)
3909             result[i] = (~((unsigned int)result[i]) & 0x3fffffff);
3910         N += 1;
3911         sign = -1;
3912     }
3913 
3914     // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in
3915     // the worst case (when x is multiple of 2/pi very close to an integer) so
3916     // we need to get rid of these zeros and adjust the index of final result.
3917     // So in the worst case, precision of comupted result is 90 bits (150 bits
3918     // original bits - 60 lost in cancellation).
3919     int ind = 1;
3920     for (i = 1; i < (DIGITS + 2); i++)
3921     {
3922         if (result[i] != 0)
3923             break;
3924         else
3925             ind++;
3926     }
3927 
3928     uint64_t r[DIGITS - 1];
3929     for (i = 0; i < (DIGITS - 1); i++)
3930     {
3931         r[i] = 0;
3932         for (j = 0; j <= i; j++)
3933         {
3934             r[i] += (result[ind + i - j] * (uint64_t)pi_over_two[j]);
3935         }
3936     }
3937     for (i = (DIGITS - 2); i > 0; i--)
3938     {
3939         tmp = r[i] >> RADIX;
3940         r[i - 1] += tmp;
3941         r[i] -= (tmp << RADIX);
3942     }
3943     tmp = r[0] >> RADIX;
3944     r[0] -= (tmp << RADIX);
3945 
3946     eprep_t epr;
3947     epr.sign = epx.sign * sign;
3948     if (tmp != 0)
3949     {
3950         epr.index = -ind + 1;
3951         epr.X[0] = (uint32_t)tmp;
3952         epr.X[1] = (uint32_t)r[0];
3953         epr.X[2] = (uint32_t)r[1];
3954     }
3955     else
3956     {
3957         epr.index = -ind;
3958         epr.X[0] = (uint32_t)r[0];
3959         epr.X[1] = (uint32_t)r[1];
3960         epr.X[2] = (uint32_t)r[2];
3961     }
3962 
3963     *y = eprep_to_double(epr);
3964     return epx.sign * N;
3965 }
3966 
reference_relaxed_cos(double x)3967 double reference_relaxed_cos(double x)
3968 {
3969     if (isnan(x)) return NAN;
3970     return (float)cos((float)x);
3971 }
3972 
reference_cos(double x)3973 double reference_cos(double x)
3974 {
3975     int exception;
3976     int N = payne_hanek(&x, &exception);
3977     if (exception) return cos(x);
3978     unsigned int c = N & 3;
3979     switch (c)
3980     {
3981         case 0: return cos(x);
3982         case 1: return -sin(x);
3983         case 2: return -cos(x);
3984         case 3: return sin(x);
3985     }
3986     return 0.0;
3987 }
3988 
reference_relaxed_sin(double x)3989 double reference_relaxed_sin(double x) { return (float)sin((float)x); }
3990 
reference_sin(double x)3991 double reference_sin(double x)
3992 {
3993     int exception;
3994     int N = payne_hanek(&x, &exception);
3995     if (exception) return sin(x);
3996     int c = N & 3;
3997     switch (c)
3998     {
3999         case 0: return sin(x);
4000         case 1: return cos(x);
4001         case 2: return -sin(x);
4002         case 3: return -cos(x);
4003     }
4004     return 0.0;
4005 }
4006 
reference_relaxed_sincos(double x,double * y)4007 double reference_relaxed_sincos(double x, double *y)
4008 {
4009     *y = reference_relaxed_cos(x);
4010     return reference_relaxed_sin(x);
4011 }
4012 
reference_sincos(double x,double * y)4013 double reference_sincos(double x, double *y)
4014 {
4015     int exception;
4016     int N = payne_hanek(&x, &exception);
4017     if (exception)
4018     {
4019         *y = cos(x);
4020         return sin(x);
4021     }
4022     int c = N & 3;
4023     switch (c)
4024     {
4025         case 0: *y = cos(x); return sin(x);
4026         case 1: *y = -sin(x); return cos(x);
4027         case 2: *y = -cos(x); return -sin(x);
4028         case 3: *y = sin(x); return -cos(x);
4029     }
4030     return 0.0;
4031 }
4032 
reference_relaxed_tan(double x)4033 double reference_relaxed_tan(double x)
4034 {
4035     return ((float)reference_relaxed_sin((float)x))
4036         / ((float)reference_relaxed_cos((float)x));
4037 }
4038 
reference_tan(double x)4039 double reference_tan(double x)
4040 {
4041     int exception;
4042     int N = payne_hanek(&x, &exception);
4043     if (exception) return tan(x);
4044     int c = N & 3;
4045     switch (c)
4046     {
4047         case 0: return tan(x);
4048         case 1: return -1.0 / tan(x);
4049         case 2: return tan(x);
4050         case 3: return -1.0 / tan(x);
4051     }
4052     return 0.0;
4053 }
4054 
reference_cosl(long double xx)4055 long double reference_cosl(long double xx)
4056 {
4057     double x = (double)xx;
4058     int exception;
4059     int N = payne_hanek(&x, &exception);
4060     if (exception) return cosl(x);
4061     unsigned int c = N & 3;
4062     switch (c)
4063     {
4064         case 0: return cosl(x);
4065         case 1: return -sinl(x);
4066         case 2: return -cosl(x);
4067         case 3: return sinl(x);
4068     }
4069     return 0.0;
4070 }
4071 
reference_sinl(long double xx)4072 long double reference_sinl(long double xx)
4073 {
4074     // we use system tanl after reduction which
4075     // can flush denorm input to zero so
4076     // take care of it here.
4077     if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
4078 
4079     double x = (double)xx;
4080     int exception;
4081     int N = payne_hanek(&x, &exception);
4082     if (exception) return sinl(x);
4083     int c = N & 3;
4084     switch (c)
4085     {
4086         case 0: return sinl(x);
4087         case 1: return cosl(x);
4088         case 2: return -sinl(x);
4089         case 3: return -cosl(x);
4090     }
4091     return 0.0;
4092 }
4093 
reference_sincosl(long double xx,long double * y)4094 long double reference_sincosl(long double xx, long double *y)
4095 {
4096     // we use system tanl after reduction which
4097     // can flush denorm input to zero so
4098     // take care of it here.
4099     if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022))
4100     {
4101         *y = cosl(xx);
4102         return xx;
4103     }
4104 
4105     double x = (double)xx;
4106     int exception;
4107     int N = payne_hanek(&x, &exception);
4108     if (exception)
4109     {
4110         *y = cosl(x);
4111         return sinl(x);
4112     }
4113     int c = N & 3;
4114     switch (c)
4115     {
4116         case 0: *y = cosl(x); return sinl(x);
4117         case 1: *y = -sinl(x); return cosl(x);
4118         case 2: *y = -cosl(x); return -sinl(x);
4119         case 3: *y = sinl(x); return -cosl(x);
4120     }
4121     return 0.0;
4122 }
4123 
reference_tanl(long double xx)4124 long double reference_tanl(long double xx)
4125 {
4126     // we use system tanl after reduction which
4127     // can flush denorm input to zero so
4128     // take care of it here.
4129     if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
4130 
4131     double x = (double)xx;
4132     int exception;
4133     int N = payne_hanek(&x, &exception);
4134     if (exception) return tanl(x);
4135     int c = N & 3;
4136     switch (c)
4137     {
4138         case 0: return tanl(x);
4139         case 1: return -1.0 / tanl(x);
4140         case 2: return tanl(x);
4141         case 3: return -1.0 / tanl(x);
4142     }
4143     return 0.0;
4144 }
4145 
4146 static double __loglTable1[64][3] = {
4147     { HEX_DBL(+, 1, 5390948f40fea, +, 0), HEX_DBL(-, 1, a152f142a, -, 2),
4148       HEX_DBL(+, 1, f93e27b43bd2c, -, 40) },
4149     { HEX_DBL(+, 1, 5015015015015, +, 0), HEX_DBL(-, 1, 921800925, -, 2),
4150       HEX_DBL(+, 1, 162432a1b8df7, -, 41) },
4151     { HEX_DBL(+, 1, 4cab88725af6e, +, 0), HEX_DBL(-, 1, 8304d90c18, -, 2),
4152       HEX_DBL(+, 1, 80bb749056fe7, -, 40) },
4153     { HEX_DBL(+, 1, 49539e3b2d066, +, 0), HEX_DBL(-, 1, 7418acebc, -, 2),
4154       HEX_DBL(+, 1, ceac7f0607711, -, 43) },
4155     { HEX_DBL(+, 1, 460cbc7f5cf9a, +, 0), HEX_DBL(-, 1, 6552b49988, -, 2),
4156       HEX_DBL(+, 1, d8913d0e89fa, -, 42) },
4157     { HEX_DBL(+, 1, 42d6625d51f86, +, 0), HEX_DBL(-, 1, 56b22e6b58, -, 2),
4158       HEX_DBL(+, 1, c7eaf515033a1, -, 44) },
4159     { HEX_DBL(+, 1, 3fb013fb013fb, +, 0), HEX_DBL(-, 1, 48365e696, -, 2),
4160       HEX_DBL(+, 1, 434adcde7edc7, -, 41) },
4161     { HEX_DBL(+, 1, 3c995a47babe7, +, 0), HEX_DBL(-, 1, 39de8e156, -, 2),
4162       HEX_DBL(+, 1, 8246f8e527754, -, 40) },
4163     { HEX_DBL(+, 1, 3991c2c187f63, +, 0), HEX_DBL(-, 1, 2baa0c34c, -, 2),
4164       HEX_DBL(+, 1, e1513c28e180d, -, 42) },
4165     { HEX_DBL(+, 1, 3698df3de0747, +, 0), HEX_DBL(-, 1, 1d982c9d58, -, 2),
4166       HEX_DBL(+, 1, 63ea3fed4b8a2, -, 40) },
4167     { HEX_DBL(+, 1, 33ae45b57bcb1, +, 0), HEX_DBL(-, 1, 0fa848045, -, 2),
4168       HEX_DBL(+, 1, 32ccbacf1779b, -, 40) },
4169     { HEX_DBL(+, 1, 30d190130d19, +, 0), HEX_DBL(-, 1, 01d9bbcfa8, -, 2),
4170       HEX_DBL(+, 1, e2bfeb2b884aa, -, 42) },
4171     { HEX_DBL(+, 1, 2e025c04b8097, +, 0), HEX_DBL(-, 1, e857d3d37, -, 3),
4172       HEX_DBL(+, 1, d9309b4d2ea85, -, 40) },
4173     { HEX_DBL(+, 1, 2b404ad012b4, +, 0), HEX_DBL(-, 1, cd3c712d4, -, 3),
4174       HEX_DBL(+, 1, ddf360962d7ab, -, 40) },
4175     { HEX_DBL(+, 1, 288b01288b012, +, 0), HEX_DBL(-, 1, b2602497e, -, 3),
4176       HEX_DBL(+, 1, 597f8a121640f, -, 40) },
4177     { HEX_DBL(+, 1, 25e22708092f1, +, 0), HEX_DBL(-, 1, 97c1cb13d, -, 3),
4178       HEX_DBL(+, 1, 02807d15580dc, -, 40) },
4179     { HEX_DBL(+, 1, 23456789abcdf, +, 0), HEX_DBL(-, 1, 7d60496d, -, 3),
4180       HEX_DBL(+, 1, 12ce913d7a827, -, 41) },
4181     { HEX_DBL(+, 1, 20b470c67c0d8, +, 0), HEX_DBL(-, 1, 633a8bf44, -, 3),
4182       HEX_DBL(+, 1, 0648bca9c96bd, -, 40) },
4183     { HEX_DBL(+, 1, 1e2ef3b3fb874, +, 0), HEX_DBL(-, 1, 494f863b9, -, 3),
4184       HEX_DBL(+, 1, 066fceb89b0eb, -, 42) },
4185     { HEX_DBL(+, 1, 1bb4a4046ed29, +, 0), HEX_DBL(-, 1, 2f9e32d5c, -, 3),
4186       HEX_DBL(+, 1, 17b8b6c4f846b, -, 46) },
4187     { HEX_DBL(+, 1, 19453808ca29c, +, 0), HEX_DBL(-, 1, 162593187, -, 3),
4188       HEX_DBL(+, 1, 2c83506452154, -, 42) },
4189     { HEX_DBL(+, 1, 16e0689427378, +, 0), HEX_DBL(-, 1, f9c95dc1e, -, 4),
4190       HEX_DBL(+, 1, dd5d2183150f3, -, 41) },
4191     { HEX_DBL(+, 1, 1485f0e0acd3b, +, 0), HEX_DBL(-, 1, c7b528b72, -, 4),
4192       HEX_DBL(+, 1, 0e43c4f4e619d, -, 40) },
4193     { HEX_DBL(+, 1, 12358e75d3033, +, 0), HEX_DBL(-, 1, 960caf9ac, -, 4),
4194       HEX_DBL(+, 1, 20fbfd5902a1e, -, 42) },
4195     { HEX_DBL(+, 1, 0fef010fef01, +, 0), HEX_DBL(-, 1, 64ce26c08, -, 4),
4196       HEX_DBL(+, 1, 8ebeefb4ac467, -, 40) },
4197     { HEX_DBL(+, 1, 0db20a88f4695, +, 0), HEX_DBL(-, 1, 33f7cde16, -, 4),
4198       HEX_DBL(+, 1, 30b3312da7a7d, -, 40) },
4199     { HEX_DBL(+, 1, 0b7e6ec259dc7, +, 0), HEX_DBL(-, 1, 0387efbcc, -, 4),
4200       HEX_DBL(+, 1, 796f1632949c3, -, 40) },
4201     { HEX_DBL(+, 1, 0953f39010953, +, 0), HEX_DBL(-, 1, a6f9c378, -, 5),
4202       HEX_DBL(+, 1, 1687e151172cc, -, 40) },
4203     { HEX_DBL(+, 1, 073260a47f7c6, +, 0), HEX_DBL(-, 1, 47aa07358, -, 5),
4204       HEX_DBL(+, 1, 1f87e4a9cc778, -, 42) },
4205     { HEX_DBL(+, 1, 05197f7d73404, +, 0), HEX_DBL(-, 1, d23afc498, -, 6),
4206       HEX_DBL(+, 1, b183a6b628487, -, 40) },
4207     { HEX_DBL(+, 1, 03091b51f5e1a, +, 0), HEX_DBL(-, 1, 16a21e21, -, 6),
4208       HEX_DBL(+, 1, 7d75c58973ce5, -, 40) },
4209     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4210     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4211     { HEX_DBL(+, 1, f44659e4a4271, -, 1), HEX_DBL(+, 1, 11cd1d51, -, 5),
4212       HEX_DBL(+, 1, 9a0d857e2f4b2, -, 40) },
4213     { HEX_DBL(+, 1, ecc07b301ecc, -, 1), HEX_DBL(+, 1, c4dfab908, -, 5),
4214       HEX_DBL(+, 1, 55b53fce557fd, -, 40) },
4215     { HEX_DBL(+, 1, e573ac901e573, -, 1), HEX_DBL(+, 1, 3aa2fdd26, -, 4),
4216       HEX_DBL(+, 1, f1cb0c9532089, -, 40) },
4217     { HEX_DBL(+, 1, de5d6e3f8868a, -, 1), HEX_DBL(+, 1, 918a16e46, -, 4),
4218       HEX_DBL(+, 1, 9af0dcd65a6e1, -, 43) },
4219     { HEX_DBL(+, 1, d77b654b82c33, -, 1), HEX_DBL(+, 1, e72ec117e, -, 4),
4220       HEX_DBL(+, 1, a5b93c4ebe124, -, 40) },
4221     { HEX_DBL(+, 1, d0cb58f6ec074, -, 1), HEX_DBL(+, 1, 1dcd19755, -, 3),
4222       HEX_DBL(+, 1, 5be50e71ddc6c, -, 42) },
4223     { HEX_DBL(+, 1, ca4b3055ee191, -, 1), HEX_DBL(+, 1, 476a9f983, -, 3),
4224       HEX_DBL(+, 1, ee9a798719e7f, -, 40) },
4225     { HEX_DBL(+, 1, c3f8f01c3f8f, -, 1), HEX_DBL(+, 1, 70742d4ef, -, 3),
4226       HEX_DBL(+, 1, 3ff1352c1219c, -, 46) },
4227     { HEX_DBL(+, 1, bdd2b899406f7, -, 1), HEX_DBL(+, 1, 98edd077e, -, 3),
4228       HEX_DBL(+, 1, c383cd11362f4, -, 41) },
4229     { HEX_DBL(+, 1, b7d6c3dda338b, -, 1), HEX_DBL(+, 1, c0db6cdd9, -, 3),
4230       HEX_DBL(+, 1, 37bd85b1a824e, -, 41) },
4231     { HEX_DBL(+, 1, b2036406c80d9, -, 1), HEX_DBL(+, 1, e840be74e, -, 3),
4232       HEX_DBL(+, 1, a9334d525e1ec, -, 41) },
4233     { HEX_DBL(+, 1, ac5701ac5701a, -, 1), HEX_DBL(+, 1, 0790adbb, -, 2),
4234       HEX_DBL(+, 1, 8060bfb6a491, -, 41) },
4235     { HEX_DBL(+, 1, a6d01a6d01a6d, -, 1), HEX_DBL(+, 1, 1ac05b2918, -, 2),
4236       HEX_DBL(+, 1, c1c161471580a, -, 40) },
4237     { HEX_DBL(+, 1, a16d3f97a4b01, -, 1), HEX_DBL(+, 1, 2db10fc4d8, -, 2),
4238       HEX_DBL(+, 1, ab1aa62214581, -, 42) },
4239     { HEX_DBL(+, 1, 9c2d14ee4a101, -, 1), HEX_DBL(+, 1, 406463b1b, -, 2),
4240       HEX_DBL(+, 1, 12e95dbda6611, -, 44) },
4241     { HEX_DBL(+, 1, 970e4f80cb872, -, 1), HEX_DBL(+, 1, 52dbdfc4c8, -, 2),
4242       HEX_DBL(+, 1, 6b53fee511af, -, 42) },
4243     { HEX_DBL(+, 1, 920fb49d0e228, -, 1), HEX_DBL(+, 1, 6518fe467, -, 2),
4244       HEX_DBL(+, 1, eea7d7d7d1764, -, 40) },
4245     { HEX_DBL(+, 1, 8d3018d3018d3, -, 1), HEX_DBL(+, 1, 771d2ba7e8, -, 2),
4246       HEX_DBL(+, 1, ecefa8d4fab97, -, 40) },
4247     { HEX_DBL(+, 1, 886e5f0abb049, -, 1), HEX_DBL(+, 1, 88e9c72e08, -, 2),
4248       HEX_DBL(+, 1, 913ea3d33fd14, -, 41) },
4249     { HEX_DBL(+, 1, 83c977ab2bedd, -, 1), HEX_DBL(+, 1, 9a802391e, -, 2),
4250       HEX_DBL(+, 1, 197e845877c94, -, 41) },
4251     { HEX_DBL(+, 1, 7f405fd017f4, -, 1), HEX_DBL(+, 1, abe18797f, -, 2),
4252       HEX_DBL(+, 1, f4a52f8e8a81, -, 42) },
4253     { HEX_DBL(+, 1, 7ad2208e0ecc3, -, 1), HEX_DBL(+, 1, bd0f2e9e78, -, 2),
4254       HEX_DBL(+, 1, 031f4336644cc, -, 42) },
4255     { HEX_DBL(+, 1, 767dce434a9b1, -, 1), HEX_DBL(+, 1, ce0a4923a, -, 2),
4256       HEX_DBL(+, 1, 61f33c897020c, -, 40) },
4257     { HEX_DBL(+, 1, 724287f46debc, -, 1), HEX_DBL(+, 1, ded3fd442, -, 2),
4258       HEX_DBL(+, 1, b2632e830632, -, 41) },
4259     { HEX_DBL(+, 1, 6e1f76b4337c6, -, 1), HEX_DBL(+, 1, ef6d673288, -, 2),
4260       HEX_DBL(+, 1, 888ec245a0bf, -, 40) },
4261     { HEX_DBL(+, 1, 6a13cd153729, -, 1), HEX_DBL(+, 1, ffd799a838, -, 2),
4262       HEX_DBL(+, 1, fe6f3b2f5fc8e, -, 40) },
4263     { HEX_DBL(+, 1, 661ec6a5122f9, -, 1), HEX_DBL(+, 1, 0809cf27f4, -, 1),
4264       HEX_DBL(+, 1, 81eaa9ef284dd, -, 40) },
4265     { HEX_DBL(+, 1, 623fa7701623f, -, 1), HEX_DBL(+, 1, 10113b153c, -, 1),
4266       HEX_DBL(+, 1, 1d7b07d6b1143, -, 42) },
4267     { HEX_DBL(+, 1, 5e75bb8d015e7, -, 1), HEX_DBL(+, 1, 18028cf728, -, 1),
4268       HEX_DBL(+, 1, 76b100b1f6c6, -, 41) },
4269     { HEX_DBL(+, 1, 5ac056b015ac, -, 1), HEX_DBL(+, 1, 1fde3d30e8, -, 1),
4270       HEX_DBL(+, 1, 26faeb9870945, -, 45) },
4271     { HEX_DBL(+, 1, 571ed3c506b39, -, 1), HEX_DBL(+, 1, 27a4c0585c, -, 1),
4272       HEX_DBL(+, 1, 7f2c5344d762b, -, 42) }
4273 };
4274 
4275 static double __loglTable2[64][3] = {
4276     { HEX_DBL(+, 1, 01fbe7f0a1be6, +, 0), HEX_DBL(-, 1, 6cf6ddd26112a, -, 7),
4277       HEX_DBL(+, 1, 0725e5755e314, -, 60) },
4278     { HEX_DBL(+, 1, 01eba93a97b12, +, 0), HEX_DBL(-, 1, 6155b1d99f603, -, 7),
4279       HEX_DBL(+, 1, 4bcea073117f4, -, 60) },
4280     { HEX_DBL(+, 1, 01db6c9029cd1, +, 0), HEX_DBL(-, 1, 55b54153137ff, -, 7),
4281       HEX_DBL(+, 1, 21e8faccad0ec, -, 61) },
4282     { HEX_DBL(+, 1, 01cb31f0f534c, +, 0), HEX_DBL(-, 1, 4a158c27245bd, -, 7),
4283       HEX_DBL(+, 1, 1a5b7bfbf35d3, -, 60) },
4284     { HEX_DBL(+, 1, 01baf95c9723c, +, 0), HEX_DBL(-, 1, 3e76923e3d678, -, 7),
4285       HEX_DBL(+, 1, eee400eb5fe34, -, 62) },
4286     { HEX_DBL(+, 1, 01aac2d2acee6, +, 0), HEX_DBL(-, 1, 32d85380ce776, -, 7),
4287       HEX_DBL(+, 1, cbf7a513937bd, -, 61) },
4288     { HEX_DBL(+, 1, 019a8e52d401e, +, 0), HEX_DBL(-, 1, 273acfd74be72, -, 7),
4289       HEX_DBL(+, 1, 5c64599efa5e6, -, 60) },
4290     { HEX_DBL(+, 1, 018a5bdca9e42, +, 0), HEX_DBL(-, 1, 1b9e072a2e65, -, 7),
4291       HEX_DBL(+, 1, 364180e0a5d37, -, 60) },
4292     { HEX_DBL(+, 1, 017a2b6fcc33e, +, 0), HEX_DBL(-, 1, 1001f961f3243, -, 7),
4293       HEX_DBL(+, 1, 63d795746f216, -, 60) },
4294     { HEX_DBL(+, 1, 0169fd0bd8a8a, +, 0), HEX_DBL(-, 1, 0466a6671bca4, -, 7),
4295       HEX_DBL(+, 1, 4c99ff1907435, -, 60) },
4296     { HEX_DBL(+, 1, 0159d0b06d129, +, 0), HEX_DBL(-, 1, f1981c445cd05, -, 8),
4297       HEX_DBL(+, 1, 4bfff6366b723, -, 62) },
4298     { HEX_DBL(+, 1, 0149a65d275a6, +, 0), HEX_DBL(-, 1, da6460f76ab8c, -, 8),
4299       HEX_DBL(+, 1, 9c5404f47589c, -, 61) },
4300     { HEX_DBL(+, 1, 01397e11a581b, +, 0), HEX_DBL(-, 1, c3321ab87f4ef, -, 8),
4301       HEX_DBL(+, 1, c0da537429cea, -, 61) },
4302     { HEX_DBL(+, 1, 012957cd85a28, +, 0), HEX_DBL(-, 1, ac014958c112c, -, 8),
4303       HEX_DBL(+, 1, 000c2a1b595e3, -, 64) },
4304     { HEX_DBL(+, 1, 0119339065ef7, +, 0), HEX_DBL(-, 1, 94d1eca95f67a, -, 8),
4305       HEX_DBL(+, 1, d8d20b0564d5, -, 61) },
4306     { HEX_DBL(+, 1, 01091159e4b3d, +, 0), HEX_DBL(-, 1, 7da4047b92b3e, -, 8),
4307       HEX_DBL(+, 1, 6194a5d68cf2, -, 66) },
4308     { HEX_DBL(+, 1, 00f8f129a0535, +, 0), HEX_DBL(-, 1, 667790a09bf77, -, 8),
4309       HEX_DBL(+, 1, ca230e0bea645, -, 61) },
4310     { HEX_DBL(+, 1, 00e8d2ff374a1, +, 0), HEX_DBL(-, 1, 4f4c90e9c4ead, -, 8),
4311       HEX_DBL(+, 1, 1de3e7f350c1, -, 61) },
4312     { HEX_DBL(+, 1, 00d8b6da482ce, +, 0), HEX_DBL(-, 1, 3823052860649, -, 8),
4313       HEX_DBL(+, 1, 5789b4c5891b8, -, 64) },
4314     { HEX_DBL(+, 1, 00c89cba71a8c, +, 0), HEX_DBL(-, 1, 20faed2dc9a9e, -, 8),
4315       HEX_DBL(+, 1, 9e7c40f9839fd, -, 62) },
4316     { HEX_DBL(+, 1, 00b8849f52834, +, 0), HEX_DBL(-, 1, 09d448cb65014, -, 8),
4317       HEX_DBL(+, 1, 387e3e9b6d02, -, 62) },
4318     { HEX_DBL(+, 1, 00a86e88899a4, +, 0), HEX_DBL(-, 1, e55e2fa53ebf1, -, 9),
4319       HEX_DBL(+, 1, cdaa71fddfddf, -, 62) },
4320     { HEX_DBL(+, 1, 00985a75b5e3f, +, 0), HEX_DBL(-, 1, b716b429dce0f, -, 9),
4321       HEX_DBL(+, 1, 2f2af081367bf, -, 63) },
4322     { HEX_DBL(+, 1, 00884866766ee, +, 0), HEX_DBL(-, 1, 88d21ec7a16d7, -, 9),
4323       HEX_DBL(+, 1, fb95c228d6f16, -, 62) },
4324     { HEX_DBL(+, 1, 0078385a6a61d, +, 0), HEX_DBL(-, 1, 5a906f219a9e8, -, 9),
4325       HEX_DBL(+, 1, 18aff10a89f29, -, 64) },
4326     { HEX_DBL(+, 1, 00682a5130fbe, +, 0), HEX_DBL(-, 1, 2c51a4dae87f1, -, 9),
4327       HEX_DBL(+, 1, bcc7e33ddde3, -, 63) },
4328     { HEX_DBL(+, 1, 00581e4a69944, +, 0), HEX_DBL(-, 1, fc2b7f2d782b1, -, 10),
4329       HEX_DBL(+, 1, fe3ef3300a9fa, -, 64) },
4330     { HEX_DBL(+, 1, 00481445b39a8, +, 0), HEX_DBL(-, 1, 9fb97df0b0b83, -, 10),
4331       HEX_DBL(+, 1, 0d9a601f2f324, -, 65) },
4332     { HEX_DBL(+, 1, 00380c42ae963, +, 0), HEX_DBL(-, 1, 434d4546227ae, -, 10),
4333       HEX_DBL(+, 1, 0b9b6a5868f33, -, 63) },
4334     { HEX_DBL(+, 1, 00280640fa271, +, 0), HEX_DBL(-, 1, cdcda8e930c19, -, 11),
4335       HEX_DBL(+, 1, 3d424ab39f789, -, 64) },
4336     { HEX_DBL(+, 1, 0018024036051, +, 0), HEX_DBL(-, 1, 150c558601261, -, 11),
4337       HEX_DBL(+, 1, 285bb90327a0f, -, 64) },
4338     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4339     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4340     { HEX_DBL(+, 1, ffa011fca0a1e, -, 1), HEX_DBL(+, 1, 14e5640c4197b, -, 10),
4341       HEX_DBL(+, 1, 95728136ae401, -, 63) },
4342     { HEX_DBL(+, 1, ff6031f064e07, -, 1), HEX_DBL(+, 1, cd61806bf532d, -, 10),
4343       HEX_DBL(+, 1, 568a4f35d8538, -, 63) },
4344     { HEX_DBL(+, 1, ff2061d532b9c, -, 1), HEX_DBL(+, 1, 42e34af550eda, -, 9),
4345       HEX_DBL(+, 1, 8f69cee55fec, -, 62) },
4346     { HEX_DBL(+, 1, fee0a1a513253, -, 1), HEX_DBL(+, 1, 9f0a5523902ea, -, 9),
4347       HEX_DBL(+, 1, daec734b11615, -, 63) },
4348     { HEX_DBL(+, 1, fea0f15a12139, -, 1), HEX_DBL(+, 1, fb25e19f11b26, -, 9),
4349       HEX_DBL(+, 1, 8bafca62941da, -, 62) },
4350     { HEX_DBL(+, 1, fe6150ee3e6d4, -, 1), HEX_DBL(+, 1, 2b9af9a28e282, -, 8),
4351       HEX_DBL(+, 1, 0fd3674e1dc5b, -, 61) },
4352     { HEX_DBL(+, 1, fe21c05baa109, -, 1), HEX_DBL(+, 1, 599d4678f24b9, -, 8),
4353       HEX_DBL(+, 1, dafce1f09937b, -, 61) },
4354     { HEX_DBL(+, 1, fde23f9c69cf9, -, 1), HEX_DBL(+, 1, 8799d8c046eb, -, 8),
4355       HEX_DBL(+, 1, ffa0ce0bdd217, -, 65) },
4356     { HEX_DBL(+, 1, fda2ceaa956e8, -, 1), HEX_DBL(+, 1, b590b1e5951ee, -, 8),
4357       HEX_DBL(+, 1, 645a769232446, -, 62) },
4358     { HEX_DBL(+, 1, fd636d8047a1f, -, 1), HEX_DBL(+, 1, e381d3555dbcf, -, 8),
4359       HEX_DBL(+, 1, 882320d368331, -, 61) },
4360     { HEX_DBL(+, 1, fd241c179e0cc, -, 1), HEX_DBL(+, 1, 08b69f3dccde, -, 7),
4361       HEX_DBL(+, 1, 01ad5065aba9e, -, 61) },
4362     { HEX_DBL(+, 1, fce4da6ab93e8, -, 1), HEX_DBL(+, 1, 1fa97a61dd298, -, 7),
4363       HEX_DBL(+, 1, 84cd1f931ae34, -, 60) },
4364     { HEX_DBL(+, 1, fca5a873bcb19, -, 1), HEX_DBL(+, 1, 36997bcc54a3f, -, 7),
4365       HEX_DBL(+, 1, 1485e97eaee03, -, 60) },
4366     { HEX_DBL(+, 1, fc66862ccec93, -, 1), HEX_DBL(+, 1, 4d86a43264a4f, -, 7),
4367       HEX_DBL(+, 1, c75e63370988b, -, 61) },
4368     { HEX_DBL(+, 1, fc27739018cfe, -, 1), HEX_DBL(+, 1, 6470f448fb09d, -, 7),
4369       HEX_DBL(+, 1, d7361eeaed0a1, -, 65) },
4370     { HEX_DBL(+, 1, fbe87097c6f5a, -, 1), HEX_DBL(+, 1, 7b586cc4c2523, -, 7),
4371       HEX_DBL(+, 1, b3df952cc473c, -, 61) },
4372     { HEX_DBL(+, 1, fba97d3e084dd, -, 1), HEX_DBL(+, 1, 923d0e5a21e06, -, 7),
4373       HEX_DBL(+, 1, cf56c7b64ae5d, -, 62) },
4374     { HEX_DBL(+, 1, fb6a997d0ecdc, -, 1), HEX_DBL(+, 1, a91ed9bd3df9a, -, 7),
4375       HEX_DBL(+, 1, b957bdcd89e43, -, 61) },
4376     { HEX_DBL(+, 1, fb2bc54f0f4ab, -, 1), HEX_DBL(+, 1, bffdcfa1f7fbb, -, 7),
4377       HEX_DBL(+, 1, ea8cad9a21771, -, 62) },
4378     { HEX_DBL(+, 1, faed00ae41783, -, 1), HEX_DBL(+, 1, d6d9f0bbee6f6, -, 7),
4379       HEX_DBL(+, 1, 5762a9af89c82, -, 60) },
4380     { HEX_DBL(+, 1, faae4b94dfe64, -, 1), HEX_DBL(+, 1, edb33dbe7d335, -, 7),
4381       HEX_DBL(+, 1, 21e24fc245697, -, 62) },
4382     { HEX_DBL(+, 1, fa6fa5fd27ff8, -, 1), HEX_DBL(+, 1, 0244dbae5ed05, -, 6),
4383       HEX_DBL(+, 1, 12ef51b967102, -, 60) },
4384     { HEX_DBL(+, 1, fa310fe15a078, -, 1), HEX_DBL(+, 1, 0daeaf24c3529, -, 6),
4385       HEX_DBL(+, 1, 10d3cfca60b45, -, 59) },
4386     { HEX_DBL(+, 1, f9f2893bb9192, -, 1), HEX_DBL(+, 1, 1917199bb66bc, -, 6),
4387       HEX_DBL(+, 1, 6cf6034c32e19, -, 60) },
4388     { HEX_DBL(+, 1, f9b412068b247, -, 1), HEX_DBL(+, 1, 247e1b6c615d5, -, 6),
4389       HEX_DBL(+, 1, 42f0fffa229f7, -, 61) },
4390     { HEX_DBL(+, 1, f975aa3c18ed6, -, 1), HEX_DBL(+, 1, 2fe3b4efcc5ad, -, 6),
4391       HEX_DBL(+, 1, 70106136a8919, -, 60) },
4392     { HEX_DBL(+, 1, f93751d6ae09b, -, 1), HEX_DBL(+, 1, 3b47e67edea93, -, 6),
4393       HEX_DBL(+, 1, 38dd5a4f6959a, -, 59) },
4394     { HEX_DBL(+, 1, f8f908d098df6, -, 1), HEX_DBL(+, 1, 46aab0725ea6c, -, 6),
4395       HEX_DBL(+, 1, 821fc1e799e01, -, 60) },
4396     { HEX_DBL(+, 1, f8bacf242aa2c, -, 1), HEX_DBL(+, 1, 520c1322f1e4e, -, 6),
4397       HEX_DBL(+, 1, 129dcda3ad563, -, 60) },
4398     { HEX_DBL(+, 1, f87ca4cbb755, -, 1), HEX_DBL(+, 1, 5d6c0ee91d2ab, -, 6),
4399       HEX_DBL(+, 1, c5b190c04606e, -, 62) },
4400     { HEX_DBL(+, 1, f83e89c195c25, -, 1), HEX_DBL(+, 1, 68caa41d448c3, -, 6),
4401       HEX_DBL(+, 1, 4723441195ac9, -, 59) }
4402 };
4403 
4404 static double __loglTable3[8][3] = {
4405     { HEX_DBL(+, 1, 000e00c40ab89, +, 0), HEX_DBL(-, 1, 4332be0032168, -, 12),
4406       HEX_DBL(+, 1, a1003588d217a, -, 65) },
4407     { HEX_DBL(+, 1, 000a006403e82, +, 0), HEX_DBL(-, 1, cdb2987366fcc, -, 13),
4408       HEX_DBL(+, 1, 5c86001294bbc, -, 67) },
4409     { HEX_DBL(+, 1, 0006002400d8, +, 0), HEX_DBL(-, 1, 150297c90fa6f, -, 13),
4410       HEX_DBL(+, 1, 01fb4865fae32, -, 66) },
4411     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4412     { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
4413     { HEX_DBL(+, 1, ffe8011ff280a, -, 1), HEX_DBL(+, 1, 14f8daf5e3d3b, -, 12),
4414       HEX_DBL(+, 1, 3c933b4b6b914, -, 68) },
4415     { HEX_DBL(+, 1, ffd8031fc184e, -, 1), HEX_DBL(+, 1, cd978c38042bb, -, 12),
4416       HEX_DBL(+, 1, 10f8e642e66fd, -, 65) },
4417     { HEX_DBL(+, 1, ffc8061f5492b, -, 1), HEX_DBL(+, 1, 43183c878274e, -, 11),
4418       HEX_DBL(+, 1, 5885dd1eb6582, -, 65) }
4419 };
4420 
__log2_ep(double * hi,double * lo,double x)4421 static void __log2_ep(double *hi, double *lo, double x)
4422 {
4423     union {
4424         uint64_t i;
4425         double d;
4426     } uu;
4427 
4428     int m;
4429     double f = reference_frexp(x, &m);
4430 
4431     // bring f in [0.75, 1.5)
4432     if (f < 0.75)
4433     {
4434         f *= 2.0;
4435         m -= 1;
4436     }
4437 
4438     // index first table .... brings down to [1-2^-7, 1+2^6)
4439     uu.d = f;
4440     int index =
4441         (int)(((uu.i + ((uint64_t)1 << 51)) & 0x000fc00000000000ULL) >> 46);
4442     double r1 = __loglTable1[index][0];
4443     double logr1hi = __loglTable1[index][1];
4444     double logr1lo = __loglTable1[index][2];
4445     // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since
4446     // |m| <= 1023 which needs 10bits at max, we can directly add m to log1hi
4447     // without spilling
4448     logr1hi += m;
4449 
4450     // argument reduction needs to be in double-double since reduced argument
4451     // will form the leading term of polynomial approximation which sets the
4452     // precision we eventually achieve
4453     double zhi, zlo;
4454     MulD(&zhi, &zlo, r1, uu.d);
4455 
4456     // second index table .... brings down to [1-2^-12, 1+2^-11)
4457     uu.d = zhi;
4458     index = (int)(((uu.i + ((uint64_t)1 << 46)) & 0x00007e0000000000ULL) >> 41);
4459     double r2 = __loglTable2[index][0];
4460     double logr2hi = __loglTable2[index][1];
4461     double logr2lo = __loglTable2[index][2];
4462 
4463     // reduce argument
4464     MulDD(&zhi, &zlo, zhi, zlo, r2, 0.0);
4465 
4466     // third index table .... brings down to [1-2^-14, 1+2^-13)
4467     // Actually reduction to 2^-11 would have been sufficient to calculate
4468     // second order term in polynomial in double rather than double-double, I
4469     // reduced it a bit more to make sure other systematic arithmetic errors
4470     // are guarded against .... also this allow lower order product of leading
4471     // polynomial term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather
4472     // than double-double ... hence only term that needs to be done in
4473     // double-double is Ao_hi*z_hi
4474     uu.d = zhi;
4475     index = (int)(((uu.i + ((uint64_t)1 << 41)) & 0x0000038000000000ULL) >> 39);
4476     double r3 = __loglTable3[index][0];
4477     double logr3hi = __loglTable3[index][1];
4478     double logr3lo = __loglTable3[index][2];
4479 
4480     // log2(x) = m + log2(r1) + log2(r1) + log2(1 + (zh + zlo))
4481     // calculate sum of first three terms ... note that m has already
4482     // been added to log2(r1)_hi
4483     double log2hi, log2lo;
4484     AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo);
4485     AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo);
4486 
4487     // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after
4488     // this
4489     MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0);
4490     // we dont need to do full double-double substract here. substracting 1.0
4491     // for higher term is exact
4492     zhi = zhi - 1.0;
4493     // normalize
4494     AddD(&zhi, &zlo, zhi, zlo);
4495 
4496     // polynomail fitting to compute log2(1 + z) ... forth order polynomial fit
4497     // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in
4498     // [-2^-14, 2^-13] log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4
4499     // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5
4500     // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 +
4501     // A4*z^3) since we are looking for at least 64 digits of precision and z in
4502     // [-2^-14, 2^-13], final term can be done in double .... also Aolo*zhi +
4503     // Aohi*zlo can be done in double .... Aohi*zhi needs to be done in
4504     // double-double
4505 
4506     double Aohi = HEX_DBL(+, 1, 71547652b82fe, +, 0);
4507     double Aolo = HEX_DBL(+, 1, 777c9cbb675c, -, 56);
4508     double y;
4509     y = HEX_DBL(+, 1, 276d2736fade7, -, 2);
4510     y = HEX_DBL(-, 1, 7154765782df1, -, 2) + y * zhi;
4511     y = HEX_DBL(+, 1, ec709dc3a0f67, -, 2) + y * zhi;
4512     y = HEX_DBL(-, 1, 71547652b82fe, -, 1) + y * zhi;
4513     double zhisq = zhi * zhi;
4514     y = y * zhisq;
4515     y = y + zhi * Aolo;
4516     y = y + zlo * Aohi;
4517 
4518     MulD(&zhi, &zlo, Aohi, zhi);
4519     AddDD(&zhi, &zlo, zhi, zlo, y, 0.0);
4520     AddDD(&zhi, &zlo, zhi, zlo, log2hi, log2lo);
4521 
4522     *hi = zhi;
4523     *lo = zlo;
4524 }
4525 
reference_powl(long double x,long double y)4526 long double reference_powl(long double x, long double y)
4527 {
4528     // this will be used for testing doubles i.e. arguments will
4529     // be doubles so cast the input back to double ... returned
4530     // result will be long double though .... > 53 bits of precision
4531     // if platform allows.
4532     // ===========
4533     // New finding.
4534     // ===========
4535     // this function is getting used for computing reference cube root (cbrt)
4536     // as follows __powl( x, 1.0L/3.0L ) so if the y are assumed to
4537     // be double and is converted from long double to double, truncation
4538     // causes errors. So we need to tread y as long double and convert it
4539     // to hi, lo doubles when performing y*log2(x).
4540 
4541     static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
4542 
4543     // if x = 1, return x for any y, even NaN
4544     if (x == 1.0) return x;
4545 
4546     // if y == 0, return 1 for any x, even NaN
4547     if (y == 0.0) return 1.0L;
4548 
4549     // get NaNs out of the way
4550     if (x != x || y != y) return x + y;
4551 
4552     // do the work required to sort out edge cases
4553     double fabsy = (double)reference_fabsl(y);
4554     double fabsx = (double)reference_fabsl(x);
4555     double iy = reference_rint(
4556         fabsy); // we do round to nearest here so that |fy| <= 0.5
4557     if (iy > fabsy) // convert nearbyint to floor
4558         iy -= 1.0;
4559     int isOddInt = 0;
4560     if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
4561         isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
4562 
4563     /// test a few more edge cases
4564     // deal with x == 0 cases
4565     if (x == 0.0)
4566     {
4567         if (!isOddInt) x = 0.0;
4568 
4569         if (y < 0) x = 1.0 / x;
4570 
4571         return x;
4572     }
4573 
4574     // x == +-Inf cases
4575     if (isinf(fabsx))
4576     {
4577         if (x < 0)
4578         {
4579             if (isOddInt)
4580             {
4581                 if (y < 0)
4582                     return -0.0;
4583                 else
4584                     return -INFINITY;
4585             }
4586             else
4587             {
4588                 if (y < 0)
4589                     return 0.0;
4590                 else
4591                     return INFINITY;
4592             }
4593         }
4594 
4595         if (y < 0) return 0;
4596         return INFINITY;
4597     }
4598 
4599     // y = +-inf cases
4600     if (isinf(fabsy))
4601     {
4602         if (x == -1) return 1;
4603 
4604         if (y < 0)
4605         {
4606             if (fabsx < 1) return INFINITY;
4607             return 0;
4608         }
4609         if (fabsx < 1) return 0;
4610         return INFINITY;
4611     }
4612 
4613     // x < 0 and y non integer case
4614     if (x < 0 && iy != fabsy)
4615     {
4616         // return nan;
4617         return cl_make_nan();
4618     }
4619 
4620     // speedy resolution of sqrt and reciprocal sqrt
4621     if (fabsy == 0.5)
4622     {
4623         long double xl = sqrtl(x);
4624         if (y < 0) xl = 1.0 / xl;
4625         return xl;
4626     }
4627 
4628     double log2x_hi, log2x_lo;
4629 
4630     // extended precision log .... accurate to at least 64-bits + couple of
4631     // guard bits
4632     __log2_ep(&log2x_hi, &log2x_lo, fabsx);
4633 
4634     double ylog2x_hi, ylog2x_lo;
4635 
4636     double y_hi = (double)y;
4637     double y_lo = (double)(y - (long double)y_hi);
4638 
4639     // compute product of y*log2(x)
4640     // scale to avoid overflow in double-double multiplication
4641     if (fabsy > HEX_DBL(+, 1, 0, +, 970))
4642     {
4643         y_hi = reference_ldexp(y_hi, -53);
4644         y_lo = reference_ldexp(y_lo, -53);
4645     }
4646     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
4647     if (fabsy > HEX_DBL(+, 1, 0, +, 970))
4648     {
4649         ylog2x_hi = reference_ldexp(ylog2x_hi, 53);
4650         ylog2x_lo = reference_ldexp(ylog2x_lo, 53);
4651     }
4652 
4653     long double powxy;
4654     if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
4655     {
4656         powxy =
4657             reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
4658     }
4659     else
4660     {
4661         // separate integer + fractional part
4662         long int m = lrint(ylog2x_hi);
4663         AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
4664 
4665         // revert to long double arithemtic
4666         long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
4667         long double tmp = reference_exp2l(ylog2x);
4668         powxy = reference_scalblnl(tmp, m);
4669     }
4670 
4671     // if y is odd integer and x is negative, reverse sign
4672     if (isOddInt & reference_signbit(x)) powxy = -powxy;
4673     return powxy;
4674 }
4675 
reference_nextafter(double xx,double yy)4676 double reference_nextafter(double xx, double yy)
4677 {
4678     float x = (float)xx;
4679     float y = (float)yy;
4680 
4681     // take care of nans
4682     if (x != x) return x;
4683 
4684     if (y != y) return y;
4685 
4686     if (x == y) return y;
4687 
4688     int32f_t a, b;
4689 
4690     a.f = x;
4691     b.f = y;
4692 
4693     if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
4694     if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
4695 
4696     a.i += (a.i < b.i) ? 1 : -1;
4697     a.i = (a.i < 0) ? (cl_int)0x80000000 - a.i : a.i;
4698 
4699     return a.f;
4700 }
4701 
4702 
reference_nextafterl(long double xx,long double yy)4703 long double reference_nextafterl(long double xx, long double yy)
4704 {
4705     double x = (double)xx;
4706     double y = (double)yy;
4707 
4708     // take care of nans
4709     if (x != x) return x;
4710 
4711     if (y != y) return y;
4712 
4713     int64d_t a, b;
4714 
4715     a.d = x;
4716     b.d = y;
4717 
4718     int64_t tmp = 0x8000000000000000LL;
4719 
4720     if (a.l & tmp) a.l = tmp - a.l;
4721     if (b.l & tmp) b.l = tmp - b.l;
4722 
4723     // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y =
4724     // 0.0f) test needs to be done using integer rep because subnormals may be
4725     // flushed to zero on some platforms
4726     if (a.l == b.l) return y;
4727 
4728     a.l += (a.l < b.l) ? 1 : -1;
4729     a.l = (a.l < 0) ? tmp - a.l : a.l;
4730 
4731     return a.d;
4732 }
4733 
reference_fdim(double xx,double yy)4734 double reference_fdim(double xx, double yy)
4735 {
4736     float x = (float)xx;
4737     float y = (float)yy;
4738 
4739     if (x != x) return x;
4740 
4741     if (y != y) return y;
4742 
4743     float r = (x > y) ? (float)reference_subtract(x, y) : 0.0f;
4744     return r;
4745 }
4746 
4747 
reference_fdiml(long double xx,long double yy)4748 long double reference_fdiml(long double xx, long double yy)
4749 {
4750     double x = (double)xx;
4751     double y = (double)yy;
4752 
4753     if (x != x) return x;
4754 
4755     if (y != y) return y;
4756 
4757     double r = (x > y) ? (double)reference_subtractl(x, y) : 0.0;
4758     return r;
4759 }
4760 
reference_remquo(double xd,double yd,int * n)4761 double reference_remquo(double xd, double yd, int *n)
4762 {
4763     float xx = (float)xd;
4764     float yy = (float)yd;
4765 
4766     if (isnan(xx) || isnan(yy) || fabsf(xx) == INFINITY || yy == 0.0)
4767     {
4768         *n = 0;
4769         return cl_make_nan();
4770     }
4771 
4772     if (fabsf(yy) == INFINITY || xx == 0.0f)
4773     {
4774         *n = 0;
4775         return xd;
4776     }
4777 
4778     if (fabsf(xx) == fabsf(yy))
4779     {
4780         *n = (xx == yy) ? 1 : -1;
4781         return reference_signbit(xx) ? -0.0 : 0.0;
4782     }
4783 
4784     int signx = reference_signbit(xx) ? -1 : 1;
4785     int signy = reference_signbit(yy) ? -1 : 1;
4786     int signn = (signx == signy) ? 1 : -1;
4787     float x = fabsf(xx);
4788     float y = fabsf(yy);
4789 
4790     int ex, ey;
4791     ex = reference_ilogb(x);
4792     ey = reference_ilogb(y);
4793     float xr = x;
4794     float yr = y;
4795     uint32_t q = 0;
4796 
4797     if (ex - ey >= -1)
4798     {
4799 
4800         yr = (float)reference_ldexp(y, -ey);
4801         xr = (float)reference_ldexp(x, -ex);
4802 
4803         if (ex - ey >= 0)
4804         {
4805             int i;
4806             for (i = ex - ey; i > 0; i--)
4807             {
4808                 q <<= 1;
4809                 if (xr >= yr)
4810                 {
4811                     xr -= yr;
4812                     q += 1;
4813                 }
4814                 xr += xr;
4815             }
4816             q <<= 1;
4817             if (xr > yr)
4818             {
4819                 xr -= yr;
4820                 q += 1;
4821             }
4822         }
4823         else // ex-ey = -1
4824             xr = reference_ldexp(xr, ex - ey);
4825     }
4826 
4827     if ((yr < 2.0f * xr) || ((yr == 2.0f * xr) && (q & 0x00000001)))
4828     {
4829         xr -= yr;
4830         q += 1;
4831     }
4832 
4833     if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
4834 
4835     int qout = q & 0x0000007f;
4836     if (signn < 0) qout = -qout;
4837     if (xx < 0.0) xr = -xr;
4838 
4839     *n = qout;
4840 
4841     return xr;
4842 }
4843 
reference_remquol(long double xd,long double yd,int * n)4844 long double reference_remquol(long double xd, long double yd, int *n)
4845 {
4846     double xx = (double)xd;
4847     double yy = (double)yd;
4848 
4849     if (isnan(xx) || isnan(yy) || fabs(xx) == INFINITY || yy == 0.0)
4850     {
4851         *n = 0;
4852         return cl_make_nan();
4853     }
4854 
4855     if (reference_fabs(yy) == INFINITY || xx == 0.0)
4856     {
4857         *n = 0;
4858         return xd;
4859     }
4860 
4861     if (reference_fabs(xx) == reference_fabs(yy))
4862     {
4863         *n = (xx == yy) ? 1 : -1;
4864         return reference_signbit(xx) ? -0.0 : 0.0;
4865     }
4866 
4867     int signx = reference_signbit(xx) ? -1 : 1;
4868     int signy = reference_signbit(yy) ? -1 : 1;
4869     int signn = (signx == signy) ? 1 : -1;
4870     double x = reference_fabs(xx);
4871     double y = reference_fabs(yy);
4872 
4873     int ex, ey;
4874     ex = reference_ilogbl(x);
4875     ey = reference_ilogbl(y);
4876     double xr = x;
4877     double yr = y;
4878     uint32_t q = 0;
4879 
4880     if (ex - ey >= -1)
4881     {
4882         yr = reference_ldexp(y, -ey);
4883         xr = reference_ldexp(x, -ex);
4884         int i;
4885 
4886         if (ex - ey >= 0)
4887         {
4888             for (i = ex - ey; i > 0; i--)
4889             {
4890                 q <<= 1;
4891                 if (xr >= yr)
4892                 {
4893                     xr -= yr;
4894                     q += 1;
4895                 }
4896                 xr += xr;
4897             }
4898             q <<= 1;
4899             if (xr > yr)
4900             {
4901                 xr -= yr;
4902                 q += 1;
4903             }
4904         }
4905         else
4906             xr = reference_ldexp(xr, ex - ey);
4907     }
4908 
4909     if ((yr < 2.0 * xr) || ((yr == 2.0 * xr) && (q & 0x00000001)))
4910     {
4911         xr -= yr;
4912         q += 1;
4913     }
4914 
4915     if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
4916 
4917     int qout = q & 0x0000007f;
4918     if (signn < 0) qout = -qout;
4919     if (xx < 0.0) xr = -xr;
4920 
4921     *n = qout;
4922     return xr;
4923 }
4924 
reference_scalbn(double x,int n)4925 static double reference_scalbn(double x, int n)
4926 {
4927     if (reference_isinf(x) || reference_isnan(x) || x == 0.0) return x;
4928 
4929     int bias = 1023;
4930     union {
4931         double d;
4932         cl_long l;
4933     } u;
4934     u.d = (double)x;
4935     int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
4936     if (e == 0)
4937     {
4938         u.l |= ((cl_long)1023 << 52);
4939         u.d -= 1.0;
4940         e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
4941     }
4942     e += n;
4943     if (e >= 2047 || n >= 2098) return reference_copysign(INFINITY, x);
4944     if (e < -51 || n < -2097) return reference_copysign(0.0, x);
4945     if (e <= 0)
4946     {
4947         bias += (e - 1);
4948         e = 1;
4949     }
4950     u.l &= 0x800fffffffffffffLL;
4951     u.l |= ((cl_long)e << 52);
4952     x = u.d;
4953     u.l = ((cl_long)bias << 52);
4954     return x * u.d;
4955 }
4956 
reference_scalblnl(long double x,long n)4957 static long double reference_scalblnl(long double x, long n)
4958 {
4959 #if defined(__i386__) || defined(__x86_64__) // INTEL
4960     union {
4961         long double d;
4962         struct
4963         {
4964             cl_ulong m;
4965             cl_ushort sexp;
4966         } u;
4967     } u;
4968     u.u.m = CL_LONG_MIN;
4969 
4970     if (reference_isinf(x)) return x;
4971 
4972     if (x == 0.0L || n < -2200) return reference_copysignl(0.0L, x);
4973 
4974     if (n > 2200) return reference_copysignl(INFINITY, x);
4975 
4976     if (n < 0)
4977     {
4978         u.u.sexp = 0x3fff - 1022;
4979         while (n <= -1022)
4980         {
4981             x *= u.d;
4982             n += 1022;
4983         }
4984         u.u.sexp = 0x3fff + n;
4985         x *= u.d;
4986         return x;
4987     }
4988 
4989     if (n > 0)
4990     {
4991         u.u.sexp = 0x3fff + 1023;
4992         while (n >= 1023)
4993         {
4994             x *= u.d;
4995             n -= 1023;
4996         }
4997         u.u.sexp = 0x3fff + n;
4998         x *= u.d;
4999         return x;
5000     }
5001 
5002     return x;
5003 
5004 #elif defined(__arm__) // ARM .. sizeof(long double) == sizeof(double)
5005 
5006 #if __DBL_MAX_EXP__ >= __LDBL_MAX_EXP__
5007     if (reference_isinfl(x) || reference_isnanl(x)) return x;
5008 
5009     int bias = 1023;
5010     union {
5011         double d;
5012         cl_long l;
5013     } u;
5014     u.d = (double)x;
5015     int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
5016     if (e == 0)
5017     {
5018         u.l |= ((cl_long)1023 << 52);
5019         u.d -= 1.0;
5020         e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
5021     }
5022     e += n;
5023     if (e >= 2047) return reference_copysignl(INFINITY, x);
5024     if (e < -51) return reference_copysignl(0.0, x);
5025     if (e <= 0)
5026     {
5027         bias += (e - 1);
5028         e = 1;
5029     }
5030     u.l &= 0x800fffffffffffffLL;
5031     u.l |= ((cl_long)e << 52);
5032     x = u.d;
5033     u.l = ((cl_long)bias << 52);
5034     return x * u.d;
5035 #endif
5036 
5037 #else // PPC
5038     return scalblnl(x, n);
5039 #endif
5040 }
5041 
reference_relaxed_exp(double x)5042 double reference_relaxed_exp(double x) { return reference_exp(x); }
5043 
reference_exp(double x)5044 double reference_exp(double x)
5045 {
5046     return reference_exp2(x * HEX_DBL(+, 1, 71547652b82fe, +, 0));
5047 }
5048 
reference_expl(long double x)5049 long double reference_expl(long double x)
5050 {
5051 #if defined(__PPC__)
5052     long double scale, bias;
5053 
5054     // The PPC double long version of expl fails to produce denorm results
5055     // and instead generates a 0.0. Compensate for this limitation by
5056     // computing expl as:
5057     //     expl(x + 40) * expl(-40)
5058     // Likewise, overflows can prematurely produce an infinity, so we
5059     // compute expl as:
5060     //     expl(x - 40) * expl(40)
5061     scale = 1.0L;
5062     bias = 0.0L;
5063     if (x < -708.0L)
5064     {
5065         bias = 40.0;
5066         scale = expl(-40.0L);
5067     }
5068     else if (x > 708.0L)
5069     {
5070         bias = -40.0L;
5071         scale = expl(40.0L);
5072     }
5073     return expl(x + bias) * scale;
5074 #else
5075     return expl(x);
5076 #endif
5077 }
5078 
reference_sinh(double x)5079 double reference_sinh(double x) { return sinh(x); }
5080 
reference_sinhl(long double x)5081 long double reference_sinhl(long double x) { return sinhl(x); }
5082 
reference_fmod(double x,double y)5083 double reference_fmod(double x, double y)
5084 {
5085     if (x == 0.0 && fabs(y) > 0.0) return x;
5086 
5087     if (fabs(x) == INFINITY || y == 0) return cl_make_nan();
5088 
5089     if (fabs(y) == INFINITY) // we know x is finite from above
5090         return x;
5091 #if defined(_MSC_VER) && defined(_M_X64)
5092     return fmod(x, y);
5093 #else
5094     return fmodf((float)x, (float)y);
5095 #endif
5096 }
5097 
reference_fmodl(long double x,long double y)5098 long double reference_fmodl(long double x, long double y)
5099 {
5100     if (x == 0.0L && fabsl(y) > 0.0L) return x;
5101 
5102     if (fabsl(x) == INFINITY || y == 0.0L) return cl_make_nan();
5103 
5104     if (fabsl(y) == INFINITY) // we know x is finite from above
5105         return x;
5106 
5107     return fmod((double)x, (double)y);
5108 }
5109 
reference_modf(double x,double * n)5110 double reference_modf(double x, double *n)
5111 {
5112     if (isnan(x))
5113     {
5114         *n = cl_make_nan();
5115         return cl_make_nan();
5116     }
5117     float nr;
5118     float yr = modff((float)x, &nr);
5119     *n = nr;
5120     return yr;
5121 }
5122 
reference_modfl(long double x,long double * n)5123 long double reference_modfl(long double x, long double *n)
5124 {
5125     if (isnan(x))
5126     {
5127         *n = cl_make_nan();
5128         return cl_make_nan();
5129     }
5130     double nr;
5131     double yr = modf((double)x, &nr);
5132     *n = nr;
5133     return yr;
5134 }
5135 
reference_fractl(long double x,long double * ip)5136 long double reference_fractl(long double x, long double *ip)
5137 {
5138     if (isnan(x))
5139     {
5140         *ip = cl_make_nan();
5141         return cl_make_nan();
5142     }
5143 
5144     double i;
5145     double f = modf((double)x, &i);
5146     if (f < 0.0)
5147     {
5148         f = 1.0 + f;
5149         i -= 1.0;
5150         if (f == 1.0) f = HEX_DBL(+, 1, fffffffffffff, -, 1);
5151     }
5152     *ip = i;
5153     return f;
5154 }
5155 
reference_fabsl(long double x)5156 long double reference_fabsl(long double x) { return fabsl(x); }
5157 
reference_relaxed_log(double x)5158 double reference_relaxed_log(double x)
5159 {
5160     return (float)reference_log((float)x);
5161 }
5162 
reference_log(double x)5163 double reference_log(double x)
5164 {
5165     if (x == 0.0) return -INFINITY;
5166 
5167     if (x < 0.0) return cl_make_nan();
5168 
5169     if (isinf(x)) return INFINITY;
5170 
5171     double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
5172     double logxHi, logxLo;
5173     __log2_ep(&logxHi, &logxLo, x);
5174     return logxHi * log2Hi;
5175 }
5176 
reference_logl(long double x)5177 long double reference_logl(long double x)
5178 {
5179     if (x == 0.0) return -INFINITY;
5180 
5181     if (x < 0.0) return cl_make_nan();
5182 
5183     if (isinf(x)) return INFINITY;
5184 
5185     double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
5186     double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
5187     double logxHi, logxLo;
5188     __log2_ep(&logxHi, &logxLo, x);
5189 
5190     long double lg2 = (long double)log2Hi + (long double)log2Lo;
5191     long double logx = (long double)logxHi + (long double)logxLo;
5192     return logx * lg2;
5193 }
5194 
reference_relaxed_pow(double x,double y)5195 double reference_relaxed_pow(double x, double y)
5196 {
5197     return (float)reference_exp2(((float)y) * (float)reference_log2((float)x));
5198 }
5199 
reference_pow(double x,double y)5200 double reference_pow(double x, double y)
5201 {
5202     static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
5203 
5204     // if x = 1, return x for any y, even NaN
5205     if (x == 1.0) return x;
5206 
5207     // if y == 0, return 1 for any x, even NaN
5208     if (y == 0.0) return 1.0;
5209 
5210     // get NaNs out of the way
5211     if (x != x || y != y) return x + y;
5212 
5213     // do the work required to sort out edge cases
5214     double fabsy = reference_fabs(y);
5215     double fabsx = reference_fabs(x);
5216     double iy = reference_rint(
5217         fabsy); // we do round to nearest here so that |fy| <= 0.5
5218     if (iy > fabsy) // convert nearbyint to floor
5219         iy -= 1.0;
5220     int isOddInt = 0;
5221     if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
5222         isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
5223 
5224     /// test a few more edge cases
5225     // deal with x == 0 cases
5226     if (x == 0.0)
5227     {
5228         if (!isOddInt) x = 0.0;
5229 
5230         if (y < 0) x = 1.0 / x;
5231 
5232         return x;
5233     }
5234 
5235     // x == +-Inf cases
5236     if (isinf(fabsx))
5237     {
5238         if (x < 0)
5239         {
5240             if (isOddInt)
5241             {
5242                 if (y < 0)
5243                     return -0.0;
5244                 else
5245                     return -INFINITY;
5246             }
5247             else
5248             {
5249                 if (y < 0)
5250                     return 0.0;
5251                 else
5252                     return INFINITY;
5253             }
5254         }
5255 
5256         if (y < 0) return 0;
5257         return INFINITY;
5258     }
5259 
5260     // y = +-inf cases
5261     if (isinf(fabsy))
5262     {
5263         if (x == -1) return 1;
5264 
5265         if (y < 0)
5266         {
5267             if (fabsx < 1) return INFINITY;
5268             return 0;
5269         }
5270         if (fabsx < 1) return 0;
5271         return INFINITY;
5272     }
5273 
5274     // x < 0 and y non integer case
5275     if (x < 0 && iy != fabsy)
5276     {
5277         // return nan;
5278         return cl_make_nan();
5279     }
5280 
5281     // speedy resolution of sqrt and reciprocal sqrt
5282     if (fabsy == 0.5)
5283     {
5284         long double xl = reference_sqrt(x);
5285         if (y < 0) xl = 1.0 / xl;
5286         return xl;
5287     }
5288 
5289     double hi, lo;
5290     __log2_ep(&hi, &lo, fabsx);
5291     double prod = y * hi;
5292     double result = reference_exp2(prod);
5293     return isOddInt ? reference_copysignd(result, x) : result;
5294 }
5295 
reference_sqrt(double x)5296 double reference_sqrt(double x) { return sqrt(x); }
5297 
reference_floor(double x)5298 double reference_floor(double x) { return floorf((float)x); }
5299 
reference_ldexp(double value,int exponent)5300 double reference_ldexp(double value, int exponent)
5301 {
5302 #ifdef __MINGW32__
5303     /*
5304      * ====================================================
5305      * This function is from fdlibm: http://www.netlib.org
5306      *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
5307      *
5308      * Developed at SunSoft, a Sun Microsystems, Inc. business.
5309      * Permission to use, copy, modify, and distribute this
5310      * software is freely granted, provided that this notice
5311      * is preserved.
5312      * ====================================================
5313      */
5314     if (!finite(value) || value == 0.0) return value;
5315     return scalbn(value, exponent);
5316 #else
5317     return reference_scalbn(value, exponent);
5318 #endif
5319 }
5320 
reference_ldexpl(long double x,int n)5321 long double reference_ldexpl(long double x, int n) { return ldexpl(x, n); }
5322 
reference_coshl(long double x)5323 long double reference_coshl(long double x) { return coshl(x); }
5324 
reference_ceil(double x)5325 double reference_ceil(double x) { return ceilf((float)x); }
5326 
reference_ceill(long double x)5327 long double reference_ceill(long double x)
5328 {
5329     if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
5330 
5331     long double absx = reference_fabsl(x);
5332     if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
5333 
5334     if (absx < 1.0)
5335     {
5336         if (x < 0.0)
5337             return 0.0;
5338         else
5339             return 1.0;
5340     }
5341 
5342     long double r = (long double)((cl_long)x);
5343 
5344     if (x > 0.0 && r < x) r += 1.0;
5345 
5346     return r;
5347 }
5348 
5349 
reference_acosl(long double x)5350 long double reference_acosl(long double x)
5351 {
5352     long double x2 = x * x;
5353     int i;
5354 
5355     // Prepare a head + tail representation of PI in long double.  A good
5356     // compiler should get rid of all of this work.
5357     static const cl_ulong pi_bits[2] = {
5358         0x3243F6A8885A308DULL, 0x313198A2E0370734ULL
5359     }; // first 126 bits of pi
5360        // http://www.super-computing.org/pi-hexa_current.html
5361     long double head, tail;
5362 #if __LDBL_MANT_DIG__ >= 64
5363     // long double has 64-bits of precision or greater
5364     long double temp = (long double)pi_bits[0] * 0x1.0p64L;
5365     head = temp + (long double)pi_bits[1];
5366     temp -= head; // rounding err rounding pi_bits[1] into head
5367     tail = (long double)pi_bits[1] + temp;
5368     head *= HEX_LDBL(+, 1, 0, -, 125);
5369     tail *= HEX_LDBL(+, 1, 0, -, 125);
5370 #else
5371     head = (long double)pi_bits[0];
5372     tail =
5373         (long double)((cl_long)pi_bits[0]
5374                       - (cl_long)
5375                           head); // residual part of pi_bits[0] after rounding
5376     tail = tail * HEX_LDBL(+, 1, 0, +, 64) + (long double)pi_bits[1];
5377     head *= HEX_LDBL(+, 1, 0, -, 61);
5378     tail *= HEX_LDBL(+, 1, 0, -, 125);
5379 #endif
5380 
5381     // oversize values and NaNs go to NaN
5382     if (!(x2 <= 1.0)) return sqrtl(1.0L - x2);
5383 
5384     //
5385     // deal with large |x|:
5386     //                                                      sqrt( 1 - x**2)
5387     // acos(|x| > sqrt(0.5)) = 2 * atan( z );       z = -------------------- ;
5388     // z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...]
5389     //                                                          1 + x
5390     if (x2 > 0.5)
5391     {
5392         // we handle the x < 0 case as pi - acos(|x|)
5393 
5394         long double sign = reference_copysignl(1.0L, x);
5395         long double fabsx = reference_fabsl(x);
5396         head -= head * sign; // x > 0 ? 0 : pi.hi
5397         tail -= tail * sign; // x > 0 ? 0 : pi.low
5398 
5399         // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt(
5400         // (1-x)/(1+x) )
5401         long double z2 = (1.0L - fabsx) / (1.0L + fabsx); // z**2
5402         long double z = sign * sqrtl(z2);
5403 
5404         //                     atan(sqrt(q))
5405         // Minimax fit p(x) = ---------------- - 1
5406         //                        sqrt(q)
5407         //
5408         // Define q = r*r, and solve for atan(r):
5409         //
5410         //  atan(r) = (p(r) + 1) * r = rp(r) + r
5411         static long double atan_coeffs[] = {
5412             HEX_LDBL(-, b, 3f52e0c278293b3, -, 67),
5413             HEX_LDBL(-, a, aaaaaaaaaaa95b8, -, 5),
5414             HEX_LDBL(+, c, ccccccccc992407, -, 6),
5415             HEX_LDBL(-, 9, 24924923024398, -, 6),
5416             HEX_LDBL(+, e, 38e38d6f92c98f3, -, 7),
5417             HEX_LDBL(-, b, a2e89bfb8393ec6, -, 7),
5418             HEX_LDBL(+, 9, d89a9f574d412cb, -, 7),
5419             HEX_LDBL(-, 8, 88580517884c547, -, 7),
5420             HEX_LDBL(+, f, 0ab6756abdad408, -, 8),
5421             HEX_LDBL(-, d, 56a5b07a2f15b49, -, 8),
5422             HEX_LDBL(+, b, 72ab587e46d80b2, -, 8),
5423             HEX_LDBL(-, 8, 62ea24bb5b2e636, -, 8),
5424             HEX_LDBL(+, e, d67c16582123937, -, 10)
5425         }; // minimax fit over [ 0x1.0p-52, 0.18]   Max error:
5426            // 0x1.67ea5c184e5d9p-64
5427 
5428         // Calculate y = p(r)
5429         const size_t atan_coeff_count =
5430             sizeof(atan_coeffs) / sizeof(atan_coeffs[0]);
5431         long double y = atan_coeffs[atan_coeff_count - 1];
5432         for (i = (int)atan_coeff_count - 2; i >= 0; i--)
5433             y = atan_coeffs[i] + y * z2;
5434 
5435         z *= 2.0L; // fold in 2.0 for 2.0 * atan(z)
5436         y *= z; // rp(r)
5437 
5438         return head + ((y + tail) + z);
5439     }
5440 
5441     // do |x| <= sqrt(0.5) here
5442     //                                                     acos( sqrt(z) ) -
5443     //                                                     PI/2
5444     //  Piecewise minimax polynomial fits for p(z) = 1 +
5445     //  ------------------------;
5446     //                                                            sqrt(z)
5447     //
5448     //  Define z = x*x, and solve for acos(x) over x in  x >= 0:
5449     //
5450     //      acos( sqrt(z) ) = acos(x) = x*(p(z)-1) + PI/2 = xp(x**2) - x + PI/2
5451     //
5452     const long double coeffs[4][14] = {
5453         { HEX_LDBL(-, a, fa7382e1f347974, -, 10),
5454           HEX_LDBL(-, b, 4d5a992de1ac4da, -, 6),
5455           HEX_LDBL(-, a, c526184bd558c17, -, 7),
5456           HEX_LDBL(-, d, 9ed9b0346ec092a, -, 8),
5457           HEX_LDBL(-, 9, dca410c1f04b1f, -, 8),
5458           HEX_LDBL(-, f, 76e411ba9581ee5, -, 9),
5459           HEX_LDBL(-, c, c71b00479541d8e, -, 9),
5460           HEX_LDBL(-, a, f527a3f9745c9de, -, 9),
5461           HEX_LDBL(-, 9, a93060051f48d14, -, 9),
5462           HEX_LDBL(-, 8, b3d39ad70e06021, -, 9),
5463           HEX_LDBL(-, f, f2ab95ab84f79c, -, 10),
5464           HEX_LDBL(-, e, d1af5f5301ccfe4, -, 10),
5465           HEX_LDBL(-, e, 1b53ba562f0f74a, -, 10),
5466           HEX_LDBL(-, d, 6a3851330e15526, -,
5467                    10) }, // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ]
5468                           // Error: 0x1.97839bf07024p-76
5469 
5470         { HEX_LDBL(-, 8, c2f1d638e4c1b48, -, 8),
5471           HEX_LDBL(-, c, d47ac903c311c2c, -, 6),
5472           HEX_LDBL(-, d, e020b2dabd5606a, -, 7),
5473           HEX_LDBL(-, a, 086fafac220f16b, -, 7),
5474           HEX_LDBL(-, 8, 55b5efaf6b86c3e, -, 7),
5475           HEX_LDBL(-, f, 05c9774fed2f571, -, 8),
5476           HEX_LDBL(-, e, 484a93f7f0fc772, -, 8),
5477           HEX_LDBL(-, e, 1a32baef01626e4, -, 8),
5478           HEX_LDBL(-, e, 528e525b5c9c73d, -, 8),
5479           HEX_LDBL(-, e, ddd5d27ad49b2c8, -, 8),
5480           HEX_LDBL(-, f, b3259e7ae10c6f, -, 8),
5481           HEX_LDBL(-, 8, 68998170d5b19b7, -, 7),
5482           HEX_LDBL(-, 9, 4468907f007727, -, 7),
5483           HEX_LDBL(-, a, 2ad5e4906a8e7b3, -,
5484                    7) }, // x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ]    Error:
5485                          // 0x1.647af70073457p-73
5486 
5487         { HEX_LDBL(-, f, a76585ad399e7ac, -, 8),
5488           HEX_LDBL(-, e, d665b7dd504ca7c, -, 6),
5489           HEX_LDBL(-, 9, 4c7c2402bd4bc33, -, 6),
5490           HEX_LDBL(-, f, ba76b69074ff71c, -, 7),
5491           HEX_LDBL(-, f, 58117784bdb6d5f, -, 7),
5492           HEX_LDBL(-, 8, 22ddd8eef53227d, -, 6),
5493           HEX_LDBL(-, 9, 1d1d3b57a63cdb4, -, 6),
5494           HEX_LDBL(-, a, 9c4bdc40cca848, -, 6),
5495           HEX_LDBL(-, c, b673b12794edb24, -, 6),
5496           HEX_LDBL(-, f, 9290a06e31575bf, -, 6),
5497           HEX_LDBL(-, 9, b4929c16aeb3d1f, -, 5),
5498           HEX_LDBL(-, c, 461e725765a7581, -, 5),
5499           HEX_LDBL(-, 8, 0a59654c98d9207, -, 4),
5500           HEX_LDBL(-, a, 6de6cbd96c80562, -,
5501                    4) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ]   Error:
5502                          // 0x1.b0246c304ce1ap-70
5503 
5504         { HEX_LDBL(-, b, dca8b0359f96342, -, 7),
5505           HEX_LDBL(-, 8, cd2522fcde9823, -, 5),
5506           HEX_LDBL(-, d, 2af9397b27ff74d, -, 6),
5507           HEX_LDBL(-, d, 723f2c2c2409811, -, 6),
5508           HEX_LDBL(-, f, ea8f8481ecc3cd1, -, 6),
5509           HEX_LDBL(-, a, 43fd8a7a646b0b2, -, 5),
5510           HEX_LDBL(-, e, 01b0bf63a4e8d76, -, 5),
5511           HEX_LDBL(-, 9, f0b7096a2a7b4d, -, 4),
5512           HEX_LDBL(-, e, 872e7c5a627ab4c, -, 4),
5513           HEX_LDBL(-, a, dbd760a1882da48, -, 3),
5514           HEX_LDBL(-, 8, 424e4dea31dd273, -, 2),
5515           HEX_LDBL(-, c, c05d7730963e793, -, 2),
5516           HEX_LDBL(-, a, 523d97197cd124a, -, 1),
5517           HEX_LDBL(-, 8, 307ba943978aaee, +,
5518                    0) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ]  Error:
5519                         // 0x1.9ecff73da69c9p-66
5520     };
5521 
5522     const long double offsets[4] = { 0.0625, 0.1875, 0.3125, 0.4375 };
5523     const size_t coeff_count = sizeof(coeffs[0]) / sizeof(coeffs[0][0]);
5524 
5525     // reduce the incoming values a bit so that they are in the range
5526     // [-0x1.0p-4, 0x1.0p-4]
5527     const long double *c;
5528     i = x2 * 8.0L;
5529     c = coeffs[i];
5530     x2 -= offsets[i]; // exact
5531 
5532     // calcualte p(x2)
5533     long double y = c[coeff_count - 1];
5534     for (i = (int)coeff_count - 2; i >= 0; i--) y = c[i] + y * x2;
5535 
5536     // xp(x2)
5537     y *= x;
5538 
5539     // return xp(x2) - x + PI/2
5540     return head + ((y + tail) - x);
5541 }
5542 
reference_relaxed_acos(double x)5543 double reference_relaxed_acos(double x) { return reference_acos(x); }
5544 
reference_log10(double x)5545 double reference_log10(double x)
5546 {
5547     if (x == 0.0) return -INFINITY;
5548 
5549     if (x < 0.0) return cl_make_nan();
5550 
5551     if (isinf(x)) return INFINITY;
5552 
5553     double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
5554     double logxHi, logxLo;
5555     __log2_ep(&logxHi, &logxLo, x);
5556     return logxHi * log2Hi;
5557 }
5558 
reference_relaxed_log10(double x)5559 double reference_relaxed_log10(double x) { return reference_log10(x); }
5560 
reference_log10l(long double x)5561 long double reference_log10l(long double x)
5562 {
5563     if (x == 0.0) return -INFINITY;
5564 
5565     if (x < 0.0) return cl_make_nan();
5566 
5567     if (isinf(x)) return INFINITY;
5568 
5569     double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
5570     double log2Lo = HEX_DBL(+, 1, e623e2566b02d, -, 55);
5571     double logxHi, logxLo;
5572     __log2_ep(&logxHi, &logxLo, x);
5573 
5574     long double lg2 = (long double)log2Hi + (long double)log2Lo;
5575     long double logx = (long double)logxHi + (long double)logxLo;
5576     return logx * lg2;
5577 }
5578 
reference_acos(double x)5579 double reference_acos(double x) { return acos(x); }
5580 
reference_atan2(double x,double y)5581 double reference_atan2(double x, double y)
5582 {
5583 #if defined(_WIN32)
5584     // fix edge cases for Windows
5585     if (isinf(x) && isinf(y))
5586     {
5587         double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
5588         return (x > 0) ? retval : -retval;
5589     }
5590 #endif // _WIN32
5591     return atan2(x, y);
5592 }
5593 
reference_atan2l(long double x,long double y)5594 long double reference_atan2l(long double x, long double y)
5595 {
5596 #if defined(_WIN32)
5597     // fix edge cases for Windows
5598     if (isinf(x) && isinf(y))
5599     {
5600         long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
5601         return (x > 0) ? retval : -retval;
5602     }
5603 #endif // _WIN32
5604     return atan2l(x, y);
5605 }
5606 
reference_frexp(double a,int * exp)5607 double reference_frexp(double a, int *exp)
5608 {
5609     if (isnan(a) || isinf(a) || a == 0.0)
5610     {
5611         *exp = 0;
5612         return a;
5613     }
5614 
5615     union {
5616         cl_double d;
5617         cl_ulong l;
5618     } u;
5619 
5620     u.d = a;
5621 
5622     // separate out sign
5623     cl_ulong s = u.l & 0x8000000000000000ULL;
5624     u.l &= 0x7fffffffffffffffULL;
5625     int bias = -1022;
5626 
5627     if ((u.l & 0x7ff0000000000000ULL) == 0)
5628     {
5629         double d = u.l;
5630         u.d = d;
5631         bias -= 1074;
5632     }
5633 
5634     int e = (int)((u.l & 0x7ff0000000000000ULL) >> 52);
5635     u.l &= 0x000fffffffffffffULL;
5636     e += bias;
5637     u.l |= ((cl_ulong)1022 << 52);
5638     u.l |= s;
5639 
5640     *exp = e;
5641     return u.d;
5642 }
5643 
reference_frexpl(long double a,int * exp)5644 long double reference_frexpl(long double a, int *exp)
5645 {
5646     if (isnan(a) || isinf(a) || a == 0.0)
5647     {
5648         *exp = 0;
5649         return a;
5650     }
5651 
5652     if (sizeof(long double) == sizeof(double))
5653     {
5654         return reference_frexp(a, exp);
5655     }
5656     else
5657     {
5658         return frexpl(a, exp);
5659     }
5660 }
5661 
5662 
reference_atan(double x)5663 double reference_atan(double x) { return atan(x); }
5664 
reference_atanl(long double x)5665 long double reference_atanl(long double x) { return atanl(x); }
5666 
reference_asinl(long double x)5667 long double reference_asinl(long double x) { return asinl(x); }
5668 
reference_asin(double x)5669 double reference_asin(double x) { return asin(x); }
5670 
reference_relaxed_asin(double x)5671 double reference_relaxed_asin(double x) { return reference_asin(x); }
5672 
reference_fabs(double x)5673 double reference_fabs(double x) { return fabs(x); }
5674 
reference_cosh(double x)5675 double reference_cosh(double x) { return cosh(x); }
5676 
reference_sqrtl(long double x)5677 long double reference_sqrtl(long double x)
5678 {
5679 #if defined(__SSE2__)                                                          \
5680     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
5681     __m128d result128 = _mm_set_sd((double)x);
5682     result128 = _mm_sqrt_sd(result128, result128);
5683     return _mm_cvtsd_f64(result128);
5684 #else
5685     volatile double dx = x;
5686     return sqrt(dx);
5687 #endif
5688 }
5689 
reference_tanhl(long double x)5690 long double reference_tanhl(long double x) { return tanhl(x); }
5691 
reference_floorl(long double x)5692 long double reference_floorl(long double x)
5693 {
5694     if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
5695 
5696     long double absx = reference_fabsl(x);
5697     if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
5698 
5699     if (absx < 1.0)
5700     {
5701         if (x < 0.0)
5702             return -1.0;
5703         else
5704             return 0.0;
5705     }
5706 
5707     long double r = (long double)((cl_long)x);
5708 
5709     if (x < 0.0 && r > x) r -= 1.0;
5710 
5711     return r;
5712 }
5713 
5714 
reference_tanh(double x)5715 double reference_tanh(double x) { return tanh(x); }
5716 
reference_assignmentl(long double x)5717 long double reference_assignmentl(long double x) { return x; }
5718 
reference_notl(long double x)5719 int reference_notl(long double x)
5720 {
5721     int r = !x;
5722     return r;
5723 }
5724