xref: /aosp_15_r20/external/pytorch/aten/src/ATen/native/Math.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #pragma once
2 
3 #include <ATen/AccumulateType.h>
4 #include <ATen/NumericUtils.h>
5 #include <ATen/jiterator_macros.h>
6 #include <c10/util/BFloat16.h>
7 #include <c10/util/Half.h>
8 #include <c10/util/MathConstants.h>
9 #include <cfloat>
10 #include <cmath>
11 #include <cstdint>
12 #include <cstdlib>
13 #include <limits>
14 #include <type_traits>
15 
16 C10_CLANG_DIAGNOSTIC_PUSH()
17 #if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion")
18 C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
19 #endif
20 
21 /* The next function is taken from  https://github.com/antelopeusersgroup/antelope_contrib/blob/master/lib/location/libgenloc/erfinv.c.
22 Below is the copyright.
23 Output was modified to be inf or -inf when input is 1 or -1. */
24 
25 
26 /*
27     Copyright (c) 2014 Indiana University
28     All rights reserved.
29 
30     Written by Prof. Gary L. Pavlis, Dept. of Geol. Sci.,
31             Indiana University, Bloomington, IN
32 
33     This software is licensed under the New BSD license:
34 
35     Redistribution and use in source and binary forms,
36     with or without modification, are permitted provided
37     that the following conditions are met:
38 
39     Redistributions of source code must retain the above
40     copyright notice, this list of conditions and the
41     following disclaimer.
42 
43     Redistributions in binary form must reproduce the
44     above copyright notice, this list of conditions and
45     the following disclaimer in the documentation and/or
46     other materials provided with the distribution.
47 
48     Neither the name of Indiana University nor
49     the names of its contributors may be used to endorse
50     or promote products derived from this software without
51     specific prior written permission.
52 
53     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
54     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
55     WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57     PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
58     THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY
59     DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
60     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
61     PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
62     USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
64     IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
65     NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
66     USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
67     POSSIBILITY OF SUCH DAMAGE.
68 */
69 
70 namespace {
71 /*
72  * This function is derived from the implementation of the i0e function in the
73  * Cephes Math Library. See note [3-Clause BSD License for the Cephes Math
74  * Library].
75  *
76  * Computes an approximation of the exponentially scaled zeroth order modified
77  * Bessel function of the first kind. The approximation is actually two
78  * (sub)approximations, both using a Chebyshev polynomial expansion. One
79  * approximates the function over [0, 8], and the other over (8, infinity). This
80  * function takes the absolute value of all inputs to convert them into the
81  * domain of the approximation.
82  */
83 jiterator_also_stringify_as(jiterator_code(
84   template <typename T>
85   JITERATOR_HOST_DEVICE T chbevl(T x, const T array[], const int len) {
86     T b0, b1, b2;
87 
88     b0 = array[0];
89     b1 = 0;
90 
91     for (int i = 1; i < len; ++i) {
92       b2 = b1;
93       b1 = b0;
94       b0 = x * b1 - b2 + array[i];
95     }
96 
97     return T{0.5} * (b0 - b2);
98   }
99 
100   template <typename T>
101   JITERATOR_HOST_DEVICE T calc_i0e(T _x) {
102     T x = std::fabs(_x);
103 
104     if (x <= T{8.0}) {
105       static const T coefficients[] = {
106           -4.41534164647933937950E-18, 3.33079451882223809783E-17,
107           -2.43127984654795469359E-16, 1.71539128555513303061E-15,
108           -1.16853328779934516808E-14, 7.67618549860493561688E-14,
109           -4.85644678311192946090E-13, 2.95505266312963983461E-12,
110           -1.72682629144155570723E-11, 9.67580903537323691224E-11,
111           -5.18979560163526290666E-10, 2.65982372468238665035E-9,
112           -1.30002500998624804212E-8,  6.04699502254191894932E-8,
113           -2.67079385394061173391E-7,  1.11738753912010371815E-6,
114           -4.41673835845875056359E-6,  1.64484480707288970893E-5,
115           -5.75419501008210370398E-5,  1.88502885095841655729E-4,
116           -5.76375574538582365885E-4,  1.63947561694133579842E-3,
117           -4.32430999505057594430E-3,  1.05464603945949983183E-2,
118           -2.37374148058994688156E-2,  4.93052842396707084878E-2,
119           -9.49010970480476444210E-2,  1.71620901522208775349E-1,
120           -3.04682672343198398683E-1,  6.76795274409476084995E-1};
121 
122       T y = (x / T{2.0}) - T{2.0};
123       return chbevl(y, coefficients, int{30});
124     }
125 
126     // x > 8
127     static const T coefficients[] = {
128         -7.23318048787475395456E-18, -4.83050448594418207126E-18,
129         4.46562142029675999901E-17,  3.46122286769746109310E-17,
130         -2.82762398051658348494E-16, -3.42548561967721913462E-16,
131         1.77256013305652638360E-15,  3.81168066935262242075E-15,
132         -9.55484669882830764870E-15, -4.15056934728722208663E-14,
133         1.54008621752140982691E-14,  3.85277838274214270114E-13,
134         7.18012445138366623367E-13,  -1.79417853150680611778E-12,
135         -1.32158118404477131188E-11, -3.14991652796324136454E-11,
136         1.18891471078464383424E-11,  4.94060238822496958910E-10,
137         3.39623202570838634515E-9,   2.26666899049817806459E-8,
138         2.04891858946906374183E-7,   2.89137052083475648297E-6,
139         6.88975834691682398426E-5,   3.36911647825569408990E-3,
140         8.04490411014108831608E-1};
141 
142     return chbevl(T{32.0} / x - T{2.0}, coefficients, int{25}) / std::sqrt(x);
143   }),
144   i0e_string); // i0e_string
145 }
146 
147 #define CENTRAL_RANGE 0.7
148 
149 template <typename T>
150 inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
calc_erfinv(T y)151 calc_erfinv(T y) {
152 /* Function to calculate inverse error function.  Rational approximation
153 is used to generate an initial approximation, which is then improved to
154 full accuracy by two steps of Newton's method.  Code is a direct
155 translation of the erfinv m file in matlab version 2.0.
156 Author:  Gary L. Pavlis, Indiana University
157 Date:  February 1996
158 */
159   T x, z, num, dem; /*working variables */
160   /* coefficients in rational expansion */
161   T a[4] = {  T(0.886226899), T(-1.645349621),  T(0.914624893), T(-0.140543331) };
162   T b[4] = { T(-2.118377725),  T(1.442710462), T(-0.329097515),  T(0.012229801) };
163   T c[4] = { T(-1.970840454), T(-1.624906493),  T(3.429567803),  T(1.641345311) };
164   T d[2] = {  T(3.543889200),  T(1.637067800) };
165   T y_abs = std::abs(y);
166   if(y_abs > 1.0) return std::numeric_limits<T>::quiet_NaN();
167 #ifdef _WIN32
168   // error C2039: '_copysign': is not a member of 'std'
169   if(y_abs == 1.0) return copysign(std::numeric_limits<T>::infinity(), y);
170 #else
171   if(y_abs == 1.0) return std::copysign(std::numeric_limits<T>::infinity(), y);
172 #endif
173   if(y_abs <= static_cast<T>(CENTRAL_RANGE)) {
174     z = y * y;
175     num = (((a[3]*z + a[2])*z + a[1])*z + a[0]);
176     dem = ((((b[3]*z + b[2])*z + b[1])*z +b[0]) * z + static_cast<T>(1.0));
177     x = y * num / dem;
178   }
179   else{
180     z = std::sqrt(-std::log((static_cast<T>(1.0)-y_abs)/static_cast<T>(2.0)));
181     num = ((c[3]*z + c[2])*z + c[1]) * z + c[0];
182     dem = (d[1]*z + d[0])*z + static_cast<T>(1.0);
183 #ifdef _WIN32
184     // error C2039: '_copysign': is not a member of 'std'
185     x = copysign(num, y) / dem;
186 #else
187     x = std::copysign(num, y) / dem;
188 #endif
189   }
190   /* Two steps of Newton-Raphson correction */
191   x = x - (std::erf(x) - y) / ((static_cast<T>(2.0)/static_cast<T>(std::sqrt(c10::pi<double>)))*std::exp(-x*x));
192   x = x - (std::erf(x) - y) / ((static_cast<T>(2.0)/static_cast<T>(std::sqrt(c10::pi<double>)))*std::exp(-x*x));
193 
194   return(x);
195 }
196 
197 #undef CENTRAL_RANGE
198 
199 /*
200  * Note [3-Clause BSD License for the Cephes Math Library]
201  * Code derived from implementations in the Cephes Math Library should mention its derivation and reference
202  * this note (ex. 'This function is derived from the implementation of X in the Cephes Math Library. See note
203  * [3-Clause BSD License for the Cephes Math Library]. The license is:
204  * Copyright (c) 2018, Steven Moshier
205  * All rights reserved.
206  *
207  * Redistribution and use in source and binary forms, with or without
208  * modification, are permitted provided that the following conditions are met:
209  * * Redistributions of source code must retain the above copyright
210  * notice, this list of conditions and the following disclaimer.
211  * * Redistributions in binary form must reproduce the above copyright
212  * notice, this list of conditions and the following disclaimer in the
213  * documentation and/or other materials provided with the distribution.
214  * * Neither the name of the nor the
215  * names of its contributors may be used to endorse or promote products
216  * derived from this software without specific prior written permission.
217  *
218  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
219  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
220  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
221  * DISCLAIMED. IN NO EVENT SHALL Steven Moshier BE LIABLE FOR ANY
222  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
223  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
224  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
225  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
226  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
227  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
228  */
229 
230 /*
231  * This function is derived from the implementation of the zeta function in the Cephes Math Library.
232  * See note [3-Clause BSD License for the Cephes Math Library].
233  */
234 template <typename scalar_t, bool is_cuda=false>
zeta(scalar_t x,scalar_t q)235 C10_HOST_DEVICE inline scalar_t zeta(scalar_t x, scalar_t q) __ubsan_ignore_float_divide_by_zero__ {
236   using acc_t = at::acc_type<scalar_t, is_cuda>;
237   const acc_t MACHEP = acc_t{1.11022302462515654042E-16};
238   constexpr acc_t zero = acc_t{0.0};
239   constexpr acc_t half = acc_t{0.5};
240   constexpr acc_t one = acc_t{1.0};
241   static const acc_t A[] = {
242       12.0,
243       -720.0,
244       30240.0,
245       -1209600.0,
246       47900160.0,
247       -1.8924375803183791606e9, /*1.307674368e12/691*/
248       7.47242496e10,
249       -2.950130727918164224e12, /*1.067062284288e16/3617*/
250       1.1646782814350067249e14, /*5.109094217170944e18/43867*/
251       -4.5979787224074726105e15, /*8.028576626982912e20/174611*/
252       1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/
253       -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/
254   };
255 
256   int i = 0;
257   acc_t a, b, k, s, t, w;
258   if (x == one) {
259     return std::numeric_limits<scalar_t>::infinity();
260   }
261 
262   if (x < one) {
263     return std::numeric_limits<scalar_t>::quiet_NaN();
264   }
265 
266   if (q <= zero) {
267     if (q == std::floor(q)) {
268       return std::numeric_limits<scalar_t>::infinity();
269     }
270     if (x != std::floor(x)) {
271       return std::numeric_limits<scalar_t>::quiet_NaN();
272     }
273   }
274 
275   s = std::pow(q, -x);
276   a = q;
277   i = 0;
278   b = zero;
279   while ((i < 9) || (a <= acc_t{9.0})) {
280     i += 1;
281     a += one;
282     b = ::pow(a, -x);
283     s += b;
284     if ((-MACHEP * s < b) && (b < MACHEP * s)) {
285       return static_cast<scalar_t>(s);
286     }
287   };
288 
289   w = a;
290   s += b * w / (x - one);
291   s -= half * b;
292   a = one;
293   k = zero;
294   for (int i = 0; i < 12; i++) {
295     a *= x + k;
296     b /= w;
297     t = a * b / A[i];
298     s = s + t;
299     t = ::fabs(t / s);
300     if (t < MACHEP) {
301       return static_cast<scalar_t>(s);
302     }
303     k += one;
304     a *= x + k;
305     b /= w;
306     k += one;
307   }
308   return static_cast<scalar_t>(s);
309 }
310 
311 /*
312  * This function is derived from the implementation of the digamma function in the Cephes Math Library.
313  * See note [3-Clause BSD License for the Cephes Math Library].
314  *
315  * Evaluates polynomial of degree N:
316  *
317  *                     2          N
318  * y  =  C  + C x + C x  +...+ C x
319  *        0    1     2          N
320  *
321  * Coefficients are stored in reverse order:
322  *
323  * coef[0] = C  , ..., coef[N] = C  .
324  *            N                   0
325  */
326 template <typename T>
polevl(const T x,const T A[],size_t len)327 C10_HOST_DEVICE inline T polevl(const T x, const T A[], size_t len) {
328   T result = 0;
329   for (size_t i = 0; i <= len; i++) {
330     result = result * x + A[i];
331   }
332   return result;
333 }
334 
trigamma(double x)335 inline double trigamma(double x) __ubsan_ignore_float_divide_by_zero__ {
336   double sign = +1;
337   double result = 0;
338   if (x < 0.5) {
339     sign = -1;
340     const double sin_pi_x = sin(c10::pi<double> * x);
341     result -= (c10::pi<double> * c10::pi<double>) / (sin_pi_x * sin_pi_x);
342     x = 1 - x;
343   }
344   for (int i = 0; i < 6; ++i) {
345     result += 1 / (x * x);
346     x += 1;
347   }
348   const double ixx = 1 / (x*x);
349   result += (1 + 1 / (2*x) + ixx * (1./6 - ixx * (1./30 - ixx * (1./42)))) / x;
350   return sign * result;
351 }
352 
trigamma(float x)353 inline float trigamma(float x) __ubsan_ignore_float_divide_by_zero__ {
354   float sign = +1;
355   float result = 0;
356   if (x < 0.5f) {
357     sign = -1;
358     const float sin_pi_x = sinf(c10::pi<float> * x);
359     result -= (c10::pi<float> * c10::pi<float>) / (sin_pi_x * sin_pi_x);
360     x = 1 - x;
361   }
362   for (int i = 0; i < 6; ++i) {
363     result += 1 / (x * x);
364     x += 1;
365   }
366   const float ixx = 1 / (x*x);
367   result += (1 + 1 / (2*x) + ixx * (1.f/6 - ixx * (1.f/30 - ixx * (1.f/42)))) / x;
368   return sign * result;
369 }
370 
371 /*
372  * This function is derived from the implementation of the digamma function in the Cephes Math Library.
373  * See note [3-Clause BSD License for the Cephes Math Library].
374  */
calc_digamma(double x)375 inline double calc_digamma(double x) {
376   // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
377   static double PSI_10 = 2.25175258906672110764;
378   if (x == 0) {
379     // As per C++ standard for gamma related functions and SciPy,
380     // If the argument is ±0, ±∞ is returned
381     return std::copysign(INFINITY, -x);
382   }
383 
384   bool x_is_integer = x == trunc(x);
385   if (x < 0) {
386     if (x_is_integer) {
387       // As per C++ standard for gamma related functions and SciPy,
388       // If the argument is a negative integer, NaN is returned
389       return std::numeric_limits<double>::quiet_NaN();
390     }
391     // Extracts the fractional part of x as r, since tan(pi * r) is more numerically
392     // accurate than tan(pi * x). While these operations are mathematically equivalent
393     // since both x and r are in radians and tan() has a periodicity of pi, in practice
394     // the computation of pi * x is a source of error (when |x| > 1).
395     double q, r;
396     r = std::modf(x, &q);
397     return calc_digamma(1 - x) - c10::pi<double> / tan(c10::pi<double> * r);
398   }
399 
400   // Push x to be >= 10
401   double result = 0;
402   while (x < 10) {
403     result -= 1 / x;
404     x += 1;
405   }
406   if (x == 10) {
407     return result + PSI_10;
408   }
409 
410   // Compute asymptotic digamma
411   static const double A[] = {
412       8.33333333333333333333E-2,
413       -2.10927960927960927961E-2,
414       7.57575757575757575758E-3,
415       -4.16666666666666666667E-3,
416       3.96825396825396825397E-3,
417       -8.33333333333333333333E-3,
418       8.33333333333333333333E-2,
419   };
420 
421   double y = 0;
422   if (x < 1.0e17) {
423     double z = 1.0 / (x * x);
424     y = z * polevl(z, A, 6);
425   }
426   return result + log(x) - (0.5 / x) - y;
427 }
428 
429 /*
430  * This function is derived from the implementation of the digamma function in the Cephes Math Library.
431  * See note [3-Clause BSD License for the Cephes Math Library].
432  */
calc_digamma(float x)433 inline float calc_digamma(float x) {
434   // See [C++ Standard Reference: Gamma Function]
435   static float PSI_10 = 2.25175258906672110764f;
436   if (x == 0) {
437     // As per C++ standard for gamma related functions and SciPy,
438     // If the argument is ±0, ±∞ is returned
439     return std::copysign(INFINITY, -x);
440   }
441 
442   bool x_is_integer = x == truncf(x);
443   if (x < 0) {
444     if (x_is_integer) {
445     // As per C++ standard for gamma related functions and SciPy,
446     // If the argument is a negative integer, NaN is returned
447       return std::numeric_limits<float>::quiet_NaN();
448     }
449     // Extracts the fractional part of x as r, since tan(pi * r) is more numerically
450     // accurate than tan(pi * x). While these operations are mathematically equivalent
451     // since both x and r are in radians and tan() has a periodicity of pi, in practice
452     // the computation of pi * x is a source of error (when |x| > 1).
453     double q, r;
454     r = std::modf(x, &q);
455     float pi_over_tan_pi_x = (float)(c10::pi<double> / tan(c10::pi<double> * r));
456     return calc_digamma(1 - x) - pi_over_tan_pi_x;
457   }
458 
459   // Push x to be >= 10
460   float result = 0;
461   while (x < 10) {
462     result -= 1 / x;
463     x += 1;
464   }
465   if (x == 10) {
466     return result + PSI_10;
467   }
468 
469   // Compute asymptotic digamma
470   static const float A[] = {
471       8.33333333333333333333E-2f,
472       -2.10927960927960927961E-2f,
473       7.57575757575757575758E-3f,
474       -4.16666666666666666667E-3f,
475       3.96825396825396825397E-3f,
476       -8.33333333333333333333E-3f,
477       8.33333333333333333333E-2f,
478   };
479 
480   float y = 0;
481   if (x < 1.0e17f) {
482     float z = 1 / (x * x);
483     y = z * polevl(z, A, 6);
484   }
485   return result + logf(x) - (0.5f / x) - y;
486 }
487 
calc_digamma(c10::BFloat16 a)488 inline c10::BFloat16 calc_digamma(c10::BFloat16 a) {
489   return calc_digamma(static_cast<float>(a));
490 }
491 
calc_digamma(c10::Half a)492 inline c10::Half calc_digamma(c10::Half a) {
493   return calc_digamma(static_cast<float>(a));
494 }
495 
496 template <typename scalar_t, bool is_cuda=false>
calc_polygamma(scalar_t x,int n)497 inline C10_HOST_DEVICE scalar_t calc_polygamma(scalar_t x, int n) {
498   // already blocked if n <= 1
499   const auto one = scalar_t{1};
500   return ((n % 2) ? one : -one) *
501       std::exp(std::lgamma(static_cast<scalar_t>(n) + one)) *
502       zeta<scalar_t, is_cuda>(static_cast<scalar_t>(n + 1), x);
503 }
504 
505 // regularized lower incomplete gamma
506 // the regularized lower, upper incomplete gamma, as well as their
507 // helper functions follow SciPy's implementation
508 
509 /* References
510  * [igam1] "The Digital Library of Mathematical Functions", dlmf.nist.gov
511  * [igam2] Maddock et al., "Incomplete Gamma Functions",
512  *     https://www.boost.org/doc/libs/1_61_0/libs/math/doc/html/math_toolkit/sf_gamma/igamma.html
513  */
514 
515 /*
516  * This implementation of the regularized incomplete gamma functions and
517  * their helper functions are derived from the implementation of SciPy's
518  * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations.
519  * See NOTICE for the licenses.
520  */
521 template <typename scalar_t>
ratevl(scalar_t x,const scalar_t num[],int64_t M,const scalar_t denom[],int64_t N)522 scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M,
523     const scalar_t denom[], int64_t N) {
524   // evaluating rational function, i.e., the ratio of two polynomials
525   // the coefficients for numerator are given by `num` while coeffs for
526   // denumerator are given by `denom`
527 
528   int64_t i, dir;
529   scalar_t y, num_ans, denom_ans;
530   scalar_t absx = std::fabs(x);
531   const scalar_t *p;
532 
533   if (absx > 1) {
534     /* Evaluate as a polynomial in 1/x. */
535     dir = -1;
536     p = num + M;
537     y = 1 / x;
538   }
539   else {
540     dir = 1;
541     p = num;
542     y = x;
543   }
544 
545   /* Evaluate the numerator */
546   num_ans = *p;
547   p += dir;
548   for (i = 1; i <= M; i++) {
549     num_ans = num_ans * y + *p;
550     p += dir;
551   }
552   /* Evaluate the denominator */
553   if (absx > 1) {
554     p = denom + N;
555   }
556   else {
557     p = denom;
558   }
559 
560   denom_ans = *p;
561   p += dir;
562   for (i = 1; i <= N; i++) {
563     denom_ans = denom_ans * y + *p;
564     p += dir;
565   }
566   if (absx > 1) {
567     i = N - M;
568     return std::pow(x, i) * num_ans / denom_ans;
569   }
570   else {
571     return num_ans / denom_ans;
572   }
573 }
574 
575 // SciPy's lanczos implementation is taken from Boost
576 /* (C) Copyright John Maddock 2006.
577  * Use, modification and distribution are subject to the
578  * Boost Software License, Version 1.0. See
579  * https://www.boost.org/LICENSE_1_0.txt or see NOTICE.
580  */
581 template <typename scalar_t>
lanczos_sum_expg_scaled(scalar_t x)582 static scalar_t lanczos_sum_expg_scaled(scalar_t x) {
583   // lanczos approximation
584   static const scalar_t lanczos_sum_expg_scaled_num[13] = {
585     0.006061842346248906525783753964555936883222,
586     0.5098416655656676188125178644804694509993,
587     19.51992788247617482847860966235652136208,
588     449.9445569063168119446858607650988409623,
589     6955.999602515376140356310115515198987526,
590     75999.29304014542649875303443598909137092,
591     601859.6171681098786670226533699352302507,
592     3481712.15498064590882071018964774556468,
593     14605578.08768506808414169982791359218571,
594     43338889.32467613834773723740590533316085,
595     86363131.28813859145546927288977868422342,
596     103794043.1163445451906271053616070238554,
597     56906521.91347156388090791033559122686859
598   };
599   static const scalar_t lanczos_sum_expg_scaled_denom[13] = {
600     1.,
601     66.,
602     1925.,
603     32670.,
604     357423.,
605     2637558.,
606     13339535.,
607     45995730.,
608     105258076.,
609     150917976.,
610     120543840.,
611     39916800.,
612     0.
613   };
614   return ratevl(x, lanczos_sum_expg_scaled_num,
615       sizeof(lanczos_sum_expg_scaled_num) / sizeof(lanczos_sum_expg_scaled_num[0]) - 1,
616       lanczos_sum_expg_scaled_denom,
617       sizeof(lanczos_sum_expg_scaled_denom) / sizeof(lanczos_sum_expg_scaled_denom[0]) - 1);
618 }
619 
620 template <typename scalar_t>
_igam_helper_fac(scalar_t a,scalar_t x)621 static scalar_t _igam_helper_fac(scalar_t a, scalar_t x) {
622   // compute x^a * exp(-a) / gamma(a)
623   // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with
624   // exp(a - x).
625 
626   scalar_t ax, fac, res, num, numfac;
627   static scalar_t MAXLOG = std::is_same<scalar_t,double>::value ?
628     7.09782712893383996843E2 : 88.72283905206835;
629   static scalar_t EXP1 = 2.718281828459045;
630   static scalar_t lanczos_g = 6.024680040776729583740234375;
631 
632   if (std::fabs(a - x) > 0.4 * std::fabs(a)) {
633     ax = a * std::log(x) - x - std::lgamma(a);
634     if (ax < -MAXLOG) {
635       return 0.0;
636     }
637     return std::exp(ax);
638   }
639 
640   fac = a + lanczos_g - 0.5;
641   res = std::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a);
642 
643   if ((a < 200) && (x < 200)) {
644     res *= std::exp(a - x) * std::pow(x / fac, a);
645   }
646   else {
647     num = x - a - lanczos_g + 0.5;
648     numfac = num / fac;
649     res *= std::exp(a * (std::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac);
650   }
651   return res;
652 }
653 
654 template <typename scalar_t>
_igam_helper_series(scalar_t a,scalar_t x)655 static scalar_t _igam_helper_series(scalar_t a, scalar_t x) {
656   // Compute igam using DLMF 8.11.4. [igam1]
657   static scalar_t MACHEP = std::is_same<scalar_t, double>::value ?
658     1.11022302462515654042E-16 : 5.9604644775390625E-8;
659   static int MAXITER = 2000;
660 
661   int i;
662   scalar_t ans, ax, c, r;
663 
664   ax = _igam_helper_fac(a, x);
665   if (ax == 0.0) {
666     return 0.0;
667   }
668 
669   /* power series */
670   r = a;
671   c = 1.0;
672   ans = 1.0;
673 
674   for (i = 0; i < MAXITER; i++) {
675     r += 1.0;
676     c *= x / r;
677     ans += c;
678     if (c <= MACHEP * ans) {
679       break;
680     }
681   }
682   return (ans * ax / a);
683 }
684 
685 template <typename scalar_t>
_igamc_helper_series(scalar_t a,scalar_t x)686 static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
687   // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in
688   // _igam_helper_series but extra care is taken to avoid cancellation.
689 
690   int n;
691   scalar_t fac = 1;
692   scalar_t sum = 0;
693   scalar_t term, logx;
694   static scalar_t MAXITER = 2000;
695   static scalar_t MACHEP = std::is_same<scalar_t, double>::value ?
696     1.11022302462515654042E-16 : 5.9604644775390625E-8;
697 
698   for (n = 1; n < MAXITER; n++) {
699     fac *= -x / n;
700     term = fac / (a + n);
701     sum += term;
702     if (std::fabs(term) <= MACHEP * std::fabs(sum)) {
703         break;
704     }
705   }
706 
707   logx = std::log(x);
708   term = -std::expm1(a * logx - std::lgamma(1+a));
709   return term - std::exp(a * logx - std::lgamma(a)) * sum;
710 }
711 
712 template <typename scalar_t>
_igam_helper_asymptotic_series(scalar_t a,scalar_t x,bool igam)713 static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) {
714   // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]
715   static const scalar_t d[25][25] =
716     {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2,
717       1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4,
718       3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6,
719       8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9,
720       1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10,
721       -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11,
722       -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13,
723       -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16,
724       -1.9752288294349443e-15},
725     {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3,
726       -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7,
727       -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6,
728       4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8,
729       1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9,
730       4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14,
731       7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13,
732       -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14,
733       -4.13125571381061e-15},
734     {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4,
735       2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5,
736       -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6,
737       -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10,
738       -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9,
739       9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11,
740       1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12,
741       4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17,
742       8.8592218725911273e-15},
743     {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4,
744       2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7,
745       1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6,
746       -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8,
747       -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9,
748       -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14,
749       -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12,
750       6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14,
751       2.0453671226782849e-14},
752     {-8.618882909167117e-4, 7.8403922172006663e-4, -2.9907248030319018e-4,
753       -1.4638452578843418e-6, 6.6414982154651222e-5, -3.9683650471794347e-5,
754       1.1375726970678419e-5, 2.5074972262375328e-10, -1.6954149536558306e-6,
755       8.9075075322053097e-7, -2.2929348340008049e-7, 2.956794137544049e-11,
756       2.8865829742708784e-8, -1.4189739437803219e-8, 3.4463580499464897e-9,
757       -2.3024517174528067e-13, -3.9409233028046405e-10, 1.8602338968504502e-10,
758       -4.356323005056618e-11, 1.2786001016296231e-15, 4.6792750266579195e-12,
759       -2.1492464706134829e-12, 4.9088156148096522e-13, -6.3385914848915603e-18,
760       -5.0453320690800944e-14},
761     {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4,
762       -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7,
763       -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6,
764       -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7,
765       4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9,
766       3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15,
767       9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11,
768       -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13,
769       -1.3249659916340829e-13},
770     {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4,
771       7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5,
772       -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6,
773       -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13,
774       -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8,
775       8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10,
776       2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11,
777       1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18,
778       3.6902800842763467e-13},
779     {3.4436760689237767e-4, 5.1717909082605922e-5, -3.3493161081142236e-4,
780       2.812695154763237e-4, -1.0976582244684731e-4, -1.2741009095484485e-7,
781       2.7744451511563644e-5, -1.8263488805711333e-5, 5.7876949497350524e-6,
782       4.9387589339362704e-10, -1.0595367014026043e-6, 6.1667143761104075e-7,
783       -1.7562973359060462e-7, -1.2974473287015439e-12, 2.695423606288966e-8,
784       -1.4578352908731271e-8, 3.887645959386175e-9, -3.8810022510194121e-17,
785       -5.3279941738772867e-10, 2.7437977643314845e-10, -6.9957960920705679e-11,
786       2.5899863874868481e-17, 8.8566890996696381e-12, -4.403168815871311e-12,
787       1.0865561947091654e-12},
788     {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4,
789       -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4,
790       4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5,
791       6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11,
792       3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8,
793       6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9,
794       -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10,
795       -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18,
796       -3.3721464474854592e-12},
797     {-5.9676129019274625e-4, -7.2048954160200106e-5, 6.7823088376673284e-4,
798       -6.4014752602627585e-4, 2.7750107634328704e-4, 1.8197008380465151e-7,
799       -8.4795071170685032e-5, 6.105192082501531e-5, -2.1073920183404862e-5,
800       -8.8585890141255994e-10, 4.5284535953805377e-6, -2.8427815022504408e-6,
801       8.7082341778646412e-7, 3.6886101871706965e-12, -1.5344695190702061e-7,
802       8.862466778790695e-8, -2.5184812301826817e-8, -1.0225912098215092e-14,
803       3.8969470758154777e-9, -2.1267304792235635e-9, 5.7370135528051385e-10,
804       -1.887749850169741e-19, -8.0931538694657866e-11, 4.2382723283449199e-11,
805       -1.1002224534207726e-11},
806     {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3,
807       9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4,
808       -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5,
809       -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11,
810       -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7,
811       -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8,
812       1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9,
813       9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18,
814       3.7647749553543836e-11},
815     {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3,
816       2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7,
817       3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4,
818       2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5,
819       -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6,
820       -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14,
821       -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9,
822       -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10,
823       1.3481607129399749e-10},
824     {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3,
825       -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3,
826       8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4,
827       1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10,
828       1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6,
829       7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7,
830       -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8,
831       -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20,
832       -5.0423112718105824e-10},
833     {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3,
834       -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6,
835       -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4,
836       -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4,
837       4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5,
838       6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13,
839       3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8,
840       8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9,
841       -1.9661464453856102e-9},
842     {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2,
843       7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2,
844       -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3,
845       -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10,
846       -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5,
847       -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6,
848       1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7,
849       1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17,
850       7.9795091026746235e-9},
851     {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2,
852       5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6,
853       1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3,
854       3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3,
855       -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4,
856       -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12,
857       -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6,
858       -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7,
859       3.3654425209171788e-8},
860     {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1,
861       -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2,
862       4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2,
863       1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9,
864       1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4,
865       1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5,
866       -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6,
867       -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16,
868       -1.4729737374018841e-7},
869     {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1,
870       -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5,
871       -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2,
872       -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2,
873       5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3,
874       1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12,
875       8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5,
876       3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6,
877       -6.6812849447625594e-7},
878     {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968,
879       1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1,
880       -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1,
881       -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8,
882       -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3,
883       -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3,
884       3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5,
885       5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14,
886       3.1369106244517615e-6},
887     {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906,
888       4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4,
889       1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1,
890       1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1,
891       -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2,
892       -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11,
893       -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4,
894       9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5,
895       1.5227271505597605e-5},
896     {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1,
897       -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1,
898       5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816,
899       2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7,
900       3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1,
901       8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2,
902       -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3,
903       -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11,
904       -7.6340103696869031e-5},
905     {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1,
906       -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3,
907       -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1,
908       -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195,
909       1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1,
910       3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10,
911       3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3,
912       -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3,
913       -3.9479941246822517e-4},
914     {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2,
915       1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2,
916       -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1,
917       -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7,
918       -6.2716159907747034, 5.1168999071852637, -2.0319658112299095,
919       -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1,
920       1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2,
921       2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6,
922       2.1250180774699461e-3},
923     {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2,
924       7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2,
925       3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2,
926       1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1,
927       -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373,
928       -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7,
929       -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1,
930       1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2,
931       1.5109265210467774e-2},
932     {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3,
933       -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3,
934       1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2,
935       7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6,
936       1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1,
937       -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1,
938       -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468,
939       -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1,
940       4.8683443692930507e-1}};
941 
942   int k, n, sgn;
943   int maxpow = 0;
944   static scalar_t MACHEP = std::is_same<scalar_t, double>::value ?
945     1.11022302462515654042E-16 : 5.9604644775390625E-8;
946   scalar_t lambda = x / a;
947   scalar_t sigma = (x - a) / a;
948   scalar_t eta, res, ck, ckterm, term, absterm;
949   scalar_t absoldterm = INFINITY;
950   scalar_t etapow[25] = {1};
951   scalar_t sum = 0;
952   scalar_t afac = 1;
953 
954   if (igam) {
955     sgn = -1;
956   }
957   else {
958     sgn = 1;
959   }
960 
961   if (lambda > 1) {
962     eta = std::sqrt(-2 * (std::log1p(sigma) - sigma));
963   }
964   else if (lambda < 1) {
965     eta = -std::sqrt(-2 * (std::log1p(sigma) - sigma));
966   }
967   else {
968     eta = 0;
969   }
970   res = 0.5 * std::erfc(sgn * eta * std::sqrt(a / 2));
971 
972   for (k = 0; k < 25; k++) {
973     ck = d[k][0];
974     for (n = 1; n < 25; n++) {
975       if (n > maxpow) {
976         etapow[n] = eta * etapow[n-1];
977         maxpow += 1;
978       }
979       ckterm = d[k][n]*etapow[n];
980       ck += ckterm;
981       if (std::fabs(ckterm) < MACHEP * std::fabs(ck)) {
982         break;
983       }
984     }
985     term = ck * afac;
986     absterm = std::fabs(term);
987     if (absterm > absoldterm) {
988       break;
989     }
990     sum += term;
991     if (absterm < MACHEP * std::fabs(sum)) {
992       break;
993     }
994     absoldterm = absterm;
995     afac /= a;
996   }
997   res += sgn * std::exp(-0.5 * a * eta * eta) * sum / std::sqrt(2 * c10::pi<float> * a);
998 
999   return res;
1000 }
1001 
1002 template <typename scalar_t>
_igamc_helper_continued_fraction(scalar_t a,scalar_t x)1003 static scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) {
1004   // Compute igamc using DLMF 8.9.2. [igam1]
1005   int i;
1006   scalar_t ans, ax, c, yc, r, t, y, z;
1007   scalar_t pk, pkm1, pkm2, qk, qkm1, qkm2;
1008   int MAXITER = 2000;
1009   static scalar_t MACHEP = std::is_same<scalar_t, double>::value ?
1010     1.11022302462515654042E-16 : 5.9604644775390625E-8;
1011   static scalar_t BIG = std::is_same<scalar_t,double>::value ?
1012     4.503599627370496e15 : 16777216.;
1013   static scalar_t BIGINV = std::is_same<scalar_t,double>::value ?
1014     2.22044604925031308085e-16 : 5.9604644775390625E-8;
1015 
1016   ax = _igam_helper_fac(a, x);
1017   if (ax == 0.0) {
1018     return 0.0;
1019   }
1020 
1021   /* continued fraction */
1022   y = 1.0 - a;
1023   z = x + y + 1.0;
1024   c = 0.0;
1025   pkm2 = 1.0;
1026   qkm2 = x;
1027   pkm1 = x + 1.0;
1028   qkm1 = z * x;
1029   ans = pkm1 / qkm1;
1030 
1031   for (i = 0; i < MAXITER; i++) {
1032     c += 1.0;
1033     y += 1.0;
1034     z += 2.0;
1035     yc = y * c;
1036     pk = pkm1 * z - pkm2 * yc;
1037     qk = qkm1 * z - qkm2 * yc;
1038     if (qk != 0) {
1039       r = pk / qk;
1040       t = std::fabs((ans - r) / r);
1041       ans = r;
1042     }
1043     else {
1044       t = 1.0;
1045     }
1046     pkm2 = pkm1;
1047     pkm1 = pk;
1048     qkm2 = qkm1;
1049     qkm1 = qk;
1050     if (std::fabs(pk) > BIG) {
1051       pkm2 *= BIGINV;
1052       pkm1 *= BIGINV;
1053       qkm2 *= BIGINV;
1054       qkm1 *= BIGINV;
1055     }
1056     if (t <= MACHEP) {
1057       break;
1058     }
1059   }
1060   return ans * ax;
1061 }
1062 
1063 template <typename scalar_t>
calc_igammac(scalar_t a,scalar_t x)1064 inline scalar_t calc_igammac(scalar_t a, scalar_t x) {
1065   /* the calculation of the regularized upper incomplete gamma function
1066    * is done differently based on the values of a and x:
1067    * - if x and/or a is at the boundary of defined region, then assign the
1068    *   result at the boundary
1069    * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
1070    *   Large Parameter (see DLMF 8.12.4 [igam1])
1071    * - if x > 1.1 and x < a, using the substraction from the regularized lower
1072    *   incomplete gamma
1073    * - otherwise, calculate the series from [igam2] eq (5)
1074    */
1075   scalar_t absxma_a;
1076 
1077   static scalar_t SMALL = 20.0;
1078   static scalar_t LARGE = 200.0;
1079   static scalar_t SMALLRATIO = 0.3;
1080   static scalar_t LARGERATIO = 4.5;
1081 
1082   // note that in SciPy, a and x are non-negative, with exclusive 0s (i.e.,
1083   // at most 1 of them can be 0), where igammac(0, x) = 0.0 iff x > 0.
1084   if ((x < 0) || (a < 0)) {
1085     // out of defined-region of the function
1086     return std::numeric_limits<scalar_t>::quiet_NaN();
1087   }
1088   else if (a == 0) {
1089     if (x > 0) {
1090       return 0.0;
1091     }
1092     else {
1093       return std::numeric_limits<scalar_t>::quiet_NaN();
1094     }
1095   }
1096   else if (x == 0) {
1097     return 1.0;
1098   }
1099   else if (std::isinf(a)) {
1100     if (std::isinf(x)) {
1101       return std::numeric_limits<scalar_t>::quiet_NaN();
1102     }
1103     return 1.0;
1104   }
1105   else if (std::isinf(x)) {
1106     return 0.0;
1107   }
1108 
1109   absxma_a = std::fabs(x - a) / a;
1110   if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) {
1111      return _igam_helper_asymptotic_series(a, x, 0);
1112   }
1113   else if ((a > LARGE) && (absxma_a < LARGERATIO / std::sqrt(a))) {
1114      return _igam_helper_asymptotic_series(a, x, 0);
1115   }
1116 
1117   if (x > 1.1) {
1118     if (x < a) {
1119       return 1.0 - _igam_helper_series(a, x);
1120     }
1121     else {
1122       return _igamc_helper_continued_fraction(a, x);
1123     }
1124   }
1125   else if (x <= 0.5) {
1126     if (-0.4 / std::log(x) < a) {
1127       return 1.0 - _igam_helper_series(a, x);
1128     }
1129     else {
1130       return _igamc_helper_series(a, x);
1131     }
1132   }
1133   else {
1134     if (x * 1.1 < a) {
1135       return 1.0 - _igam_helper_series(a, x);
1136     }
1137     else {
1138       return _igamc_helper_series(a, x);
1139     }
1140   }
1141 }
1142 
1143 template <typename scalar_t>
calc_igamma(scalar_t a,scalar_t x)1144 scalar_t calc_igamma(scalar_t a, scalar_t x) {
1145   /* the calculation of the regularized lower incomplete gamma function
1146    * is done differently based on the values of a and x:
1147    * - if x and/or a is at the boundary of defined region, then assign the
1148    *   result at the boundary
1149    * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
1150    *   Large Parameter (see DLMF 8.12.3 [igam1])
1151    * - if x > 1 and x > a, using the substraction from the regularized upper
1152    *   incomplete gamma
1153    * - otherwise, calculate the series from [igam2] eq (4)
1154    */
1155   scalar_t absxma_a;
1156   static scalar_t SMALL = 20.0;
1157   static scalar_t LARGE = 200.0;
1158   static scalar_t SMALLRATIO = 0.3;
1159   static scalar_t LARGERATIO = 4.5;
1160 
1161   // boundary values following SciPy
1162   // note that in SciPy, a and x are non-negative, with exclusive 0s (i.e.,
1163   // at most 1 of them can be 0), where igamma(0, x) = 1.0 iff x > 0.
1164   if ((x < 0) || (a < 0)) {
1165     // out of defined-region of the function
1166     return std::numeric_limits<scalar_t>::quiet_NaN();
1167   }
1168   else if (a == 0) {
1169     if (x > 0) {
1170       return 1.0;
1171     }
1172     else {
1173       return std::numeric_limits<scalar_t>::quiet_NaN();
1174     }
1175   }
1176   else if (x == 0) {
1177     return 0.0; // zero integration limit
1178   }
1179   else if (std::isinf(a)) {
1180     if (std::isinf(x)) {
1181       return std::numeric_limits<scalar_t>::quiet_NaN();
1182     }
1183     return 0.0;
1184   }
1185   else if (std::isinf(x)) {
1186     return 1.0;
1187   }
1188 
1189   /* Asymptotic regime where a ~ x. See [igam2] */
1190   absxma_a = std::fabs(x - a) / a;
1191   if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) {
1192     return _igam_helper_asymptotic_series(a, x, 1);
1193   }
1194   else if ((a > LARGE) && (absxma_a < LARGERATIO / std::sqrt(a))) {
1195     return _igam_helper_asymptotic_series(a, x, 1);
1196   }
1197 
1198   if ((x > 1.0) && (x > a)) {
1199     return 1.0 - calc_igammac(a, x);
1200   }
1201 
1202   return _igam_helper_series(a, x);
1203 }
1204 
1205 template <>
1206 C10_UNUSED inline c10::BFloat16 calc_igamma<c10::BFloat16>(c10::BFloat16 a, c10::BFloat16 x) {
1207   return calc_igamma<float>(float(a), float(x));
1208 }
1209 
1210 template <>
1211 C10_UNUSED inline c10::Half calc_igamma<c10::Half>(c10::Half a, c10::Half x) {
1212   return calc_igamma<float>(float(a), float(x));
1213 }
1214 
1215 template <>
1216 C10_UNUSED inline c10::BFloat16 calc_igammac<c10::BFloat16>(c10::BFloat16 a, c10::BFloat16 x) {
1217   return calc_igammac<float>(float(a), float(x));
1218 }
1219 
1220 template <>
1221 C10_UNUSED inline c10::Half calc_igammac<c10::Half>(c10::Half a, c10::Half x) {
1222   return calc_igammac<float>(float(a), float(x));
1223 }
1224 
calc_erfinv(c10::BFloat16 a)1225 inline c10::BFloat16 calc_erfinv(c10::BFloat16 a) { return calc_erfinv(float(a)); }
1226 
1227 template <typename T>
abs_impl(T v)1228 inline T abs_impl(T v) {
1229   return std::abs(v);
1230 }
1231 
1232 template <>
abs_impl(uint8_t v)1233 C10_UNUSED inline uint8_t abs_impl(uint8_t v) {
1234   return v;
1235 }
1236 
1237 template <typename T>
1238 inline typename std::enable_if<std::is_integral<T>::value, T>::type
calc_gcd(T a,T b)1239 calc_gcd(T a, T b) {
1240   a = abs_impl(a);
1241   b = abs_impl(b);
1242   while (a != 0) {
1243     T c = a;
1244     a = b % a;
1245     b = c;
1246   }
1247   return b;
1248 }
1249 
1250 template <typename T>
exp2_impl(T x)1251 C10_HOST_DEVICE T exp2_impl(T x) {
1252   return std::exp2(x);
1253 }
1254 
1255 template <typename T>
exp2_impl(c10::complex<T> x)1256 C10_HOST_DEVICE c10::complex<T> exp2_impl(c10::complex<T> x) {
1257   // There is no std::exp2 overload for complex, so instead
1258   // use the identity 2^x = e^(ln(2) * x)
1259   constexpr auto ln2 = c10::ln_2<T>;
1260   return std::exp(ln2 * x);
1261 }
1262 
1263 /*
1264  * This function is derived from the implementation of the chbevl function in the Cephes Math Library.
1265  * See note [3-Clause BSD License for the Cephes Math Library].
1266  *
1267  * Evaluates the series
1268  *
1269  *       len-1
1270  *         - '
1271  *  y  =   >   array[i] T (x/2)
1272  *         -             i
1273  *        i=0
1274  *
1275  * of Chebyshev polynomials Ti at argument x/2.
1276  *
1277  * Coefficients are stored in reverse order, i.e. the zero order term is last in the array.  Note len is the number of
1278  * coefficients, not the order.
1279  *
1280  * If coefficients are for the interval a to b, x must have been transformed to x -> 2(2x - b - a)/(b-a) before
1281  * entering the routine.  This maps x from (a, b) to (-1, 1), over which the Chebyshev polynomials are defined.
1282  *
1283  * If the coefficients are for the inverted interval, in which (a, b) is mapped to (1/b, 1/a), the transformation
1284  * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity, this becomes x -> 4a/x - 1.
1285  */
1286 template <typename T>
1287 inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
chbevl(const T x,const T array[],size_t len)1288 chbevl(const T x, const T array[], size_t len) {
1289   T b0, b1, b2;
1290 
1291   b0 = array[0];
1292   b1 = static_cast<T>(0.0);
1293 
1294   for (size_t i = 1; i < len; ++i) {
1295     b2 = b1;
1296     b1 = b0;
1297     b0 = x * b1 - b2 + array[i];
1298   }
1299 
1300   return (static_cast<T>(0.5) * (b0 - b2));
1301 }
1302 
1303 /*
1304  * This function is derived from the implementation of the i0 function in the Cephes Math Library.
1305  * See note [3-Clause BSD License for the Cephes Math Library].
1306  *
1307  * Computes an approximation of the zeroth order modified Bessel function of the first kind.
1308  * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion.
1309  * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value
1310  * of all inputs to convert them into the domain of the approximation.
1311  */
1312 template <typename T>
chebyshev_coefficients_i0e_A()1313 inline std::tuple<const T*, size_t> chebyshev_coefficients_i0e_A() {
1314   /* Chebyshev coefficients for exp(-x) I0(x)
1315    * in the interval [0,8].
1316    *
1317    * lim(x->0){ exp(-x) I0(x) } = 1.
1318    */
1319   static const T coeff[] = {
1320       -4.41534164647933937950E-18, 3.33079451882223809783E-17,
1321       -2.43127984654795469359E-16, 1.71539128555513303061E-15,
1322       -1.16853328779934516808E-14, 7.67618549860493561688E-14,
1323       -4.85644678311192946090E-13, 2.95505266312963983461E-12,
1324       -1.72682629144155570723E-11, 9.67580903537323691224E-11,
1325       -5.18979560163526290666E-10, 2.65982372468238665035E-9,
1326       -1.30002500998624804212E-8,  6.04699502254191894932E-8,
1327       -2.67079385394061173391E-7,  1.11738753912010371815E-6,
1328       -4.41673835845875056359E-6,  1.64484480707288970893E-5,
1329       -5.75419501008210370398E-5,  1.88502885095841655729E-4,
1330       -5.76375574538582365885E-4,  1.63947561694133579842E-3,
1331       -4.32430999505057594430E-3,  1.05464603945949983183E-2,
1332       -2.37374148058994688156E-2,  4.93052842396707084878E-2,
1333       -9.49010970480476444210E-2,  1.71620901522208775349E-1,
1334       -3.04682672343198398683E-1,  6.76795274409476084995E-1};
1335   return std::make_tuple(coeff, 30);
1336 };
1337 
1338 template <typename T>
chebyshev_coefficients_i0e_B()1339 inline std::tuple<const T*, size_t> chebyshev_coefficients_i0e_B() {
1340   /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
1341    * in the inverted interval [8,infinity].
1342    *
1343    * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
1344    */
1345   static const T coeff[] = {
1346       -7.23318048787475395456E-18, -4.83050448594418207126E-18,
1347       4.46562142029675999901E-17,  3.46122286769746109310E-17,
1348       -2.82762398051658348494E-16, -3.42548561967721913462E-16,
1349       1.77256013305652638360E-15,  3.81168066935262242075E-15,
1350       -9.55484669882830764870E-15, -4.15056934728722208663E-14,
1351       1.54008621752140982691E-14,  3.85277838274214270114E-13,
1352       7.18012445138366623367E-13,  -1.79417853150680611778E-12,
1353       -1.32158118404477131188E-11, -3.14991652796324136454E-11,
1354       1.18891471078464383424E-11,  4.94060238822496958910E-10,
1355       3.39623202570838634515E-9,   2.26666899049817806459E-8,
1356       2.04891858946906374183E-7,   2.89137052083475648297E-6,
1357       6.88975834691682398426E-5,   3.36911647825569408990E-3,
1358       8.04490411014108831608E-1};
1359 
1360   return std::make_tuple(coeff, 25);
1361 };
1362 
1363 template <typename T>
1364 inline typename std::enable_if<std::is_same<double, T>::value, std::tuple<const T*, size_t>>::type
chebyshev_coefficients_i1e_A()1365 chebyshev_coefficients_i1e_A() {
1366   /* Chebyshev coefficients for exp(-x) I1(x)
1367    * in the interval [0,8].
1368    *
1369    * lim(x->0){ exp(-x) I1(x) / x } = 1/2.
1370    */
1371   static const T coeff[] = {
1372       2.77791411276104639959E-18, -2.11142121435816608115E-17,
1373       1.55363195773620046921E-16, -1.10559694773538630805E-15,
1374       7.60068429473540693410E-15, -5.04218550472791168711E-14,
1375       3.22379336594557470981E-13, -1.98397439776494371520E-12,
1376       1.17361862988909016308E-11, -6.66348972350202774223E-11,
1377       3.62559028155211703701E-10, -1.88724975172282928790E-9,
1378       9.38153738649577178388E-9,  -4.44505912879632808065E-8,
1379       2.00329475355213526229E-7,  -8.56872026469545474066E-7,
1380       3.47025130813767847674E-6,  -1.32731636560394358279E-5,
1381       4.78156510755005422638E-5,  -1.61760815825896745588E-4,
1382       5.12285956168575772895E-4,  -1.51357245063125314899E-3,
1383       4.15642294431288815669E-3,  -1.05640848946261981558E-2,
1384       2.47264490306265168283E-2,  -5.29459812080949914269E-2,
1385       1.02643658689847095384E-1,  -1.76416518357834055153E-1,
1386       2.52587186443633654823E-1};
1387   return std::make_tuple(coeff, 29);
1388 };
1389 
1390 template <typename T>
1391 inline typename std::enable_if<std::is_same<float, T>::value, std::tuple<const T*, size_t>>::type
chebyshev_coefficients_i1e_A()1392 chebyshev_coefficients_i1e_A() {
1393   /* Chebyshev coefficients for exp(-x) I1(x)
1394    * in the interval [0,8].
1395    *
1396    * lim(x->0){ exp(-x) I1(x) / x } = 1/2.
1397    */
1398   static const T coeff[] = {
1399       9.38153738649577178388E-9f,
1400       -4.44505912879632808065E-8f,
1401       2.00329475355213526229E-7f,
1402       -8.56872026469545474066E-7f,
1403       3.47025130813767847674E-6f,
1404       -1.32731636560394358279E-5f,
1405       4.78156510755005422638E-5f,
1406       -1.61760815825896745588E-4f,
1407       5.12285956168575772895E-4f,
1408       -1.51357245063125314899E-3f,
1409       4.15642294431288815669E-3f,
1410       -1.05640848946261981558E-2f,
1411       2.47264490306265168283E-2f,
1412       -5.29459812080949914269E-2f,
1413       1.02643658689847095384E-1f,
1414       -1.76416518357834055153E-1f,
1415       2.52587186443633654823E-1f};
1416   return std::make_tuple(coeff, 17);
1417 };
1418 
1419 template <typename T>
1420 inline typename std::enable_if<std::is_same<double, T>::value, std::tuple<const T*, size_t>>::type
chebyshev_coefficients_i1e_B()1421 chebyshev_coefficients_i1e_B() {
1422   /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x)
1423    * in the inverted interval [8,infinity].
1424    *
1425    * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi).
1426    */
1427   static const T coeff[] = {
1428       7.51729631084210481353E-18,  4.41434832307170791151E-18,
1429       -4.65030536848935832153E-17, -3.20952592199342395980E-17,
1430       2.96262899764595013876E-16,  3.30820231092092828324E-16,
1431       -1.88035477551078244854E-15, -3.81440307243700780478E-15,
1432       1.04202769841288027642E-14,  4.27244001671195135429E-14,
1433       -2.10154184277266431302E-14, -4.08355111109219731823E-13,
1434       -7.19855177624590851209E-13, 2.03562854414708950722E-12,
1435       1.41258074366137813316E-11,  3.25260358301548823856E-11,
1436       -1.89749581235054123450E-11, -5.58974346219658380687E-10,
1437       -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
1438       -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
1439       -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
1440       7.78576235018280120474E-1};
1441 
1442   return std::make_tuple(coeff, 25);
1443 };
1444 
1445 template <typename T>
1446 inline typename std::enable_if<std::is_same<float, T>::value, std::tuple<const T*, size_t>>::type
chebyshev_coefficients_i1e_B()1447 chebyshev_coefficients_i1e_B() {
1448   /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x)
1449    * in the inverted interval [8,infinity].
1450    *
1451    * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi).
1452    */
1453   static const T coeff[] = {
1454       -3.83538038596423702205E-9f,
1455       -2.63146884688951950684E-8f,
1456       -2.51223623787020892529E-7f,
1457       -3.88256480887769039346E-6f,
1458       -1.10588938762623716291E-4f,
1459       -9.76109749136146840777E-3f,
1460       7.78576235018280120474E-1f};
1461 
1462   return std::make_tuple(coeff, 7);
1463 };
1464 
1465 template <typename T>
1466 inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
calc_i0(T _x)1467 calc_i0(T _x) {
1468   T x = std::abs(_x);
1469 
1470   if (x <= T{8.0}) {
1471     auto coeff_pair = chebyshev_coefficients_i0e_A<T>();
1472     auto A = std::get<0>(coeff_pair);
1473     auto len = std::get<1>(coeff_pair);
1474     T y = (x / T{2.0}) - T{2.0};
1475     return static_cast<T>(std::exp(x) * chbevl(y, A, len));
1476   }
1477   auto coeff_pair = chebyshev_coefficients_i0e_B<T>();
1478   auto B = std::get<0>(coeff_pair);
1479   auto len = std::get<1>(coeff_pair);
1480   return std::exp(x) * chbevl(T{32.0} / x - T{2.0}, B, len) / std::sqrt(x);
1481 }
1482 
1483 // Upcast bfloat16 input to float for numerical accuracy purposes
calc_i0(c10::BFloat16 a)1484 inline c10::BFloat16 calc_i0(c10::BFloat16 a) { return calc_i0(static_cast<float>(a)); }
1485 
1486 /*
1487  * This function is derived from the implementation of the i1 function in the Cephes Math Library.
1488  * See note [3-Clause BSD License for the Cephes Math Library].
1489  *
1490  * Computes an approximation of the first order modified Bessel function of the first kind.
1491  * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion.
1492  * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value
1493  * of all inputs to convert them into the domain of the approximation.
1494  */
1495 template <typename T>
1496 inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
calc_i1(T _x)1497 calc_i1(T _x) {
1498   T x = std::abs(_x);
1499 
1500   if (x <= T{8.0}) {
1501     auto coeff_pair = chebyshev_coefficients_i1e_A<T>();
1502     auto A = std::get<0>(coeff_pair);
1503     auto len = std::get<1>(coeff_pair);
1504     T y = (x / T{2.0}) - T{2.0};
1505     const T out = std::exp(x) * x * chbevl(y, A, len);
1506     return (_x < T{0.0}) ? -out : out;
1507   }
1508   auto coeff_pair = chebyshev_coefficients_i1e_B<T>();
1509   auto B = std::get<0>(coeff_pair);
1510   auto len = std::get<1>(coeff_pair);
1511   const T out = (std::exp(x) * chbevl(T{32.0} / x - T{2.0}, B, len)) / std::sqrt(x);
1512   return (_x < T{0.0}) ? -out : out;
1513 }
1514 
1515 /*
1516  * This function is derived from the implementation of the i1e function in the Cephes Math Library.
1517  * See note [3-Clause BSD License for the Cephes Math Library].
1518  *
1519  * Computes an approximation of the exponentially scaled first order modified Bessel function of the first kind.
1520  * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion.
1521  * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value
1522  * of all inputs to convert them into the domain of the approximation.
1523  */
1524 template <typename T>
1525 inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
calc_i1e(T _x)1526 calc_i1e(T _x) {
1527   T x = std::abs(_x);
1528 
1529   if (x <= T{8.0}) {
1530     auto coeff_pair = chebyshev_coefficients_i1e_A<T>();
1531     auto A = std::get<0>(coeff_pair);
1532     auto len = std::get<1>(coeff_pair);
1533     T y = (x / T{2.0}) - T{2.0};
1534     const T out = chbevl(y, A, len) * x;
1535     return (_x < T{0.0}) ? -out : out;
1536   }
1537   auto coeff_pair = chebyshev_coefficients_i1e_B<T>();
1538   auto B = std::get<0>(coeff_pair);
1539   auto len = std::get<1>(coeff_pair);
1540   const auto out = chbevl(T{32.0} / x - T{2.0}, B, len) / std::sqrt(x);
1541   return (_x < T{0.0}) ? -out : out;
1542 }
1543 
1544 /*
1545  * This function is derived from the implementation of the i1e function in the Cephes Math Library.
1546  * See note [3-Clause BSD License for the Cephes Math Library].
1547  *
1548  * Computes the argument, x, for which the area under the Gaussian probability density function
1549  * (integrated from minus infinity to x) is equal to y.
1550  */
1551 template <typename T>
calc_ndtri(T y0)1552 inline C10_HOST_DEVICE T calc_ndtri(T y0) {
1553 
1554   /* sqrt(2pi) */
1555   constexpr T s2pi = 2.50662827463100050242E0;
1556   constexpr T one = 1;
1557   constexpr T zero = 0;
1558 
1559   /* approximation for 0 <= |y - 0.5| <= 3/8 */
1560   static const T P0[5] = {
1561       -5.99633501014107895267E1,
1562       9.80010754185999661536E1,
1563       -5.66762857469070293439E1,
1564       1.39312609387279679503E1,
1565       -1.23916583867381258016E0,
1566   };
1567 
1568   static const T Q0[9] = {
1569       1.00000000000000000000E0,
1570       1.95448858338141759834E0,
1571       4.67627912898881538453E0,
1572       8.63602421390890590575E1,
1573       -2.25462687854119370527E2,
1574       2.00260212380060660359E2,
1575       -8.20372256168333339912E1,
1576       1.59056225126211695515E1,
1577       -1.18331621121330003142E0,
1578   };
1579 
1580   /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8
1581   * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14.
1582   */
1583   static const T P1[9] = {
1584       4.05544892305962419923E0,
1585       3.15251094599893866154E1,
1586       5.71628192246421288162E1,
1587       4.40805073893200834700E1,
1588       1.46849561928858024014E1,
1589       2.18663306850790267539E0,
1590       -1.40256079171354495875E-1,
1591       -3.50424626827848203418E-2,
1592       -8.57456785154685413611E-4,
1593   };
1594 
1595   static const T Q1[9] = {
1596       1.00000000000000000000E0,
1597       1.57799883256466749731E1,
1598       4.53907635128879210584E1,
1599       4.13172038254672030440E1,
1600       1.50425385692907503408E1,
1601       2.50464946208309415979E0,
1602       -1.42182922854787788574E-1,
1603       -3.80806407691578277194E-2,
1604       -9.33259480895457427372E-4,
1605   };
1606 
1607   /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64
1608   * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
1609   */
1610 
1611   static const T P2[9] = {
1612       3.23774891776946035970E0,
1613       6.91522889068984211695E0,
1614       3.93881025292474443415E0,
1615       1.33303460815807542389E0,
1616       2.01485389549179081538E-1,
1617       1.23716634817820021358E-2,
1618       3.01581553508235416007E-4,
1619       2.65806974686737550832E-6,
1620       6.23974539184983293730E-9,
1621   };
1622 
1623   static const T Q2[9] = {
1624       1.00000000000000000000E0,
1625       6.02427039364742014255E0,
1626       3.67983563856160859403E0,
1627       1.37702099489081330271E0,
1628       2.16236993594496635890E-1,
1629       1.34204006088543189037E-2,
1630       3.28014464682127739104E-4,
1631       2.89247864745380683936E-6,
1632       6.79019408009981274425E-9,
1633   };
1634 
1635   if (y0 == zero) {
1636     return -std::numeric_limits<T>::infinity();
1637   }
1638   if (y0 == one) {
1639     return std::numeric_limits<T>::infinity();
1640   }
1641   if (y0 < zero || y0 > one) {
1642     return std::numeric_limits<T>::quiet_NaN();
1643   }
1644   bool code = true;
1645   T y = y0;
1646   if (y > one - T{0.13533528323661269189}) { /* 0.135... = exp(-2) */
1647     y = one - y;
1648     code = false;
1649   }
1650 
1651   if (y > T{0.13533528323661269189}) {
1652     y = y - T{0.5};
1653     const T y2 = y * y;
1654     T x = y + y * (y2 * polevl(y2, P0, 4) / polevl(y2, Q0, 8));
1655     return (x * s2pi);
1656   }
1657 
1658   T x = ::sqrt(T{-2.0} * ::log(y));
1659   const T x0 = x - ::log(x) / x;
1660 
1661   const T z = one / x;
1662   T x1;
1663   if (x < T{8.0}) /* y > exp(-32) = 1.2664165549e-14 */
1664   {
1665     x1 = z * polevl(z, P1, 8) / polevl(z, Q1, 8);
1666   } else {
1667     x1 = z * polevl(z, P2, 8) / polevl(z, Q2, 8);
1668   }
1669   x = x0 - x1;
1670   if (code) {
1671     x = -x;
1672   }
1673   return x;
1674 }
1675 
1676 /* The next function is taken from http://ab-initio.mit.edu/Faddeev */
1677 
1678 /* Copyright (c) 2012 Massachusetts Institute of Technology
1679  *
1680  * Permission is hereby granted, free of charge, to any person obtaining
1681  * a copy of this software and associated documentation files (the
1682  * "Software"), to deal in the Software without restriction, including
1683  * without limitation the rights to use, copy, modify, merge, publish,
1684  * distribute, sublicense, and/or sell copies of the Software, and to
1685  * permit persons to whom the Software is furnished to do so, subject to
1686  * the following conditions:
1687  *
1688  * The above copyright notice and this permission notice shall be
1689  * included in all copies or substantial portions of the Software.
1690  *
1691  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
1692  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
1693  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
1694  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
1695  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
1696  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
1697  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1698  */
1699 
1700 /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by
1701    Steven G. Johnson, October 2012.
1702 
1703    This function combines a few different ideas.
1704 
1705    First, for x > 50, it uses a continued-fraction expansion (same as
1706    for the Faddeeva function, but with algebraic simplifications for z=i*x).
1707 
1708    Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations,
1709    but with two twists:
1710 
1711       a) It maps x to y = 4 / (4+x) in [0,1].  This simple transformation,
1712          inspired by a similar transformation in the octave-forge/specfun
1713          erfcx by Soren Hauberg, results in much faster Chebyshev convergence
1714          than other simple transformations I have examined.
1715 
1716       b) Instead of using a single Chebyshev polynomial for the entire
1717          [0,1] y interval, we break the interval up into 100 equal
1718          subintervals, with a switch/lookup table, and use much lower
1719          degree Chebyshev polynomials in each subinterval. This greatly
1720          improves performance in my tests.
1721 
1722    For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x),
1723    with the usual checks for overflow etcetera.
1724 
1725    Performance-wise, it seems to be substantially faster than either
1726    the SLATEC DERFC function [or an erfcx function derived therefrom]
1727    or Cody's CALERF function (from netlib.org/specfun), while
1728    retaining near machine precision in accuracy.  */
1729 
1730 /* Given y100=100*y, where y = 4/(4+x) for x >= 0, compute erfc(x).
1731 
1732    Uses a look-up table of 100 different Chebyshev polynomials
1733    for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated
1734    with the help of Maple and a little shell script.   This allows
1735    the Chebyshev polynomials to be of significantly lower degree (about 1/4)
1736    compared to fitting the whole [0,1] interval with a single polynomial. */
1737 
1738 
1739 template <typename T>
1740 C10_HOST_DEVICE  inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
erfcx_y100(T y100)1741 erfcx_y100(T y100)
1742 {
1743   switch (static_cast<int>(y100)) {
1744 case 0: {
1745 T t = 2*y100 - 1;
1746 return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t;
1747 }
1748 case 1: {
1749 T t = 2*y100 - 3;
1750 return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t;
1751 }
1752 case 2: {
1753 T t = 2*y100 - 5;
1754 return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t;
1755 }
1756 case 3: {
1757 T t = 2*y100 - 7;
1758 return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t;
1759 }
1760 case 4: {
1761 T t = 2*y100 - 9;
1762 return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t;
1763 }
1764 case 5: {
1765 T t = 2*y100 - 11;
1766 return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t;
1767 }
1768 case 6: {
1769 T t = 2*y100 - 13;
1770 return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t;
1771 }
1772 case 7: {
1773 T t = 2*y100 - 15;
1774 return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t;
1775 }
1776 case 8: {
1777 T t = 2*y100 - 17;
1778 return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t;
1779 }
1780 case 9: {
1781 T t = 2*y100 - 19;
1782 return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t;
1783 }
1784 case 10: {
1785 T t = 2*y100 - 21;
1786 return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t;
1787 }
1788 case 11: {
1789 T t = 2*y100 - 23;
1790 return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t;
1791 }
1792 case 12: {
1793 T t = 2*y100 - 25;
1794 return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t;
1795 }
1796 case 13: {
1797 T t = 2*y100 - 27;
1798 return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t;
1799 }
1800 case 14: {
1801 T t = 2*y100 - 29;
1802 return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t;
1803 }
1804 case 15: {
1805 T t = 2*y100 - 31;
1806 return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t;
1807 }
1808 case 16: {
1809 T t = 2*y100 - 33;
1810 return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t;
1811 }
1812 case 17: {
1813 T t = 2*y100 - 35;
1814 return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t;
1815 }
1816 case 18: {
1817 T t = 2*y100 - 37;
1818 return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t;
1819 }
1820 case 19: {
1821 T t = 2*y100 - 39;
1822 return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t;
1823 }
1824 case 20: {
1825 T t = 2*y100 - 41;
1826 return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t;
1827 }
1828 case 21: {
1829 T t = 2*y100 - 43;
1830 return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t;
1831 }
1832 case 22: {
1833 T t = 2*y100 - 45;
1834 return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t;
1835 }
1836 case 23: {
1837 T t = 2*y100 - 47;
1838 return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t;
1839 }
1840 case 24: {
1841 T t = 2*y100 - 49;
1842 return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t;
1843 }
1844 case 25: {
1845 T t = 2*y100 - 51;
1846 return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t;
1847 }
1848 case 26: {
1849 T t = 2*y100 - 53;
1850 return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t;
1851 }
1852 case 27: {
1853 T t = 2*y100 - 55;
1854 return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t;
1855 }
1856 case 28: {
1857 T t = 2*y100 - 57;
1858 return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t;
1859 }
1860 case 29: {
1861 T t = 2*y100 - 59;
1862 return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t;
1863 }
1864 case 30: {
1865 T t = 2*y100 - 61;
1866 return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t;
1867 }
1868 case 31: {
1869 T t = 2*y100 - 63;
1870 return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t;
1871 }
1872 case 32: {
1873 T t = 2*y100 - 65;
1874 return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t;
1875 }
1876 case 33: {
1877 T t = 2*y100 - 67;
1878 return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t;
1879 }
1880 case 34: {
1881 T t = 2*y100 - 69;
1882 return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t;
1883 }
1884 case 35: {
1885 T t = 2*y100 - 71;
1886 return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t;
1887 }
1888 case 36: {
1889 T t = 2*y100 - 73;
1890 return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t;
1891 }
1892 case 37: {
1893 T t = 2*y100 - 75;
1894 return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t;
1895 }
1896 case 38: {
1897 T t = 2*y100 - 77;
1898 return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t;
1899 }
1900 case 39: {
1901 T t = 2*y100 - 79;
1902 return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t;
1903 }
1904 case 40: {
1905 T t = 2*y100 - 81;
1906 return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t;
1907 }
1908 case 41: {
1909 T t = 2*y100 - 83;
1910 return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t;
1911 }
1912 case 42: {
1913 T t = 2*y100 - 85;
1914 return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t;
1915 }
1916 case 43: {
1917 T t = 2*y100 - 87;
1918 return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t;
1919 }
1920 case 44: {
1921 T t = 2*y100 - 89;
1922 return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t;
1923 }
1924 case 45: {
1925 T t = 2*y100 - 91;
1926 return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t;
1927 }
1928 case 46: {
1929 T t = 2*y100 - 93;
1930 return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t;
1931 }
1932 case 47: {
1933 T t = 2*y100 - 95;
1934 return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t;
1935 }
1936 case 48: {
1937 T t = 2*y100 - 97;
1938 return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t;
1939 }
1940 case 49: {
1941 T t = 2*y100 - 99;
1942 return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t;
1943 }
1944 case 50: {
1945 T t = 2*y100 - 101;
1946 return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t;
1947 }
1948 case 51: {
1949 T t = 2*y100 - 103;
1950 return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t;
1951 }
1952 case 52: {
1953 T t = 2*y100 - 105;
1954 return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t;
1955 }
1956 case 53: {
1957 T t = 2*y100 - 107;
1958 return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t;
1959 }
1960 case 54: {
1961 T t = 2*y100 - 109;
1962 return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t;
1963 }
1964 case 55: {
1965 T t = 2*y100 - 111;
1966 return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t;
1967 }
1968 case 56: {
1969 T t = 2*y100 - 113;
1970 return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t;
1971 }
1972 case 57: {
1973 T t = 2*y100 - 115;
1974 return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t;
1975 }
1976 case 58: {
1977 T t = 2*y100 - 117;
1978 return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t;
1979 }
1980 case 59: {
1981 T t = 2*y100 - 119;
1982 return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t;
1983 }
1984 case 60: {
1985 T t = 2*y100 - 121;
1986 return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t;
1987 }
1988 case 61: {
1989 T t = 2*y100 - 123;
1990 return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t;
1991 }
1992 case 62: {
1993 T t = 2*y100 - 125;
1994 return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t;
1995 }
1996 case 63: {
1997 T t = 2*y100 - 127;
1998 return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t;
1999 }
2000 case 64: {
2001 T t = 2*y100 - 129;
2002 return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t;
2003 }
2004 case 65: {
2005 T t = 2*y100 - 131;
2006 return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t;
2007 }
2008 case 66: {
2009 T t = 2*y100 - 133;
2010 return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t;
2011 }
2012 case 67: {
2013 T t = 2*y100 - 135;
2014 return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t;
2015 }
2016 case 68: {
2017 T t = 2*y100 - 137;
2018 return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t;
2019 }
2020 case 69: {
2021 T t = 2*y100 - 139;
2022 return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t;
2023 }
2024 case 70: {
2025 T t = 2*y100 - 141;
2026 return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t;
2027 }
2028 case 71: {
2029 T t = 2*y100 - 143;
2030 return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t;
2031 }
2032 case 72: {
2033 T t = 2*y100 - 145;
2034 return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t;
2035 }
2036 case 73: {
2037 T t = 2*y100 - 147;
2038 return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t;
2039 }
2040 case 74: {
2041 T t = 2*y100 - 149;
2042 return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t;
2043 }
2044 case 75: {
2045 T t = 2*y100 - 151;
2046 return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t;
2047 }
2048 case 76: {
2049 T t = 2*y100 - 153;
2050 return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t;
2051 }
2052 case 77: {
2053 T t = 2*y100 - 155;
2054 return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t;
2055 }
2056 case 78: {
2057 T t = 2*y100 - 157;
2058 return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t;
2059 }
2060 case 79: {
2061 T t = 2*y100 - 159;
2062 return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t;
2063 }
2064 case 80: {
2065 T t = 2*y100 - 161;
2066 return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t;
2067 }
2068 case 81: {
2069 T t = 2*y100 - 163;
2070 return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t;
2071 }
2072 case 82: {
2073 T t = 2*y100 - 165;
2074 return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t;
2075 }
2076 case 83: {
2077 T t = 2*y100 - 167;
2078 return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t;
2079 }
2080 case 84: {
2081 T t = 2*y100 - 169;
2082 return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t;
2083 }
2084 case 85: {
2085 T t = 2*y100 - 171;
2086 return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t;
2087 }
2088 case 86: {
2089 T t = 2*y100 - 173;
2090 return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t;
2091 }
2092 case 87: {
2093 T t = 2*y100 - 175;
2094 return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t;
2095 }
2096 case 88: {
2097 T t = 2*y100 - 177;
2098 return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t;
2099 }
2100 case 89: {
2101 T t = 2*y100 - 179;
2102 return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t;
2103 }
2104 case 90: {
2105 T t = 2*y100 - 181;
2106 return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t;
2107 }
2108 case 91: {
2109 T t = 2*y100 - 183;
2110 return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t;
2111 }
2112 case 92: {
2113 T t = 2*y100 - 185;
2114 return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t;
2115 }
2116 case 93: {
2117 T t = 2*y100 - 187;
2118 return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t;
2119 }
2120 case 94: {
2121 T t = 2*y100 - 189;
2122 return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t;
2123 }
2124 case 95: {
2125 T t = 2*y100 - 191;
2126 return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t;
2127 }
2128 case 96: {
2129 T t = 2*y100 - 193;
2130 return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t;
2131 }
2132 case 97: {
2133 T t = 2*y100 - 195;
2134 return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t;
2135 }
2136 case 98: {
2137 T t = 2*y100 - 197;
2138 return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t;
2139 }
2140 case 99: {
2141 T t = 2*y100 - 199;
2142 return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t;
2143 }
2144   }
2145   // we only get here if y = 1, i.e. |x| < 4*eps, in which case
2146   // erfcx is within 1e-15 of 1..
2147   return 1.0;
2148 }
2149 
2150 template <typename T>
2151 C10_HOST_DEVICE inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
calc_erfcx(T x)2152 calc_erfcx(T x)
2153 {
2154   if (at::_isnan(x)) {
2155     return x;
2156   }
2157 
2158   if (x >= 0) {
2159     if (x > 50) { // continued-fraction expansion is faster
2160       const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
2161       if (x > 5e7) { // 1-term expansion, important to avoid overflow
2162         return ispi / x;
2163       }
2164       /* 5-term expansion (rely on compiler for CSE), simplified from:
2165                 ispi / (x+0.5/(x+1/(x+1.5/(x+2/x))))  */
2166       return ispi*((x*x) * (x*x+4.5) + 2) / (x * ((x*x) * (x*x+5) + 3.75));
2167     }
2168     return erfcx_y100(400/(4+x));
2169   }
2170   else {
2171     if (x < -26.7) {
2172       return std::numeric_limits<T>::infinity();
2173     }
2174     else if (x < -6.1) {
2175       return 2*exp(x*x);
2176     }
2177     else {
2178       return 2*exp(x*x) - erfcx_y100(400/(4-x));
2179     }
2180   }
2181 }
2182 
2183 /*
2184  * Logarithm of Gaussian cumulative distribution function.
2185 
2186  * This implementation of log_ndtr and its helper functions
2187  * follow SciPy's implementation
2188  * See NOTICE for the licenses.
2189  */
2190 template <typename T>
calc_log_ndtr(T x)2191 inline C10_HOST_DEVICE T calc_log_ndtr(T x) {
2192   T t = x * c10::frac_sqrt_2<T>;
2193   if (x < T{-1.0}) {
2194     return std::log(calc_erfcx(-t) / 2) - t * t;
2195   } else {
2196     return std::log1p(-std::erfc(t) / 2);
2197   }
2198 }
2199 
2200 template<typename T>
airy_ai_forward(T x)2201 inline C10_HOST_DEVICE T airy_ai_forward(T x) {
2202     static const T AN[] = {
2203             +3.46538101525629032477e-01,
2204             +1.20075952739645805542e+01,
2205             +7.62796053615234516538e+01,
2206             +1.68089224934630576269e+02,
2207             +1.59756391350164413639e+02,
2208             +7.05360906840444183113e+01,
2209             +1.40264691163389668864e+01,
2210             +9.99999999999999995305e-01,
2211     };
2212 
2213     static const T AD[] = {
2214             +5.67594532638770212846e-01,
2215             +1.47562562584847203173e+01,
2216             +8.45138970141474626562e+01,
2217             +1.77318088145400459522e+02,
2218             +1.64234692871529701831e+02,
2219             +7.14778400825575695274e+01,
2220             +1.40959135607834029598e+01,
2221             +1.00000000000000000470e+00,
2222     };
2223 
2224     static const T AFN[] = {
2225             -1.31696323418331795333e-01,
2226             -6.26456544431912369773e-01,
2227             -6.93158036036933542233e-01,
2228             -2.79779981545119124951e-01,
2229             -4.91900132609500318020e-02,
2230             -4.06265923594885404393e-03,
2231             -1.59276496239262096340e-04,
2232             -2.77649108155232920844e-06,
2233             -1.67787698489114633780e-08,
2234     };
2235 
2236     static const T AFD[] = {
2237             +1.33560420706553243746e+01,
2238             +3.26825032795224613948e+01,
2239             +2.67367040941499554804e+01,
2240             +9.18707402907259625840e+00,
2241             +1.47529146771666414581e+00,
2242             +1.15687173795188044134e-01,
2243             +4.40291641615211203805e-03,
2244             +7.54720348287414296618e-05,
2245             +4.51850092970580378464e-07,
2246     };
2247 
2248     static const T AGN[] = {
2249             +1.97339932091685679179e-02,
2250             +3.91103029615688277255e-01,
2251             +1.06579897599595591108e+00,
2252             +9.39169229816650230044e-01,
2253             +3.51465656105547619242e-01,
2254             +6.33888919628925490927e-02,
2255             +5.85804113048388458567e-03,
2256             +2.82851600836737019778e-04,
2257             +6.98793669997260967291e-06,
2258             +8.11789239554389293311e-08,
2259             +3.41551784765923618484e-10,
2260     };
2261 
2262     static const T AGD[] = {
2263             +9.30892908077441974853e+00,
2264             +1.98352928718312140417e+01,
2265             +1.55646628932864612953e+01,
2266             +5.47686069422975497931e+00,
2267             +9.54293611618961883998e-01,
2268             +8.64580826352392193095e-02,
2269             +4.12656523824222607191e-03,
2270             +1.01259085116509135510e-04,
2271             +1.17166733214413521882e-06,
2272             +4.91834570062930015649e-09,
2273     };
2274 
2275     int domain_flag = 0;
2276 
2277     T ai;
2278 
2279     if (std::isinf(x)) {
2280         return std::numeric_limits<T>::quiet_NaN();
2281     }
2282 
2283     if (x > T(103.892)) {
2284         return T(0.0);
2285     }
2286 
2287     T f;
2288     T g;
2289     T k;
2290 
2291     if (x < T(-2.09)) {
2292         T z = T(1.0) / (T(-2.0) * x * std::sqrt(-x) / T(3.0));
2293 
2294         T afn = 0.0;
2295 
2296         for (uint8_t index = 0; index <= 8; index++) {
2297             afn = afn * (z * z) + AFN[index];
2298         }
2299 
2300         T afd = 0.0;
2301 
2302         for (uint8_t index = 0; index <= 8; index++) {
2303             afd = afd * (z * z) + AFD[index];
2304         }
2305 
2306         T agn = 0.0;
2307 
2308         for (uint8_t index = 0; index <= 10 + 0; index++) {
2309             agn = agn * (z * z) + AGN[index];
2310         }
2311 
2312         T agd = 0.0;
2313 
2314         for (uint8_t index = 0; index <= 10 - 1; index++) {
2315             agd = agd * (z * z) + AGD[index];
2316         }
2317 
2318         T t = T(-2.0) * x * std::sqrt(-x) / T(3.0) + T(0.25) * c10::pi<T>;
2319 
2320         return T(5.64189583547756286948e-01) / std::sqrt(std::sqrt(-x)) * (std::sin(t) * (T(1.0) + z * z * afn / afd) - std::cos(t) * (z * agn / agd));
2321     }
2322 
2323     if (x >= T(2.09)) {
2324         domain_flag = 5;
2325 
2326         T zeta = T(2.0) * x * std::sqrt(x) / T(3.0);
2327 
2328         T an = 0.0;
2329 
2330         for (uint8_t index = 0; index <= 7; index++) {
2331             an = an * (T(1.0) / zeta) + AN[index];
2332         }
2333 
2334         T ad = 0.0;
2335 
2336         for (uint8_t index = 0; index <= 7; index++) {
2337             ad = ad * (T(1.0) / zeta) + AD[index];
2338         }
2339 
2340         ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * std::sqrt(std::sqrt(x)) * std::exp(zeta));
2341 
2342         if (x > T(8.3203353)) {
2343             return ai;
2344         }
2345     }
2346 
2347     f = 1.0;
2348     g = x;
2349     k = 1.0;
2350 
2351     T m = 1.0;
2352     T n = x;
2353     T t = 1.0;
2354     T z = x * x * x;
2355 
2356     while (t > std::numeric_limits<T>::epsilon()) {
2357         m *= z;
2358         k += T(1.0);
2359         m /= k;
2360         n *= z;
2361         k += T(1.0);
2362         n /= k;
2363         m /= k;
2364         f += m;
2365         k += T(1.0);
2366         n /= k;
2367         g += n;
2368 
2369         t = std::abs(m / f);
2370     }
2371 
2372     if ((domain_flag & 1) == 0) {
2373         return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g;
2374     }
2375 
2376     return ai;
2377 } // T airy_ai(T x)
2378 
2379 template<typename T>
bessel_j0_forward(T x)2380 inline C10_HOST_DEVICE T bessel_j0_forward(T x) {
2381     static const T PP[] = {
2382             +7.96936729297347051624e-04,
2383             +8.28352392107440799803e-02,
2384             +1.23953371646414299388e+00,
2385             +5.44725003058768775090e+00,
2386             +8.74716500199817011941e+00,
2387             +5.30324038235394892183e+00,
2388             +9.99999999999999997821e-01,
2389     };
2390 
2391     static const T PQ[] = {
2392             +9.24408810558863637013e-04,
2393             +8.56288474354474431428e-02,
2394             +1.25352743901058953537e+00,
2395             +5.47097740330417105182e+00,
2396             +8.76190883237069594232e+00,
2397             +5.30605288235394617618e+00,
2398             +1.00000000000000000218e+00,
2399     };
2400 
2401     static const T QP[] = {
2402             -1.13663838898469149931e-02,
2403             -1.28252718670509318512e+00,
2404             -1.95539544257735972385e+01,
2405             -9.32060152123768231369e+01,
2406             -1.77681167980488050595e+02,
2407             -1.47077505154951170175e+02,
2408             -5.14105326766599330220e+01,
2409             -6.05014350600728481186e+00,
2410     };
2411 
2412     static const T QQ[] = {
2413             +6.43178256118178023184e+01,
2414             +8.56430025976980587198e+02,
2415             +3.88240183605401609683e+03,
2416             +7.24046774195652478189e+03,
2417             +5.93072701187316984827e+03,
2418             +2.06209331660327847417e+03,
2419             +2.42005740240291393179e+02,
2420     };
2421 
2422     static const T RP[] = {
2423             -4.79443220978201773821e+09,
2424             +1.95617491946556577543e+12,
2425             -2.49248344360967716204e+14,
2426             +9.70862251047306323952e+15,
2427     };
2428 
2429     static const T RQ[] = {
2430             +4.99563147152651017219e+02,
2431             +1.73785401676374683123e+05,
2432             +4.84409658339962045305e+07,
2433             +1.11855537045356834862e+10,
2434             +2.11277520115489217587e+12,
2435             +3.10518229857422583814e+14,
2436             +3.18121955943204943306e+16,
2437             +1.71086294081043136091e+18,
2438     };
2439 
2440     if (x < T(0)) {
2441         x = -x;
2442     }
2443 
2444     if (x <= T(5.0)) {
2445         if (x < T(0.00001)) {
2446             return T(1.0) - x * x / T(4.0);
2447         }
2448 
2449         T rp = 0.0;
2450 
2451         for (uint8_t index = 0; index <= 3; index++) {
2452             rp = rp * (x * x) + RP[index];
2453         }
2454 
2455         T rq = 0.0;
2456 
2457         for (uint8_t index = 0; index <= 7; index++) {
2458             rq = rq * (x * x) + RQ[index];
2459         }
2460 
2461         return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq;
2462     }
2463 
2464     T pp = 0.0;
2465 
2466     for (uint8_t index = 0; index <= 6; index++) {
2467         pp = pp * (T(25.0) / (x * x)) + PP[index];
2468     }
2469 
2470     T pq = 0.0;
2471 
2472     for (uint8_t index = 0; index <= 6; index++) {
2473         pq = pq * (T(25.0) / (x * x)) + PQ[index];
2474     }
2475 
2476     T qp = 0.0;
2477 
2478     for (uint8_t index = 0; index <= 7; index++) {
2479         qp = qp * (T(25.0) / (x * x)) + QP[index];
2480     }
2481 
2482     T qq = 0.0;
2483 
2484     for (uint8_t index = 0; index <= 6; index++) {
2485         qq = qq * (T(25.0) / (x * x)) + QQ[index];
2486     }
2487 
2488     return (pp / pq * std::cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * std::sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / std::sqrt(x);
2489 } // bessel_j0_forward(T x)
2490 
2491 template<typename T>
bessel_j1_forward(T x)2492 inline C10_HOST_DEVICE T bessel_j1_forward(T x) {
2493     static const T PP[] = {
2494             +7.62125616208173112003e-04,
2495             +7.31397056940917570436e-02,
2496             +1.12719608129684925192e+00,
2497             +5.11207951146807644818e+00,
2498             +8.42404590141772420927e+00,
2499             +5.21451598682361504063e+00,
2500             +1.00000000000000000254e+00,
2501     };
2502 
2503     static const T PQ[] = {
2504             +5.71323128072548699714e-04,
2505             +6.88455908754495404082e-02,
2506             +1.10514232634061696926e+00,
2507             +5.07386386128601488557e+00,
2508             +8.39985554327604159757e+00,
2509             +5.20982848682361821619e+00,
2510             +9.99999999999999997461e-01,
2511     };
2512 
2513     static const T QP[] = {
2514             +5.10862594750176621635e-02,
2515             +4.98213872951233449420e+00,
2516             +7.58238284132545283818e+01,
2517             +3.66779609360150777800e+02,
2518             +7.10856304998926107277e+02,
2519             +5.97489612400613639965e+02,
2520             +2.11688757100572135698e+02,
2521             +2.52070205858023719784e+01,
2522     };
2523 
2524     static const T QQ[] = {
2525             +7.42373277035675149943e+01,
2526             +1.05644886038262816351e+03,
2527             +4.98641058337653607651e+03,
2528             +9.56231892404756170795e+03,
2529             +7.99704160447350683650e+03,
2530             +2.82619278517639096600e+03,
2531             +3.36093607810698293419e+02,
2532     };
2533 
2534     static const T RP[] = {
2535             -8.99971225705559398224e+08,
2536             +4.52228297998194034323e+11,
2537             -7.27494245221818276015e+13,
2538             +3.68295732863852883286e+15,
2539     };
2540 
2541     static const T RQ[] = {
2542             +6.20836478118054335476e+02,
2543             +2.56987256757748830383e+05,
2544             +8.35146791431949253037e+07,
2545             +2.21511595479792499675e+10,
2546             +4.74914122079991414898e+12,
2547             +7.84369607876235854894e+14,
2548             +8.95222336184627338078e+16,
2549             +5.32278620332680085395e+18,
2550     };
2551 
2552     if (x < T(0.0)) {
2553         return -bessel_j1_forward(-x);
2554     }
2555 
2556     if (x <= T(5.0)) {
2557         T rp = 0.0;
2558 
2559         for (uint8_t index = 0; index <= 3; index++) {
2560             rp = rp * (x * x) + RP[index];
2561         }
2562 
2563         T rq = 0.0;
2564 
2565         for (uint8_t index = 0; index <= 7; index++) {
2566             rq = rq * (x * x) + RQ[index];
2567         }
2568 
2569         return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01));
2570     }
2571 
2572     T pp = 0.0;
2573 
2574     for (uint8_t index = 0; index <= 6; index++) {
2575         pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index];
2576     }
2577 
2578     T pq = 0.0;
2579 
2580     for (uint8_t index = 0; index <= 6; index++) {
2581         pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index];
2582     }
2583 
2584     T qp = 0.0;
2585 
2586     for (uint8_t index = 0; index <= 7; index++) {
2587         qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index];
2588     }
2589 
2590     T qq = 0.0;
2591 
2592     for (uint8_t index = 0; index <= 6; index++) {
2593         qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index];
2594     }
2595 
2596     return (pp / pq * std::cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * std::sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / std::sqrt(x);
2597 } // bessel_j1_forward(T x)
2598 
2599 template<typename T>
bessel_y0_forward(T x)2600 inline C10_HOST_DEVICE T bessel_y0_forward(T x) {
2601     static const T PP[] = {
2602             +7.96936729297347051624e-04,
2603             +8.28352392107440799803e-02,
2604             +1.23953371646414299388e+00,
2605             +5.44725003058768775090e+00,
2606             +8.74716500199817011941e+00,
2607             +5.30324038235394892183e+00,
2608             +9.99999999999999997821e-01,
2609     };
2610 
2611     static const T PQ[] = {
2612             +9.24408810558863637013e-04,
2613             +8.56288474354474431428e-02,
2614             +1.25352743901058953537e+00,
2615             +5.47097740330417105182e+00,
2616             +8.76190883237069594232e+00,
2617             +5.30605288235394617618e+00,
2618             +1.00000000000000000218e+00,
2619     };
2620 
2621     static const T QP[] = {
2622             -1.13663838898469149931e-02,
2623             -1.28252718670509318512e+00,
2624             -1.95539544257735972385e+01,
2625             -9.32060152123768231369e+01,
2626             -1.77681167980488050595e+02,
2627             -1.47077505154951170175e+02,
2628             -5.14105326766599330220e+01,
2629             -6.05014350600728481186e+00,
2630     };
2631 
2632     static const T QQ[] = {
2633             +6.43178256118178023184e+01,
2634             +8.56430025976980587198e+02,
2635             +3.88240183605401609683e+03,
2636             +7.24046774195652478189e+03,
2637             +5.93072701187316984827e+03,
2638             +2.06209331660327847417e+03,
2639             +2.42005740240291393179e+02,
2640     };
2641 
2642     static const T YP[] = {
2643             +1.55924367855235737965e+04,
2644             -1.46639295903971606143e+07,
2645             +5.43526477051876500413e+09,
2646             -9.82136065717911466409e+11,
2647             +8.75906394395366999549e+13,
2648             -3.46628303384729719441e+15,
2649             +4.42733268572569800351e+16,
2650             -1.84950800436986690637e+16,
2651     };
2652 
2653     static const T YQ[] = {
2654             +1.04128353664259848412e+03,
2655             +6.26107330137134956842e+05,
2656             +2.68919633393814121987e+08,
2657             +8.64002487103935000337e+10,
2658             +2.02979612750105546709e+13,
2659             +3.17157752842975028269e+15,
2660             +2.50596256172653059228e+17,
2661     };
2662 
2663     if (x <= T(5.0)) {
2664         if (x == T(0.0)) {
2665             return -std::numeric_limits<T>::infinity();
2666         }
2667 
2668         if (x < T(0.0)) {
2669             return std::numeric_limits<T>::quiet_NaN();
2670         }
2671 
2672         T yp = 0.0;
2673 
2674         for (uint8_t index = 0; index <= 7; index++) {
2675             yp = yp * (x * x) + YP[index];
2676         }
2677 
2678         T yq = 0.0;
2679 
2680         for (uint8_t index = 0; index <= 6; index++) {
2681             yq = yq * (x * x) + YQ[index];
2682         }
2683 
2684         return yp / yq + (T(0.636619772367581343075535053490057448) * std::log(x) * bessel_j0_forward(x));
2685     }
2686 
2687     T pp = 0.0;
2688 
2689     for (uint8_t index = 0; index <= 6; index++) {
2690         pp = pp * (T(25.0) / (x * x)) + PP[index];
2691     }
2692 
2693     T pq = 0.0;
2694 
2695     for (uint8_t index = 0; index <= 6; index++) {
2696         pq = pq * (T(25.0) / (x * x)) + PQ[index];
2697     }
2698 
2699     T qp = 0.0;
2700 
2701     for (uint8_t index = 0; index <= 7; index++) {
2702         qp = qp * (T(25.0) / (x * x)) + QP[index];
2703     }
2704 
2705     T qq = 0.0;
2706 
2707     for (uint8_t index = 0; index <= 6; index++) {
2708         qq = qq * (T(25.0) / (x * x)) + QQ[index];
2709     }
2710 
2711     return (pp / pq * std::sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * std::cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / std::sqrt(x);
2712 } // bessel_y0_forward(T x)
2713 
2714 template<typename T>
bessel_y1_forward(T x)2715 inline C10_HOST_DEVICE T bessel_y1_forward(T x) {
2716     static const T PP[] = {
2717             +7.62125616208173112003e-04,
2718             +7.31397056940917570436e-02,
2719             +1.12719608129684925192e+00,
2720             +5.11207951146807644818e+00,
2721             +8.42404590141772420927e+00,
2722             +5.21451598682361504063e+00,
2723             +1.00000000000000000254e+00,
2724     };
2725 
2726     static const T PQ[] = {
2727             +5.71323128072548699714e-04,
2728             +6.88455908754495404082e-02,
2729             +1.10514232634061696926e+00,
2730             +5.07386386128601488557e+00,
2731             +8.39985554327604159757e+00,
2732             +5.20982848682361821619e+00,
2733             +9.99999999999999997461e-01,
2734     };
2735 
2736     static const T QP[] = {
2737             +5.10862594750176621635e-02,
2738             +4.98213872951233449420e+00,
2739             +7.58238284132545283818e+01,
2740             +3.66779609360150777800e+02,
2741             +7.10856304998926107277e+02,
2742             +5.97489612400613639965e+02,
2743             +2.11688757100572135698e+02,
2744             +2.52070205858023719784e+01,
2745     };
2746 
2747     static const T QQ[] = {
2748             +7.42373277035675149943e+01,
2749             +1.05644886038262816351e+03,
2750             +4.98641058337653607651e+03,
2751             +9.56231892404756170795e+03,
2752             +7.99704160447350683650e+03,
2753             +2.82619278517639096600e+03,
2754             +3.36093607810698293419e+02,
2755     };
2756 
2757     static const T YP[] = {
2758             +1.26320474790178026440e+09,
2759             -6.47355876379160291031e+11,
2760             +1.14509511541823727583e+14,
2761             -8.12770255501325109621e+15,
2762             +2.02439475713594898196e+17,
2763             -7.78877196265950026825e+17,
2764     };
2765 
2766     static const T YQ[] = {
2767             +5.94301592346128195359e+02,
2768             +2.35564092943068577943e+05,
2769             +7.34811944459721705660e+07,
2770             +1.87601316108706159478e+10,
2771             +3.88231277496238566008e+12,
2772             +6.20557727146953693363e+14,
2773             +6.87141087355300489866e+16,
2774             +3.97270608116560655612e+18,
2775     };
2776 
2777     if (x <= T(5.0)) {
2778         if (x == T(0.0)) {
2779             return -std::numeric_limits<T>::infinity();
2780         }
2781 
2782         if (x <= T(0.0)) {
2783             return std::numeric_limits<T>::quiet_NaN();
2784         }
2785 
2786         T yp = 0.0;
2787 
2788         for (uint8_t index = 0; index <= 5; index++) {
2789             yp = yp * (x * x) + YP[index];
2790         }
2791 
2792         T yq = 0.0;
2793 
2794         for (uint8_t index = 0; index <= 7; index++) {
2795             yq = yq * (x * x) + YQ[index];
2796         }
2797 
2798         return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * std::log(x) - T(1.0) / x));
2799     }
2800 
2801     T pp = 0.0;
2802 
2803     for (uint8_t index = 0; index <= 6; index++) {
2804         pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index];
2805     }
2806 
2807     T pq = 0.0;
2808 
2809     for (uint8_t index = 0; index <= 6; index++) {
2810         pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index];
2811     }
2812 
2813     T qp = 0.0;
2814 
2815     for (uint8_t index = 0; index <= 7; index++) {
2816         qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index];
2817     }
2818 
2819     T qq = 0.0;
2820 
2821     for (uint8_t index = 0; index <= 6; index++) {
2822         qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index];
2823     }
2824 
2825     return (pp / pq * std::sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * std::cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / std::sqrt(x);
2826 } // bessel_y1_forward(T x)
2827 
2828 template<typename T>
chebyshev_polynomial_t_forward(T x,int64_t n)2829 inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, int64_t n) {
2830     if (n < 0) {
2831         return T(0.0);
2832     }
2833 
2834     if (std::abs(x) == T(1.0)) {
2835         if (x > T(0.0) || n % 2 == 0) {
2836             return T(1.0);
2837         }
2838 
2839         return T(-1.0);
2840     }
2841 
2842     if ((n > 6) && (std::abs(x) < T(1.0))) {
2843         return std::cos(n * std::acos(x));
2844     }
2845 
2846     if (n == 0) {
2847         return T(1.0);
2848     }
2849 
2850     if (n == 1) {
2851         return x;
2852     }
2853 
2854     T p = T(1.0);
2855     T q = x;
2856     T r;
2857 
2858     for (int64_t k = 2; k <= n; k++) {
2859         r = (x + x) * q - p;
2860         p = q;
2861         q = r;
2862     }
2863 
2864     return r;
2865 } // chebyshev_polynomial_t_forward(T x, int64_t n)
2866 
2867 template<typename T, bool is_cuda=false>
chebyshev_polynomial_t_forward(T x,T n)2868 inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, T n) {
2869     return chebyshev_polynomial_t_forward(x, static_cast<int64_t>(n));
2870 } // chebyshev_polynomial_t_forward(T x, T n)
2871 
2872 template<typename T>
chebyshev_polynomial_u_forward(T x,int64_t n)2873 inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, int64_t n) {
2874     if (n < 0) {
2875         return T(0.0);
2876     }
2877 
2878     if (std::abs(x) == T(1.0)) {
2879         if (x > T(0.0) || n % 2 == 0) {
2880             return n + 1;
2881         }
2882 
2883         return -(n + 1);
2884     }
2885 
2886     if ((n > 8) && (std::abs(x) < T(1.0))) {
2887         if (std::sin(std::acos(x)) != T(0.0)) {
2888             return std::sin((n + 1) * std::acos(x)) / std::sin(std::acos(x));
2889         }
2890 
2891         return (n + 1) * std::cos((n + 1) * std::acos(x)) / x;
2892     }
2893 
2894     if (n == 0) {
2895         return T(1.0);
2896     }
2897 
2898     if (n == 1) {
2899         return x + x;
2900     }
2901 
2902     T p = T(1.0);
2903     T q = x + x;
2904     T r;
2905 
2906     for (int64_t k = 2; k <= n; k++) {
2907         r = (x + x) * q - p;
2908         p = q;
2909         q = r;
2910     }
2911 
2912     return r;
2913 } // chebyshev_polynomial_u_forward(T x, int64_t n)
2914 
2915 template<typename T, bool is_cuda=false>
chebyshev_polynomial_u_forward(T x,T n)2916 inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, T n) {
2917     return chebyshev_polynomial_u_forward(x, static_cast<int64_t>(n));
2918 } // chebyshev_polynomial_u_forward(T x, T n)
2919 
2920 template<typename T>
chebyshev_polynomial_v_forward(T x,int64_t n)2921 inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, int64_t n) {
2922     if (n < 0) {
2923         return T(0.0);
2924     }
2925 
2926     if (std::abs(x) == T(1.0)) {
2927         if (x > T(0.0)) {
2928             return T(1.0);
2929         }
2930 
2931         if (n % 2 == 0) {
2932             return n + n + 1;
2933         }
2934 
2935         return -(n + n + 1);
2936     }
2937 
2938     if ((n > 8) && (std::abs(x) < T(1.0))) {
2939         if (std::sin(std::acos(x) / T(2.0)) != T(1.0)) {
2940             return std::cos((n + T(0.5)) * std::acos(x)) / std::cos(std::acos(x) / T(2.0));
2941         }
2942 
2943         if (n % 2 == 0) {
2944             return n + n + 1;
2945         }
2946 
2947         return -(n + n + 1);
2948     }
2949 
2950     if (n == 0) {
2951         return T(1.0);
2952     }
2953 
2954     if (n == 1) {
2955         return x + x - T(1.0);
2956     }
2957 
2958     T p = T(1.0);
2959     T q = x + x - T(1.0);
2960     T r;
2961 
2962     for (int64_t k = 2; k <= n; k++) {
2963         r = (x + x) * q - p;
2964         p = q;
2965         q = r;
2966     }
2967 
2968     return r;
2969 } // chebyshev_polynomial_v_forward(T x, int64_t n)
2970 
2971 template<typename T, bool is_cuda=false>
chebyshev_polynomial_v_forward(T x,T n)2972 inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, T n) {
2973     return chebyshev_polynomial_v_forward(x, static_cast<int64_t>(n));
2974 } // chebyshev_polynomial_v_forward(T x, T n)
2975 
2976 template<typename T>
chebyshev_polynomial_w_forward(T x,int64_t n)2977 inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, int64_t n) {
2978     if (n < 0) {
2979         return T(0.0);
2980     }
2981 
2982     if (std::abs(x) == T(1.0)) {
2983         if (x > T(0.0)) {
2984             return n + n + 1;
2985         }
2986 
2987         if (n % 2 == 0) {
2988             return T(1.0);
2989         }
2990 
2991         return T(-1.0);
2992     }
2993 
2994     if ((n > 8) && (std::abs(x) < T(1.0))) {
2995         if (std::cos(std::acos(x) / T(2.0)) != T(1.0)) {
2996             return std::sin((n + T(0.5)) * std::acos(x)) / std::sin(std::acos(x) / T(2.0));
2997         }
2998 
2999         if (x > T(0.0)) {
3000             return n + n + 1;
3001         }
3002 
3003         if (n % 2 == 0) {
3004             return T(1.0);
3005         }
3006 
3007         return T(-1.0);
3008     }
3009 
3010     if (n == 0) {
3011         return T(1.0);
3012     }
3013 
3014     if (n == 1) {
3015         return x + x + T(1.0);
3016     }
3017 
3018     T p = T(1.0);
3019     T q = x + x + T(1.0);
3020     T r;
3021 
3022     for (int64_t k = 2; k <= n; k++) {
3023         r = (x + x) * q - p;
3024         p = q;
3025         q = r;
3026     }
3027 
3028     return r;
3029 } // chebyshev_polynomial_w_forward(T x, int64_t n)
3030 
3031 template<typename T, bool is_cuda=false>
chebyshev_polynomial_w_forward(T x,T n)3032 inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, T n) {
3033     return chebyshev_polynomial_w_forward(x, static_cast<int64_t>(n));
3034 } // chebyshev_polynomial_w_forward(T x, T n)
3035 
3036 template<typename T>
hermite_polynomial_h_forward(T x,int64_t n)3037 inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, int64_t n) {
3038     if (n < 0) {
3039         return T(0.0);
3040     }
3041 
3042     if (n == 0) {
3043         return T(1.0);
3044     }
3045 
3046     if (n == 1) {
3047         return x + x;
3048     }
3049 
3050     T p = T(1.0);
3051     T q = x + x;
3052     T r = T(0.0);
3053 
3054     for (int64_t k = 2; k < n + n; k += 2) {
3055         r = (x + x) * q - k * p;
3056         p = q;
3057         q = r;
3058     }
3059 
3060     return r;
3061 } // hermite_polynomial_h_forward(T x, int64_t n)
3062 
3063 template<typename T, bool is_cuda=false, std::enable_if_t<!std::is_floating_point<T>::value, int> = 0>
hermite_polynomial_h_forward(T x,T n)3064 inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, T n) {
3065     return hermite_polynomial_h_forward(x, static_cast<int64_t>(n));
3066 } // hermite_polynomial_h_forward(T x, T n)
3067 
3068 template<typename T, bool is_cuda=false, std::enable_if_t<std::is_floating_point<T>::value, int> = 0>
hermite_polynomial_h_forward(T x,T n)3069 inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, T n) {
3070     return hermite_polynomial_h_forward(x, ((!std::isinf(n)) && (!std::isnan(n))) ? static_cast<int64_t>(n) : static_cast<int64_t>(-1));
3071 } // hermite_polynomial_h_forward(T x, T n)
3072 
3073 template<typename T>
hermite_polynomial_he_forward(T x,int64_t n)3074 inline C10_HOST_DEVICE T hermite_polynomial_he_forward(T x, int64_t n) {
3075     if (n < 0) {
3076         return T(0.0);
3077     }
3078 
3079     if (n == 0) {
3080         return T(1.0);
3081     }
3082 
3083     if (n == 1) {
3084         return x;
3085     }
3086 
3087     T p = T(1.0);
3088     T q = x;
3089     T r;
3090 
3091     for (int64_t k = 1; k < n; k++) {
3092         r = x * q - k * p;
3093         p = q;
3094         q = r;
3095     }
3096 
3097     return r;
3098 } // hermite_polynomial_he_forward(T x, int64_t n)
3099 
3100 template<typename T, bool is_cuda=false>
hermite_polynomial_he_forward(T x,T n)3101 inline C10_HOST_DEVICE T hermite_polynomial_he_forward(T x, T n) {
3102     return hermite_polynomial_he_forward(x, static_cast<int64_t>(n));
3103 } // hermite_polynomial_he_forward(T x, T n)
3104 
3105 template<typename T>
laguerre_polynomial_l_forward(T x,int64_t n)3106 inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, int64_t n) {
3107     if (n < 0) {
3108         return T(0.0);
3109     }
3110 
3111     if (std::abs(x) == T(0.0)) {
3112         return T(1.0);
3113     }
3114 
3115     if (n == 0) {
3116         return T(1.0);
3117     }
3118 
3119     if (n == 1) {
3120         return T(1.0) - x;
3121     }
3122 
3123     T p = T(1.0);
3124     T q = T(1.0) - x;
3125     T r;
3126 
3127     for (int64_t k = 1; k < n; k++) {
3128         r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1);
3129         p = q;
3130         q = r;
3131     }
3132 
3133     return r;
3134 } // laguerre_polynomial_l_forward(T x, int64_t n)
3135 
3136 template<typename T, bool is_cuda=false>
laguerre_polynomial_l_forward(T x,T n)3137 inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, T n) {
3138     return laguerre_polynomial_l_forward(x, static_cast<int64_t>(n));
3139 } // laguerre_polynomial_l_forward(T x, T n)
3140 
3141 template<typename T>
legendre_polynomial_p_forward(T x,int64_t n)3142 inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, int64_t n) {
3143     if (n < 0) {
3144         return T(0.0);
3145     }
3146 
3147     if (std::abs(x) == T(1.0)) {
3148         if (x > T(0.0) || n % 2 == 0) {
3149             return T(1.0);
3150         }
3151 
3152         return T(-1.0);
3153     }
3154 
3155     if (n == 0) {
3156         return T(1.0);
3157     }
3158 
3159     if (n == 1) {
3160         return x;
3161     }
3162 
3163     T p = T(1.0);
3164     T q = x;
3165     T r;
3166 
3167     for (int64_t k = 1; k < n; k++) {
3168         r = ((k + k + 1) * x * q - k * p) / (k + 1);
3169         p = q;
3170         q = r;
3171     }
3172 
3173     return r;
3174 } // legendre_polynomial_p_forward(T x, int64_t n)
3175 
3176 template<typename T, bool is_cuda=false>
legendre_polynomial_p_forward(T x,T n)3177 inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, T n) {
3178     return legendre_polynomial_p_forward(x, static_cast<int64_t>(n));
3179 } // legendre_polynomial_p_forward(T x, T n)
3180 
3181 template<typename T>
modified_bessel_i0_forward(T x)3182 inline C10_HOST_DEVICE T modified_bessel_i0_forward(T x) {
3183     static const T A[] = {
3184             -4.41534164647933937950e-18,
3185             +3.33079451882223809783e-17,
3186             -2.43127984654795469359e-16,
3187             +1.71539128555513303061e-15,
3188             -1.16853328779934516808e-14,
3189             +7.67618549860493561688e-14,
3190             -4.85644678311192946090e-13,
3191             +2.95505266312963983461e-12,
3192             -1.72682629144155570723e-11,
3193             +9.67580903537323691224e-11,
3194             -5.18979560163526290666e-10,
3195             +2.65982372468238665035e-09,
3196             -1.30002500998624804212e-08,
3197             +6.04699502254191894932e-08,
3198             -2.67079385394061173391e-07,
3199             +1.11738753912010371815e-06,
3200             -4.41673835845875056359e-06,
3201             +1.64484480707288970893e-05,
3202             -5.75419501008210370398e-05,
3203             +1.88502885095841655729e-04,
3204             -5.76375574538582365885e-04,
3205             +1.63947561694133579842e-03,
3206             -4.32430999505057594430e-03,
3207             +1.05464603945949983183e-02,
3208             -2.37374148058994688156e-02,
3209             +4.93052842396707084878e-02,
3210             -9.49010970480476444210e-02,
3211             +1.71620901522208775349e-01,
3212             -3.04682672343198398683e-01,
3213             +6.76795274409476084995e-01,
3214     };
3215 
3216     static const T B[] = {
3217             -7.23318048787475395456e-18,
3218             -4.83050448594418207126e-18,
3219             +4.46562142029675999901e-17,
3220             +3.46122286769746109310e-17,
3221             -2.82762398051658348494e-16,
3222             -3.42548561967721913462e-16,
3223             +1.77256013305652638360e-15,
3224             +3.81168066935262242075e-15,
3225             -9.55484669882830764870e-15,
3226             -4.15056934728722208663e-14,
3227             +1.54008621752140982691e-14,
3228             +3.85277838274214270114e-13,
3229             +7.18012445138366623367e-13,
3230             -1.79417853150680611778e-12,
3231             -1.32158118404477131188e-11,
3232             -3.14991652796324136454e-11,
3233             +1.18891471078464383424e-11,
3234             +4.94060238822496958910e-10,
3235             +3.39623202570838634515e-09,
3236             +2.26666899049817806459e-08,
3237             +2.04891858946906374183e-07,
3238             +2.89137052083475648297e-06,
3239             +6.88975834691682398426e-05,
3240             +3.36911647825569408990e-03,
3241             +8.04490411014108831608e-01,
3242     };
3243 
3244     T p;
3245     T q = 0.0;
3246 
3247     if (std::abs(x) <= T(8.0)) {
3248         T a = A[0];
3249 
3250         for (uint8_t index = 1; index < 30; index++) {
3251             p = q;
3252             q = a;
3253             a = ((std::abs(x) / T(2.0)) - T(2.0)) * q - p + A[index];
3254         }
3255 
3256         return std::exp(std::abs(x)) * (T(0.5) * (a - p));
3257     }
3258 
3259     T b = B[0];
3260 
3261     for (uint8_t index = 1; index < 25; index++) {
3262         p = q;
3263         q = b;
3264         b = (T(32.0) / std::abs(x) - T(2.0)) * q - p + B[index];
3265     }
3266 
3267     return std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x));
3268 } // modified_bessel_i0_forward(T x)
3269 
3270 template<typename T>
modified_bessel_i1_forward(T x)3271 inline C10_HOST_DEVICE T modified_bessel_i1_forward(T x) {
3272     static const T A[] = {
3273             +2.77791411276104639959e-18,
3274             -2.11142121435816608115e-17,
3275             +1.55363195773620046921e-16,
3276             -1.10559694773538630805e-15,
3277             +7.60068429473540693410e-15,
3278             -5.04218550472791168711e-14,
3279             +3.22379336594557470981e-13,
3280             -1.98397439776494371520e-12,
3281             +1.17361862988909016308e-11,
3282             -6.66348972350202774223e-11,
3283             +3.62559028155211703701e-10,
3284             -1.88724975172282928790e-09,
3285             +9.38153738649577178388e-09,
3286             -4.44505912879632808065e-08,
3287             +2.00329475355213526229e-07,
3288             -8.56872026469545474066e-07,
3289             +3.47025130813767847674e-06,
3290             -1.32731636560394358279e-05,
3291             +4.78156510755005422638e-05,
3292             -1.61760815825896745588e-04,
3293             +5.12285956168575772895e-04,
3294             -1.51357245063125314899e-03,
3295             +4.15642294431288815669e-03,
3296             -1.05640848946261981558e-02,
3297             +2.47264490306265168283e-02,
3298             -5.29459812080949914269e-02,
3299             +1.02643658689847095384e-01,
3300             -1.76416518357834055153e-01,
3301             +2.52587186443633654823e-01,
3302     };
3303 
3304     static const T B[] = {
3305             +7.51729631084210481353e-18,
3306             +4.41434832307170791151e-18,
3307             -4.65030536848935832153e-17,
3308             -3.20952592199342395980e-17,
3309             +2.96262899764595013876e-16,
3310             +3.30820231092092828324e-16,
3311             -1.88035477551078244854e-15,
3312             -3.81440307243700780478e-15,
3313             +1.04202769841288027642e-14,
3314             +4.27244001671195135429e-14,
3315             -2.10154184277266431302e-14,
3316             -4.08355111109219731823e-13,
3317             -7.19855177624590851209e-13,
3318             +2.03562854414708950722e-12,
3319             +1.41258074366137813316e-11,
3320             +3.25260358301548823856e-11,
3321             -1.89749581235054123450e-11,
3322             -5.58974346219658380687e-10,
3323             -3.83538038596423702205e-09,
3324             -2.63146884688951950684e-08,
3325             -2.51223623787020892529e-07,
3326             -3.88256480887769039346e-06,
3327             -1.10588938762623716291e-04,
3328             -9.76109749136146840777e-03,
3329             +7.78576235018280120474e-01,
3330     };
3331 
3332     T p;
3333     T q = 0.0;
3334 
3335     if (std::abs(x) <= T(8.0)) {
3336         T a = A[0];
3337 
3338         for (uint8_t index = 1; index < 29; index++) {
3339             p = q;
3340             q = a;
3341             a = ((std::abs(x) / T(2.0)) - T(2.0)) * q - p + A[index];
3342         }
3343 
3344         if (x < T(0.0)) {
3345             return -(T(0.5) * (a - p) * std::abs(x) * std::exp(std::abs(x)));
3346         }
3347 
3348         return T(0.5) * (a - p) * std::abs(x) * std::exp(std::abs(x));
3349     }
3350 
3351     T b = B[0];
3352 
3353     for (uint8_t index = 1; index < 25; index++) {
3354         p = q;
3355         q = b;
3356         b = (T(32.0) / std::abs(x) - T(2.0)) * q - p + B[index];
3357     }
3358 
3359     if (x < T(0.0)) {
3360         return -(std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x)));
3361     }
3362 
3363     return std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x));
3364 } // modified_bessel_i1_forward(T x)
3365 
3366 template<typename T>
modified_bessel_k0_forward(T x)3367 inline C10_HOST_DEVICE T modified_bessel_k0_forward(T x) {
3368     static const T A[] = {
3369             +1.37446543561352307156e-16,
3370             +4.25981614279661018399e-14,
3371             +1.03496952576338420167e-11,
3372             +1.90451637722020886025e-09,
3373             +2.53479107902614945675e-07,
3374             +2.28621210311945178607e-05,
3375             +1.26461541144692592338e-03,
3376             +3.59799365153615016266e-02,
3377             +3.44289899924628486886e-01,
3378             -5.35327393233902768720e-01,
3379     };
3380 
3381     static const T B[] = {
3382             +5.30043377268626276149e-18,
3383             -1.64758043015242134646e-17,
3384             +5.21039150503902756861e-17,
3385             -1.67823109680541210385e-16,
3386             +5.51205597852431940784e-16,
3387             -1.84859337734377901440e-15,
3388             +6.34007647740507060557e-15,
3389             -2.22751332699166985548e-14,
3390             +8.03289077536357521100e-14,
3391             -2.98009692317273043925e-13,
3392             +1.14034058820847496303e-12,
3393             -4.51459788337394416547e-12,
3394             +1.85594911495471785253e-11,
3395             -7.95748924447710747776e-11,
3396             +3.57739728140030116597e-10,
3397             -1.69753450938905987466e-09,
3398             +8.57403401741422608519e-09,
3399             -4.66048989768794782956e-08,
3400             +2.76681363944501510342e-07,
3401             -1.83175552271911948767e-06,
3402             +1.39498137188764993662e-05,
3403             -1.28495495816278026384e-04,
3404             +1.56988388573005337491e-03,
3405             -3.14481013119645005427e-02,
3406             +2.44030308206595545468e+00,
3407     };
3408 
3409     if (x == T(0.0)) {
3410         return std::numeric_limits<T>::infinity();
3411     }
3412 
3413     if (x < T(0.0)) {
3414         return std::numeric_limits<T>::quiet_NaN();
3415     }
3416 
3417     T p;
3418     T q = 0.0;
3419 
3420     if (x <= T(2.0)) {
3421         T a = A[0];
3422 
3423         for (uint8_t index = 1; index < 10; index++) {
3424             p = q;
3425             q = a;
3426             a = (x * x - T(2.0)) * q - p + A[index];
3427         }
3428 
3429         return T(0.5) * (a - p) - std::log(0.5 * x) * modified_bessel_i0_forward(x);
3430     }
3431 
3432     T b = B[0];
3433 
3434     for (uint8_t index = 1; index < 25; index++) {
3435         p = q;
3436         q = b;
3437         b = (T(8.0) / x - T(2.0)) * q - p + B[index];
3438     }
3439 
3440     return std::exp(-x) * (T(0.5) * (b - p)) / std::sqrt(x);
3441 } // modified_bessel_k0_forward(T x)
3442 
3443 template<typename T>
modified_bessel_k1_forward(T x)3444 inline C10_HOST_DEVICE T modified_bessel_k1_forward(T x) {
3445     static const T A[] = {
3446             -7.02386347938628759343e-18,
3447             -2.42744985051936593393e-15,
3448             -6.66690169419932900609e-13,
3449             -1.41148839263352776110e-10,
3450             -2.21338763073472585583e-08,
3451             -2.43340614156596823496e-06,
3452             -1.73028895751305206302e-04,
3453             -6.97572385963986435018e-03,
3454             -1.22611180822657148235e-01,
3455             -3.53155960776544875667e-01,
3456             +1.52530022733894777053e+00,
3457     };
3458 
3459     static const T B[] = {
3460             -5.75674448366501715755e-18,
3461             +1.79405087314755922667e-17,
3462             -5.68946255844285935196e-17,
3463             +1.83809354436663880070e-16,
3464             -6.05704724837331885336e-16,
3465             +2.03870316562433424052e-15,
3466             -7.01983709041831346144e-15,
3467             +2.47715442448130437068e-14,
3468             -8.97670518232499435011e-14,
3469             +3.34841966607842919884e-13,
3470             -1.28917396095102890680e-12,
3471             +5.13963967348173025100e-12,
3472             -2.12996783842756842877e-11,
3473             +9.21831518760500529508e-11,
3474             -4.19035475934189648750e-10,
3475             +2.01504975519703286596e-09,
3476             -1.03457624656780970260e-08,
3477             +5.74108412545004946722e-08,
3478             -3.50196060308781257119e-07,
3479             +2.40648494783721712015e-06,
3480             -1.93619797416608296024e-05,
3481             +1.95215518471351631108e-04,
3482             -2.85781685962277938680e-03,
3483             +1.03923736576817238437e-01,
3484             +2.72062619048444266945e+00,
3485     };
3486 
3487     if (x == T(0.0)) {
3488         return std::numeric_limits<T>::infinity();
3489     }
3490 
3491     if (x < T(0.0)) {
3492         return std::numeric_limits<T>::quiet_NaN();
3493     }
3494 
3495     T p;
3496     T q = 0.0;
3497 
3498     if (x <= T(2.0)) {
3499         T a = A[0];
3500 
3501         for (uint8_t index = 1; index < 11; index++) {
3502             p = q;
3503             q = a;
3504             a = (x * x - T(2.0)) * q - p + A[index];
3505         }
3506 
3507         return std::log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x;
3508     }
3509 
3510     T b = B[0];
3511 
3512     for (uint8_t index = 1; index < 25; index++) {
3513         p = q;
3514         q = b;
3515         b = (T(8.0) / x - T(2.0)) * q - p + B[index];
3516     }
3517 
3518     return std::exp(-x) * (T(0.5) * (b - p)) / std::sqrt(x);
3519 } // modified_bessel_k1_forward(T x)
3520 
3521 template<typename T>
scaled_modified_bessel_k0_forward(T x)3522 inline C10_HOST_DEVICE T scaled_modified_bessel_k0_forward(T x) {
3523     static const T A[] = {
3524             +1.37446543561352307156e-16,
3525             +4.25981614279661018399e-14,
3526             +1.03496952576338420167e-11,
3527             +1.90451637722020886025e-09,
3528             +2.53479107902614945675e-07,
3529             +2.28621210311945178607e-05,
3530             +1.26461541144692592338e-03,
3531             +3.59799365153615016266e-02,
3532             +3.44289899924628486886e-01,
3533             -5.35327393233902768720e-01,
3534     };
3535 
3536     static const T B[] = {
3537             +5.30043377268626276149e-18,
3538             -1.64758043015242134646e-17,
3539             +5.21039150503902756861e-17,
3540             -1.67823109680541210385e-16,
3541             +5.51205597852431940784e-16,
3542             -1.84859337734377901440e-15,
3543             +6.34007647740507060557e-15,
3544             -2.22751332699166985548e-14,
3545             +8.03289077536357521100e-14,
3546             -2.98009692317273043925e-13,
3547             +1.14034058820847496303e-12,
3548             -4.51459788337394416547e-12,
3549             +1.85594911495471785253e-11,
3550             -7.95748924447710747776e-11,
3551             +3.57739728140030116597e-10,
3552             -1.69753450938905987466e-09,
3553             +8.57403401741422608519e-09,
3554             -4.66048989768794782956e-08,
3555             +2.76681363944501510342e-07,
3556             -1.83175552271911948767e-06,
3557             +1.39498137188764993662e-05,
3558             -1.28495495816278026384e-04,
3559             +1.56988388573005337491e-03,
3560             -3.14481013119645005427e-02,
3561             +2.44030308206595545468e+00,
3562     };
3563 
3564     if (x == T(0.0)) {
3565         return std::numeric_limits<T>::infinity();
3566     }
3567 
3568     if (x < T(0.0)) {
3569         return std::numeric_limits<T>::quiet_NaN();
3570     }
3571 
3572     T p;
3573     T q = 0.0;
3574 
3575     if (x <= T(2.0)) {
3576         T a = A[0];
3577 
3578         for (uint64_t index = 1; index < 10; index++) {
3579             p = q;
3580             q = a;
3581             a = (x * x - T(2.0)) * q - p + A[index];
3582         }
3583 
3584         return (T(0.5) * (a - p) - std::log(T(0.5) * x) * modified_bessel_i0_forward(x)) * std::exp(x);
3585     }
3586 
3587     T b = B[0];
3588 
3589     for (uint64_t index = 1; index < 25; index++) {
3590         p = q;
3591         q = b;
3592         b = (T(8.0) / x - T(2.0)) * q - p + B[index];
3593     }
3594 
3595     return T(0.5) * (b - p) / std::sqrt(x);
3596 } // T scaled_modified_bessel_k0_forward(T x)
3597 
3598 template<typename T>
scaled_modified_bessel_k1_forward(T x)3599 inline C10_HOST_DEVICE T scaled_modified_bessel_k1_forward(T x) {
3600     static const T A[] = {
3601             -7.02386347938628759343e-18,
3602             -2.42744985051936593393e-15,
3603             -6.66690169419932900609e-13,
3604             -1.41148839263352776110e-10,
3605             -2.21338763073472585583e-08,
3606             -2.43340614156596823496e-06,
3607             -1.73028895751305206302e-04,
3608             -6.97572385963986435018e-03,
3609             -1.22611180822657148235e-01,
3610             -3.53155960776544875667e-01,
3611             +1.52530022733894777053e+00,
3612     };
3613 
3614     static const T B[] = {
3615             -5.75674448366501715755e-18,
3616             +1.79405087314755922667e-17,
3617             -5.68946255844285935196e-17,
3618             +1.83809354436663880070e-16,
3619             -6.05704724837331885336e-16,
3620             +2.03870316562433424052e-15,
3621             -7.01983709041831346144e-15,
3622             +2.47715442448130437068e-14,
3623             -8.97670518232499435011e-14,
3624             +3.34841966607842919884e-13,
3625             -1.28917396095102890680e-12,
3626             +5.13963967348173025100e-12,
3627             -2.12996783842756842877e-11,
3628             +9.21831518760500529508e-11,
3629             -4.19035475934189648750e-10,
3630             +2.01504975519703286596e-09,
3631             -1.03457624656780970260e-08,
3632             +5.74108412545004946722e-08,
3633             -3.50196060308781257119e-07,
3634             +2.40648494783721712015e-06,
3635             -1.93619797416608296024e-05,
3636             +1.95215518471351631108e-04,
3637             -2.85781685962277938680e-03,
3638             +1.03923736576817238437e-01,
3639             +2.72062619048444266945e+00,
3640     };
3641 
3642     if (x == T(0.0)) {
3643         return std::numeric_limits<T>::infinity();
3644     }
3645 
3646     if (x < T(0.0)) {
3647         return std::numeric_limits<T>::quiet_NaN();
3648     }
3649 
3650     T p;
3651     T q = 0.0;
3652 
3653     if (x <= T(2.0)) {
3654         T a = A[0];
3655 
3656         for (uint64_t index = 1; index < 11; index++) {
3657             p = q;
3658             q = a;
3659             a = (x * x - T(2.0)) * q - p + A[index];
3660         }
3661 
3662         return (std::log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * std::exp(x);
3663     }
3664 
3665     T b = B[0];
3666 
3667     for (uint64_t index = 1; index < 25; index++) {
3668         p = q;
3669         q = b;
3670         b = (T(8.0) / x - T(2.0)) * q - p + B[index];
3671     }
3672 
3673     return (T(0.5) * (b - p) / std::sqrt(x));
3674 } // T scaled_modified_bessel_k1_forward(T x)
3675 
3676 template<typename T>
shifted_chebyshev_polynomial_t_forward(T x,int64_t n)3677 inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) {
3678     if (n < 0) {
3679         return T(0.0);
3680     }
3681 
3682     if (x == T(1.0)) {
3683         return T(1.0);
3684     }
3685 
3686     if (x == T(0.0)) {
3687         if (n % 2 == 0) {
3688             return T(1.0);
3689         }
3690 
3691         return T(-1.0);
3692     }
3693 
3694     if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) {
3695         return std::cos(n * std::acos(x + x - T(1.0)));
3696     }
3697 
3698     if (n == 0) {
3699         return T(1.0);
3700     }
3701 
3702     if (n == 1) {
3703         return x + x - T(1.0);
3704     }
3705 
3706     T p = T(1.0);
3707     T q = x + x - T(1.0);
3708     T r;
3709 
3710     for (int64_t k = 2; k <= n; k++) {
3711         r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
3712         p = q;
3713         q = r;
3714     }
3715 
3716     return r;
3717 } // shifted_chebyshev_polynomial_t_forward(T x, int64_t n)
3718 
3719 template<typename T, bool is_cuda=false>
shifted_chebyshev_polynomial_t_forward(T x,T n)3720 inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, T n) {
3721     return shifted_chebyshev_polynomial_t_forward(x, static_cast<int64_t>(n));
3722 } // shifted_chebyshev_polynomial_t_forward(T x, T n)
3723 
3724 template<typename T>
shifted_chebyshev_polynomial_u_forward(T x,int64_t n)3725 inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) {
3726     if (n < 0) {
3727         return T(0.0);
3728     }
3729 
3730     if (x == T(1.0)) {
3731         return n + 1;
3732     }
3733 
3734     if (x == T(0.0)) {
3735         if (n % 2 == 0) {
3736             return n + 1;
3737         }
3738 
3739         return -(n + 1);
3740     }
3741 
3742     if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) {
3743         if (std::sin(std::acos(x + x - T(1.0))) != T(0.0)) {
3744             return std::sin((n + 1) * std::acos(x + x - T(1.0))) / std::sin(std::acos(x + x - T(1.0)));
3745         }
3746 
3747         return (n + 1) * std::cos((n + 1) * std::acos(x + x - T(1.0))) / (x + x - T(1.0));
3748     }
3749 
3750     if (n == 0) {
3751         return T(1.0);
3752     }
3753 
3754     if (n == 1) {
3755         return x + x - T(1.0) + (x + x - T(1.0));
3756     }
3757 
3758     T p = T(1.0);
3759     T q = x + x - T(1.0) + (x + x - T(1.0));
3760     T r;
3761 
3762     for (int64_t k = 2; k <= n; k++) {
3763         r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
3764         p = q;
3765         q = r;
3766     }
3767 
3768     return r;
3769 } // shifted_chebyshev_polynomial_u_forward(T x, int64_t n)
3770 
3771 template<typename T, bool is_cuda=false>
shifted_chebyshev_polynomial_u_forward(T x,T n)3772 inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, T n) {
3773     return shifted_chebyshev_polynomial_u_forward(x, static_cast<int64_t>(n));
3774 } // shifted_chebyshev_polynomial_u_forward(T x, T n)
3775 
3776 template<typename T>
shifted_chebyshev_polynomial_v_forward(T x,int64_t n)3777 inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) {
3778     if (n < 0) {
3779         return T(0.0);
3780     }
3781 
3782     if (x == T(1.0)) {
3783         return T(1.0);
3784     }
3785 
3786     if (x == T(0.0)) {
3787         if (n % 2 == 0) {
3788             return (n + n + 1);
3789         }
3790 
3791         return -(n + n + 1);
3792     }
3793 
3794     if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) {
3795         if (std::sin(std::acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
3796             return std::cos(((n) + T(0.5)) * std::acos(x + x - T(1.0))) / std::cos(std::acos(x + x - T(1.0)) / T(2.0));
3797         }
3798 
3799         if (n % 2 == 0) {
3800             return n + n + 1;
3801         }
3802 
3803         return -(n + n + 1);
3804     }
3805 
3806     if (n == 0) {
3807         return T(1.0);
3808     }
3809 
3810     if (n == 1) {
3811         return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
3812     }
3813 
3814     T p = T(1.0);
3815     T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
3816     T r;
3817 
3818     for (int64_t k = 2; k <= n; k++) {
3819         r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
3820         p = q;
3821         q = r;
3822     }
3823 
3824     return r;
3825 } // shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
3826 
3827 template<typename T, bool is_cuda=false>
shifted_chebyshev_polynomial_v_forward(T x,T n)3828 inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, T n) {
3829     return shifted_chebyshev_polynomial_v_forward(x, static_cast<int64_t>(n));
3830 } // shifted_chebyshev_polynomial_v_forward(T x, T n)
3831 
3832 template<typename T>
shifted_chebyshev_polynomial_w_forward(T x,int64_t n)3833 inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) {
3834     if (n < 0) {
3835         return T(0.0);
3836     }
3837 
3838     if (x == T(1.0)) {
3839         return n + n + 1;
3840     }
3841 
3842     if (x == T(0.0)) {
3843         if (n % 2 == 0) {
3844             return T(1.0);
3845         }
3846 
3847         return T(-1.0);
3848     }
3849 
3850     if ((n > 4) && (std::abs(x + x - T(1.0)) < T(1.0))) {
3851         if (std::cos(std::acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
3852             return std::sin((n + T(0.5)) * std::acos(x + x - T(1.0))) / std::sin(std::acos(x + x - T(1.0)) / T(2.0));
3853         }
3854 
3855         if (n % 2 == 0) {
3856             return T(1.0);
3857         }
3858 
3859         return T(-1.0);
3860     }
3861 
3862     if (n == 0) {
3863         return T(1.0);
3864     }
3865 
3866     if (n == 1) {
3867         return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
3868     }
3869 
3870     T p = T(1.0);
3871     T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
3872     T r;
3873 
3874     for (int64_t k = 2; k <= n; k++) {
3875         r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
3876         p = q;
3877         q = r;
3878     }
3879 
3880     return r;
3881 } // shifted_chebyshev_polynomial_w_forward(T x, int64_t n)
3882 
3883 template<typename T, bool is_cuda=false>
shifted_chebyshev_polynomial_w_forward(T x,T n)3884 inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, T n) {
3885     return shifted_chebyshev_polynomial_w_forward(x, static_cast<int64_t>(n));
3886 } // shifted_chebyshev_polynomial_w_forward(T x, T n)
3887 
3888 template<typename T>
spherical_bessel_j0_forward(T x)3889 inline C10_HOST_DEVICE T spherical_bessel_j0_forward(T x) {
3890     if (std::isinf(x)) {
3891         return T(0.0);
3892     }
3893 
3894     if (std::abs(x) < T(0.5)) {
3895         return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0)))))));
3896     }
3897 
3898     return std::sin(x) / x;
3899 } // T spherical_bessel_j0_forward(T x)
3900 
3901 C10_CLANG_DIAGNOSTIC_POP()
3902