xref: /aosp_15_r20/external/XNNPACK/src/f32-raddstoreexpminusmax/scalar-rr2-p5.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert ELEMENTS_TILE >= 1
7$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
8#include <assert.h>
9
10#include <xnnpack/common.h>
11#include <xnnpack/math.h>
12#include <xnnpack/raddstoreexpminusmax.h>
13
14
15void xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
16    size_t elements,
17    const float* input,
18    const float* max,
19    float* output,
20    float* sum,
21    const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
22{
23  assert(elements % sizeof(float) == 0);
24
25  const float vi_max = *max;
26  const float vlog2e = params->scalar_rr2_p5.log2e;
27  const float vmagic_bias = params->scalar_rr2_p5.magic_bias;
28  const float vminus_ln2_hi = params->scalar_rr2_p5.minus_ln2_hi;
29  const float vminus_ln2_lo = params->scalar_rr2_p5.minus_ln2_lo;
30  const float vc5 = params->scalar_rr2_p5.c5;
31  const float vc4 = params->scalar_rr2_p5.c4;
32  const float vc3 = params->scalar_rr2_p5.c3;
33  const float vc2 = params->scalar_rr2_p5.c2;
34  const float vc1 = params->scalar_rr2_p5.c1;
35  const float vdenorm_cutoff = params->scalar_rr2_p5.denorm_cutoff;
36
37  $if ELEMENTS_TILE > 1:
38    $for K in range(ACCUMULATORS):
39      float vacc${K} = 0.0f;
40    for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
41      // Load ${ELEMENTS_TILE} inputs at a time.
42      $for N in range(ELEMENTS_TILE):
43        const float vi${N} = input[${N}];
44      input += ${ELEMENTS_TILE};
45
46      // Subtract maximum input x := i - i_max. This implies x <= 0.
47      $for N in range(ELEMENTS_TILE):
48        const float vx${N} = vi${N} - vi_max;
49
50      // Compute reduced argument n := round(x / log(2)).
51      // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result
52      // to an integer, then subtracing the large number back. The trick with adding large number is valid only within
53      // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
54      // anyway. We fixup the result for such inputs at the very end of the algorithm.
55      $for N in range(ELEMENTS_TILE):
56        float vn${N} = vx${N} * vlog2e + vmagic_bias;
57
58      // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
59      // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
60      $for N in range(ELEMENTS_TILE):
61        const float vs${N} = uint32_as_float(float_as_uint32(vn${N}) << 23);
62
63      // Subtract the large number back to get final n := round(x / log(2)).
64      $for N in range(ELEMENTS_TILE):
65        vn${N} -= vmagic_bias;
66
67      // Compute reduced argument t := x - n * log(2).
68      // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
69      $for N in range(ELEMENTS_TILE):
70        float vt${N} = vn${N} * vminus_ln2_hi + vx${N};
71
72      $for N in range(ELEMENTS_TILE):
73        vt${N} = vn${N} * vminus_ln2_lo + vt${N};
74
75      // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
76      $for N in range(ELEMENTS_TILE):
77        float vp${N} = vc5 * vt${N} + vc4;
78
79      $for N in range(ELEMENTS_TILE):
80        vp${N} = vp${N} * vt${N} + vc3;
81
82      $for N in range(ELEMENTS_TILE):
83        vp${N} = vp${N} * vt${N} + vc2;
84
85      $for N in range(ELEMENTS_TILE):
86        vp${N} = vp${N} * vt${N} + vc1;
87
88      // Reconstruct the final f value:
89      //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
90      //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
91      //     = s + (t * s) * p
92      $for N in range(ELEMENTS_TILE):
93        vt${N} *= vs${N};
94
95      $for N in range(ELEMENTS_TILE):
96        float vf${N} = vt${N} * vp${N} + vs${N};
97
98      // For inputs below denormal cutoff, replace output with +0.0f.
99      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
100      $for N in range(ELEMENTS_TILE):
101        if XNN_UNPREDICTABLE(vx${N} < vdenorm_cutoff) {
102          vf${N} = 0.0f;
103        }
104
105      // Store ${ELEMENTS_TILE} outputs at a time.
106      $for N in range(ELEMENTS_TILE):
107        output[${N}] = vf${N};
108      output += ${ELEMENTS_TILE};
109
110      // Accumulate computed exponents.
111      $for N in range(ELEMENTS_TILE):
112        vacc${N % ACCUMULATORS} += vf${N};
113    }
114    $if ACCUMULATORS > 1:
115      // Add up all accumulators to vacc0
116      $ACC_SLICE = 1
117      $while ACC_SLICE < ACCUMULATORS:
118        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
119          $if A + ACC_SLICE < ACCUMULATORS:
120            vacc${A} += vacc${A + ACC_SLICE};
121        $ACC_SLICE *= 2
122
123    float vacc = vacc0;
124  $else:
125    float vacc = 0.0f;
126  for (; elements >= sizeof(float); elements -= sizeof(float)) {
127    // Load 1 input at a time.
128    const float vi = *input++;
129
130    // Subtract maximum input x := i - i_max. This implies x <= 0.
131    const float vx = vi - vi_max;
132
133    // Compute reduced argument n := round(x / log(2)).
134    // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result
135    // to an integer, then subtracing the large number back. The trick with adding large number is valid only within
136    // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
137    // anyway. We fixup the result for such inputs at the very end of the algorithm.
138    float vn = vx * vlog2e + vmagic_bias;
139
140    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
141    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
142    const float vs = uint32_as_float(float_as_uint32(vn) << 23);
143
144    // Subtract the large number back to get final n := round(x / log(2)).
145    vn -= vmagic_bias;
146
147    // Compute reduced argument t := x - n * log(2).
148    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
149    float vt = vn * vminus_ln2_hi + vx;
150    vt = vn * vminus_ln2_lo + vt;
151
152    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
153    float vp = vc5 * vt + vc4;
154    vp = vp * vt + vc3;
155    vp = vp * vt + vc2;
156    vp = vp * vt + vc1;
157
158    // Reconstruct the final f value:
159    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
160    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
161    //     = s + (t * s) * p
162    vt *= vs;
163    float vf = vt * vp + vs;
164
165    // For inputs below denormal cutoff, replace output with +0.0f.
166    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
167    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
168      vf = 0.0f;
169    }
170
171    // Store 1 output at a time.
172    *output++ = vf;
173
174    // Accumulate computed exponents.
175    vacc += vf;
176  }
177  *sum = vacc;
178}
179