xref: /aosp_15_r20/external/XNNPACK/src/qu8-gemm/c4-neondot.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
7$assert NR % 8 == 0
8$assert 8 <= NR <= 32
9$assert REQUANTIZATION in ["FP32", "RNDNU"]
10#include <assert.h>
11
12#include <arm_neon.h>
13
14#include <xnnpack/gemm.h>
15$if REQUANTIZATION == "FP32":
16  #include <xnnpack/intrinsics-polyfill.h>
17#include <xnnpack/math.h>
18
19
20$PARAMS_STRUCT = "fp32_neonv8" if REQUANTIZATION == "FP32" else REQUANTIZATION.lower() + "_neon"
// QU8 GEMM micro-kernel template with min/max output clamping, built on the
// NEON dot-product intrinsics (vdot_u32 / vdotq_lane_u32).
//
// The kernel walks the output in groups of ${NR} columns. For each group it
//   1. loads ${NR} 32-bit initial accumulator values from `w` and copies them
//      to every row,
//   2. accumulates activation * weight dot products along `kc`, 8 bytes per
//      step with a final 4-byte step,
//   3. subtracts a per-row correction term (dot product of the broadcast
//      kernel zero point with the activation bytes),
//   4. requantizes (${REQUANTIZATION}), adds the output zero point, narrows to
//      uint8 and clamps to [output_min, output_max],
//   5. stores either a full group of ${NR} columns or the remaining `nc`.
//
// Parameters:
//   mr        - number of rows to process; asserted to be in [1, ${MR}].
//   nc        - number of output columns; asserted non-zero.
//   kc        - reduction length in bytes; asserted non-zero. It is passed
//               through round_up_po2(kc, 4) before use.
//   a         - first activation row; row M starts a_stride bytes after row M-1.
//   a_stride  - byte distance between consecutive activation rows.
//   w         - packed weights, consumed sequentially: per column group the
//               initial accumulators, then 16-byte weight vectors.
//   c         - first output row; row M starts cm_stride bytes after row M-1.
//   cm_stride - byte distance between consecutive output rows.
//   cn_stride - byte offset added to every output row pointer after a full
//               group of ${NR} columns has been stored.
//   params    - the ${PARAMS_STRUCT} member is read for kernel_zero_point,
//               the requantization constants, output_zero_point, output_min
//               and output_max.
21void xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x${NR}c4__neondot(
22    size_t mr,
23    size_t nc,
24    size_t kc,
25    const uint8_t* restrict a,
26    size_t a_stride,
27    const void* restrict w,
28    uint8_t* restrict c,
29    size_t cm_stride,
30    size_t cn_stride,
31    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
32{
33  assert(mr != 0);
34  assert(mr <= ${MR});
35  assert(nc != 0);
36  assert(kc != 0);
37  assert(kc % sizeof(uint8_t) == 0);
38  assert(a != NULL);
39  assert(w != NULL);
40  assert(c != NULL);
41
  // NOTE(review): round_up_po2 is presumably a round-up to a multiple of 4
  // bytes - confirm in xnnpack/math.h. The loops below consume k in 8- and
  // 4-byte steps and the later pointer rewind by kc relies on that.
42  kc = round_up_po2(kc, 4 * sizeof(uint8_t));
43  const uint8_t* a0 = a;
44  uint8_t* c0 = c;
  // Set up pointers for rows 1..${MR}-1. When `mr` does not reach row M, that
  // row's pointers alias row M-1, so the unused row re-reads and re-writes the
  // same locations as the previous row.
45  $for M in range(1, MR):
46    const uint8_t* a${M} = (const uint8_t*) ((uintptr_t) a${M-1} + a_stride);
47    uint8_t* c${M} = (uint8_t*) ((uintptr_t) c${M-1} + cm_stride);
48    $if M % 2 == 0:
49      if XNN_UNPREDICTABLE(mr <= ${M}) {
50        a${M} = a${M-1};
51        c${M} = c${M-1};
52      }
53    $elif M + 1 == MR:
54      if XNN_UNPREDICTABLE(mr != ${M+1}) {
55        a${M} = a${M-1};
56        c${M} = c${M-1};
57      }
58    $else:
59      if XNN_UNPREDICTABLE(mr < ${M+1}) {
60        a${M} = a${M-1};
61        c${M} = c${M-1};
62      }
63
  // Kernel zero point replicated into all 8 byte lanes.
64  const uint8x8_t va_zero_point = vld1_dup_u8(&params->${PARAMS_STRUCT}.kernel_zero_point[0]);
65
66  // Loop over groups of ${NR} columns.
67  do {
68    // Initialize accumulators with bias. ${NR} bias values are loaded from the
69    // weight matrix, at the start of the group of ${NR} columns.
70    $for N in range(0, NR, 4):
71      uint32x4_t vpacc0x${ABC[N:N+4]} = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
72    $for M in range(1, MR):
73      $for N in range(0, NR, 4):
74        uint32x4_t vpacc${M}x${ABC[N:N+4]} = vpacc0x${ABC[N:N+4]};
    // One zero-initialized accumulator per row for the zero-point correction
    // term, kept separately from the product accumulators above.
75    $for M in range(0, MR):
76      uint32x2_t vnacc${M} = vmov_n_u32(0);
77
78    // Inner accumulation loop along the ${NR} columns.
79    size_t k = kc;
80    // 2x partial unrolled loop to load 8 bytes at a time.
81    while (k >= 8 * sizeof(uint8_t)) {
82      // Load a ${MR}x8 block of activations.
83      $for M in range(MR):
84        const uint8x8_t va${M}x01234567 = vld1_u8(a${M}); a${M} += 8;
85
86      // Load a 8x${NR} block of weights.
87      $for K in range(0, 8, 4):
88        $for N in range(0, NR, 4):
89          const uint8x16_t vb${ABC[K:K+4]}x${ABC[N:N+4]} = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
90
91      // Multiply-accumulate: ${MR}x8 * 8x${NR} --> ${MR}x${NR}.
      // Each row also accumulates va_zero_point . activations into its vnacc
      // register; the lane index selects which 4-byte activation group is used.
92      $for M in range(MR):
93        vnacc${M} = vdot_u32(vnacc${M}, va_zero_point, va${M}x01234567);
94        $for K in range(0, 8, 4):
95          $for N in range(0, NR, 4):
96            vpacc${M}x${ABC[N:N+4]} = vdotq_lane_u32(vpacc${M}x${ABC[N:N+4]}, vb${ABC[K:K+4]}x${ABC[N:N+4]}, va${M}x01234567, ${K//4});
97
98      k -= 8 * sizeof(uint8_t);
99    }
100    // Handle up to 4 final positions of `k`
101    if XNN_UNLIKELY(k != 0) {
102      // Load a ${MR}x4 block of activations.
      // Only lane 0 (4 bytes) is loaded; the upper 4 bytes of each vector are zero.
103      $for M in range(MR):
104        const uint8x8_t va${M}x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a${M}, vmov_n_u32(0), 0)); a${M} += 4;
105
106      // Load a 4x${NR} block of weights.
107      $for N in range(0, NR, 4):
108        const uint8x16_t vb0123x${ABC[N:N+4]} = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
109
110      // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
111      $for M in range(MR):
112        vnacc${M} = vdot_u32(vnacc${M}, va_zero_point, va${M}x01234567);
113        $for N in range(0, NR, 4):
114          vpacc${M}x${ABC[N:N+4]} = vdotq_lane_u32(vpacc${M}x${ABC[N:N+4]}, vb0123x${ABC[N:N+4]}, va${M}x01234567, 0);
115    }
116
117    // Subtract zero point from accumulators.
    // The pairwise add folds the two partial sums of each row's correction
    // term into both lanes; the result is then replicated to 4 lanes and
    // subtracted from every product accumulator of that row. From here on the
    // accumulators are treated as signed 32-bit values.
118    $for M in range(0, MR):
119      vnacc${M} = vpadd_u32(vnacc${M}, vnacc${M});
120      const uint32x4_t vnacc${M}x0123 = vcombine_u32(vnacc${M}, vnacc${M});
121      $for N in range(0, NR, 4):
122        int32x4_t vacc${M}x${ABC[N:N+4]} = vreinterpretq_s32_u32(vsubq_u32(vpacc${M}x${ABC[N:N+4]}, vnacc${M}x0123));
123
    // Requantize the 32-bit accumulators. RNDNU: shift by right_pre_shift,
    // vqdmulhq by multiplier, then rounding shift by right_post_shift.
    // FP32: convert to float, multiply by scale, convert back to int32.
124    $if REQUANTIZATION == "RNDNU":
125      const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->${PARAMS_STRUCT}.right_pre_shift);
126      const int32x4_t vmultiplier = vld1q_dup_s32(&params->${PARAMS_STRUCT}.multiplier);
127      const int32x4_t vright_post_shift = vld1q_dup_s32(&params->${PARAMS_STRUCT}.right_post_shift);
128
129      $for M in range(MR):
130        $for N in range(0, NR, 4):
131          vacc${M}x${ABC[N:N+4]} = vshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_pre_shift);
132
133      $for M in range(MR):
134        $for N in range(0, NR, 4):
135          vacc${M}x${ABC[N:N+4]} = vqdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
136
137      $for M in range(MR):
138        $for N in range(0, NR, 4):
139          vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_post_shift);
140    $elif REQUANTIZATION == "FP32":
141      $for M in range(MR):
142        $for N in range(0, NR, 4):
143          float32x4_t vfpacc${M}x${ABC[N:N+4]} = vcvtq_f32_s32(vacc${M}x${ABC[N:N+4]});
144
145      const float32x4_t vscale = vld1q_dup_f32(&params->${PARAMS_STRUCT}.scale);
146      $for M in range(MR):
147        $for N in range(0, NR, 4):
148          vfpacc${M}x${ABC[N:N+4]} = vmulq_f32(vfpacc${M}x${ABC[N:N+4]}, vscale);
149
150      $for M in range(MR):
151        $for N in range(0, NR, 4):
152          vacc${M}x${ABC[N:N+4]} = vcvtnq_s32_f32(vfpacc${M}x${ABC[N:N+4]});
153
    // Narrow int32 -> int16, add the output zero point, then narrow
    // int16 -> uint8. Both architecture branches produce the same variables;
    // the ARM64 branch uses the *_high narrowing forms, the other branch
    // combines two half-width narrowings.
    // When a row contributes only 8 columns to a 16-byte vector, rows M-1 and
    // M are packed together (low half = row M-1, high half = row M); an
    // unpaired last row gets its own 8-byte vector.
154    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->${PARAMS_STRUCT}.output_zero_point);
155#if XNN_ARCH_ARM64
156    $for M in range(MR):
157      $for N in range(0, NR, 8):
158        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
159
160    $for M in range(MR):
161      $for N in range(0, NR, 16):
162        $if N + 8 < NR:
163          uint8x16_t vout${M}x${ABC[N:N+16]} = vqmovun_high_s16(vqmovun_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
164        $elif M % 2 == 1:
165          uint8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovun_high_s16(vqmovun_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
166        $elif M + 1 == MR:
167          uint8x8_t vout${M}x${ABC[N:N+8]} = vqmovun_s16(vacc${M}x${ABC[N:N+8]});
168#else
169    $for M in range(MR):
170      $for N in range(0, NR, 8):
171        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
172
173    $for M in range(MR):
174      $for N in range(0, NR, 16):
175        $if N + 8 < NR:
176          uint8x16_t vout${M}x${ABC[N:N+16]} = vcombine_u8(vqmovun_s16(vacc${M}x${ABC[N:N+8]}), vqmovun_s16(vacc${M}x${ABC[N+8:N+16]}));
177        $elif M % 2 == 1:
178          uint8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_u8(vqmovun_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovun_s16(vacc${M}x${ABC[N:N+8]}));
179        $elif M + 1 == MR:
180          uint8x8_t vout${M}x${ABC[N:N+8]} = vqmovun_s16(vacc${M}x${ABC[N:N+8]});
181#endif
    // Clamp to [output_min, output_max]. The 1x8 variant only ever holds an
    // 8-byte output vector, so it loads 8-lane bounds; every other variant
    // loads 16-lane bounds and takes the low half for 8-byte vectors.
182    $if NR == 8 and MR == 1:
183      const uint8x8_t voutput_min = vld1_dup_u8(&params->${PARAMS_STRUCT}.output_min);
184      const uint8x8_t voutput_max = vld1_dup_u8(&params->${PARAMS_STRUCT}.output_max);
185    $else:
186      const uint8x16_t voutput_min = vld1q_dup_u8(&params->${PARAMS_STRUCT}.output_min);
187      const uint8x16_t voutput_max = vld1q_dup_u8(&params->${PARAMS_STRUCT}.output_max);
188
189    $for M in range(MR):
190      $for N in range(0, NR, 16):
191        $if N + 8 < NR:
192          vout${M}x${ABC[N:N+16]} = vmaxq_u8(vout${M}x${ABC[N:N+16]}, voutput_min);
193        $elif M % 2 == 1:
194          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_u8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
195        $elif M + 1 == MR:
196          $if NR == 8 and MR == 1:
197            vout${M}x${ABC[N:N+8]} = vmax_u8(vout${M}x${ABC[N:N+8]}, voutput_min);
198          $else:
199            vout${M}x${ABC[N:N+8]} = vmax_u8(vout${M}x${ABC[N:N+8]}, vget_low_u8(voutput_min));
200
201    $for M in range(MR):
202      $for N in range(0, NR, 16):
203        $if N + 8 < NR:
204          vout${M}x${ABC[N:N+16]} = vminq_u8(vout${M}x${ABC[N:N+16]}, voutput_max);
205        $elif M % 2 == 1:
206          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_u8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
207        $elif M + 1 == MR:
208          $if NR == 8 and MR == 1:
209            vout${M}x${ABC[N:N+8]} = vmin_u8(vout${M}x${ABC[N:N+8]}, voutput_max);
210          $else:
211            vout${M}x${ABC[N:N+8]} = vmin_u8(vout${M}x${ABC[N:N+8]}, vget_low_u8(voutput_max));
212
213    if (nc >= ${NR}) {
      // Full group: store all ${NR} columns of every row.
214      $for M in range(MR):
215        $for N in range(0, NR, 16):
216          $if N + 8 < NR:
217            vst1q_u8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
218          $elif M % 2 == 1:
219            vst1_u8(c${M-1} + ${N}, vget_low_u8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
220            vst1_u8(c${M} + ${N}, vget_high_u8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
221          $elif M + 1 == MR:
222            vst1_u8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
223
      // Advance every output row pointer to the next column group.
224      $for M in range(MR):
225        c${M} = (uint8_t*) ((uintptr_t) c${M} + cn_stride);
226
      // Rewind the activation pointers by the kc bytes consumed above so the
      // next column group re-reads the same rows.
227      $for M in range(MR):
228        a${M} = (const uint8_t*) ((uintptr_t) a${M} - kc);
229
230      nc -= ${NR};
231    } else {
      // Partial group: emit the remaining nc columns in chunks of 16 (NR == 32
      // only), 8 (NR >= 16 only), 4, 2 and 1, selected by the bits of nc.
      // After each chunk the not-yet-stored lanes are moved down to lane 0.
232      $if NR == 32:
233        if (nc & 16) {
234          $for M in range(MR):
235            vst1q_u8(c${M}, vout${M}x${ABC[0:16]});  c${M} += 16;
236
237          $for M in range(MR):
238            vout${M}x${ABC[0:16]} = vout${M}x${ABC[16:32]};
239        }
240      $if NR >= 16:
241        $for M in range(MR):
242          $if M % 2 == 1:
243            uint8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_u8(vget_low_u8(vout${M-1}x0123456789ABCDEF), vget_low_u8(vout${M}x0123456789ABCDEF));
244          $elif M + 1 == MR:
245            uint8x8_t vout${M}x01234567 = vget_low_u8(vout${M}x0123456789ABCDEF);
246        if (nc & 8) {
247          $for M in range(MR):
248            $if M % 2 == 1:
249              vst1_u8(c${M-1}, vget_low_u8(vout${M-1}x01234567_${M}x01234567)); c${M-1} += 8;
250              vst1_u8(c${M}, vget_high_u8(vout${M-1}x01234567_${M}x01234567)); c${M} += 8;
251            $elif M + 1 == MR:
252              vst1_u8(c${M}, vout${M}x01234567); c${M} += 8;
253          $for M in range(MR):
254            $if M % 2 == 1:
255              vout${M-1}x01234567_${M}x01234567 = vcombine_u8(vget_high_u8(vout${M-1}x0123456789ABCDEF), vget_high_u8(vout${M}x0123456789ABCDEF));
256            $elif M + 1 == MR:
257              vout${M}x01234567 = vget_high_u8(vout${M}x0123456789ABCDEF);
258        }
259      if (nc & 4) {
260        $for M in range(MR):
261          $if M % 2 == 1:
262            vst1q_lane_u32((void*) c${M-1}, vreinterpretq_u32_u8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
263            vst1q_lane_u32((void*) c${M}, vreinterpretq_u32_u8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
264          $elif M + 1 == MR:
265            vst1_lane_u32((void*) c${M}, vreinterpret_u32_u8(vout${M}x01234567), 0); c${M} += 4;
266        $for M in range(MR):
267          $if M % 2 == 1:
268            vout${M-1}x01234567_${M}x01234567 = vextq_u8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
269          $elif M + 1 == MR:
270            vout${M}x01234567 = vext_u8(vout${M}x01234567, vout${M}x01234567, 4);
271      }
272      if (nc & 2) {
273        $for M in range(MR):
274          $if M % 2 == 1:
275            vst1q_lane_u16((void*) c${M-1}, vreinterpretq_u16_u8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
276            vst1q_lane_u16((void*) c${M}, vreinterpretq_u16_u8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
277          $elif M + 1 == MR:
278            vst1_lane_u16((void*) c${M}, vreinterpret_u16_u8(vout${M}x01234567), 0); c${M} += 2;
279        $for M in range(MR):
280          $if M % 2 == 1:
281            vout${M-1}x01234567_${M}x01234567 = vextq_u8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
282          $elif M + 1 == MR:
283            vout${M}x01234567 = vext_u8(vout${M}x01234567, vout${M}x01234567, 2);
284      }
285      if (nc & 1) {
286        $for M in range(MR):
287          $if M % 2 == 1:
288            vst1q_lane_u8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
289            vst1q_lane_u8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
290          $elif M + 1 == MR:
291            vst1_lane_u8(c${M}, vout${M}x01234567, 0);
292      }
293
      // The partial group is always the last one.
294      nc = 0;
295    }
296  } while (nc != 0);
297}
298