/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/*******************************************************************************
 * Copyright (c) 2018-2023 Cadence Design Systems, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to use this Software with Cadence processor cores only and
 * not with any other processors and platforms, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 ******************************************************************************/

#include "xa_nnlib_common.h"
#include "xa_type_def.h"

#ifdef ROW_UNROLL
#undef ROW_UNROLL
#endif
#define ROW_UNROLL 4

#include "xa_nnlib_common_macros.h"

/* Include the asym8uxasym8u_asym8u macros */
#include "matmul_asym8uxasym8u_asym8u_macros.h"

/*----------------------------Main function---------------------------------*/

namespace cadence {
namespace impl {
namespace HiFi {
namespace kernels {
/*
The following function is copied from xa_nn_matmul_asym8xasym8_asym8 in
xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matmul_asym8xasym8.c.

xa_nn_matmul_asym8xasym8_asym8 multiplies two quint8 matrices and requantizes
the result to produce a quint8 output. However, it has two limitations:
1. It only works for per-tensor quantized weights.
2. It forces the weight rows to be unrolled by 4 when both input and weight are
aligned to an 8-byte boundary.

We modify xa_nn_matmul_asym8xasym8_asym8 to also allow per-channel quantized
weights. To do so, we make the following two changes (sketched below):
1. out_multiplier and out_shift become arrays instead of scalars. Apart from
the function args, we add a new macro (UNROLL_ROW_SETUP_SHIFT) which computes
the right out_shift for each channel (i.e., unrolled row of the weight) and
stores it in the appropriate index of left_shift[ROW_UNROLL] and
right_shift[ROW_UNROLL].
2. We modify the ADJUST_ACC_BATCH_ASYM8b macro so that it picks up the right
out_multiplier and out_shift for the accumulation corresponding to each channel
(i.e., unrolled row of the weight).
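
To illustrate, here is a rough sketch (not the actual macro expansion, which
uses HiFi4 intrinsics and may differ in detail) of what UNROLL_ROW_SETUP_SHIFT(idx)
computes for unrolled row idx, assuming the usual TFLite convention of splitting
a signed out_shift into a left shift and a right shift:

  int shift = per_channel_quantized ? out_shift[m_itr + idx] : out_shift[0];
  left_shift[idx] = (shift < 0) ? 0 : shift;
  right_shift[idx] = (shift < 0) ? -shift : 0;

The modified ADJUST_ACC_BATCH_ASYM8b then requantizes the accumulator of row
(m_itr + idx) with out_multiplier[m_itr + idx] (or out_multiplier[0] in the
per-tensor case) and the left_shift/right_shift entries above, instead of a
single scalar multiplier and shift.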

Through experimentation, we observe that the kernel performs better when the
weight matrix is uniformly unrolled by a factor of 2 instead of 4 in the
8-byte-aligned case. We add a case for ROW_UNROLL=2 and VEC_UNROLL=2 in
xa_nnlib_matmul_unroll_macros.h. This code is similar to the ROW_UNROLL=4 and
VEC_UNROLL=2 code in
nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h.

General information about the code:
The HiFi4 xa_nn_matmul_asym8xasym8_asym8 kernel is written using macros, which
expand to HiFi4 intrinsics in
nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h.
The code caters to two specific cases (a scalar reference of the computation is
sketched after the list below):
1. When the two input matrices (p_mat1 and p_vec1) are aligned to an 8-byte
boundary, we do not need unaligned loads. In that case, 'chk_align' is true,
and the code unrolls p_mat1 by 4 and p_vec1 by 2.
2. If chk_align is false, then the code unrolls both p_mat1 and p_vec1 by a
factor of 2, and uses macros that expand to unaligned loads via register
priming (e.g., LOAD_VEC_BATCH_ASYM8b_UNALIGNED).
3. If either p_mat1 or p_vec1 is nullptr, the code returns -1.
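
For reference, ignoring unrolling, alignment handling, and the saturating
intrinsics, the computation is roughly the following (a sketch only;
requantize() and clamp() are placeholder helpers, and the output indexing via
out_offset/out_stride is assumed to follow the usual NNLib matmul convention):

  for (vec = 0; vec < vec_count; vec++) {
    for (m = 0; m < rows; m++) {
      WORD32 acc = p_bias[m];
      for (c = 0; c < cols1; c++) {
        acc += (p_mat1[m * row_stride1 + c] + mat1_zero_bias) *
               (p_vec1[vec * vec_offset + c] + vec1_zero_bias);
      }
      acc = requantize(acc, out_multiplier[m], out_shift[m]);
      p_out[vec * out_offset + m * out_stride] =
          clamp(acc + out_zero_bias, 0, 255);
    }
  }

(In the per-tensor case, out_multiplier[0] and out_shift[0] are used for every
row.)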

The choice of unrolling factors in the NNLib kernel is not controlled by the
user: ROW_UNROLL is set to 4 by default, and the choice is not governed by any
heuristics. It is also not clear that the performance degradation due to
unaligned loads/stores is large enough to warrant two branches in the code
(the if/else branching on chk_align).

Future modifications: In the future, if Tensilica provides a new version of the
xa_nn_matmul_asym8xasym8_asym8 kernel, the changes to this file should be
minimal: we just copy the entire function here, change the args for
out_multiplier and out_shift, and add the SETUP_SHIFT/UNROLL_ROW_SETUP_SHIFT
macros to get the right out_shift for each unrolled row.
*/

WORD32 matmul_asym8uxasym8u_asym8u(
    UWORD8* __restrict__ p_out,
    const UWORD8* __restrict__ p_mat1,
    const UWORD8* __restrict__ p_vec1,
    const WORD32* __restrict__ p_bias,
    WORD32 rows,
    WORD32 cols1,
    WORD32 row_stride1,
    WORD32 vec_count,
    WORD32 vec_offset,
    WORD32 out_offset,
    WORD32 out_stride,
    WORD32 mat1_zero_bias,
    WORD32 vec1_zero_bias,
    const WORD32* __restrict__ out_multiplier,
    const WORD32* __restrict__ out_shift,
    WORD32 out_zero_bias,
    bool per_channel_quantized) {
  /* Iterators used in for loops */
  int m_itr, c_itr, vec_itr;
  /* Assign an initial value so it is defined for the trailing loop */
  m_itr = 0;
  /* Shifts to match with Tensorflow */
  int left_shift[ROW_UNROLL] = {0}, right_shift[ROW_UNROLL] = {0};

#define UNROLL_ROW_SETUP_ACC_BATCH SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_ACC_BATCH SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_MAT1 SETUP_MAT1_ASYM8b
#define UNROLL_SETUP_VEC_BATCH SETUP_VEC_OFFSET_BATCH_ASYM8b
#define SETUP_BIAS SETUP_BIAS_ASYM8b
#define UNROLL_LOAD_VEC_BATCH LOAD_VEC_BATCH_ASYM8b
#define UNROLL_LOAD_ROW_MAT1 LOAD_ROW_MAT1_ASYM8b
#define LOAD_BIAS LOAD_BIAS_ASYM8b_MATMUL
#define UNROLL_ROW_KERNEL_MAT1_VEC_BATCH KERNEL_MAT1_VEC_BATCH_ROW_ASYM8b_ASYM8b
#define UNROLL_KERNEL_MAT1_VEC_BATCH KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b
#define UNROLL_ROW_ADD_BIAS_ACC \
  ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ADD_BIAS_ACC_BATCH \
  ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ROW_ADJUST_ACC ADJUST_ACC_BATCH_ROW_ASYM8b
#define UNROLL_ADJUST_ACC_BATCH ADJUST_ACC_BATCH_ASYM8b
#define UNROLL_ROW_STORE_ACC STORE_ACC_BATCH_ROW_ASYM8bxASYM8b_AT_OUT_ASYM8b
#define UNROLL_STORE_ACC_BATCH \
  STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b

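  // CHK_MATMUL_ALIGN sets chk_align when p_mat1, p_vec1, and the stride
  // arguments permit aligned loads (see the header comment above); in that
  // case the aligned fast path below is taken, unrolling rows by ROW_UNROLL
  // and vectors by VEC_UNROLL.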
  int chk_align = 0;
  CHK_MATMUL_ALIGN(
      chk_align, p_mat1, 1, p_vec1, 1, cols1, row_stride1, vec_offset, 4);

  if (chk_align) {
    for (vec_itr = 0; vec_itr < (vec_count & ~(VEC_UNROLL - 1));
         vec_itr += VEC_UNROLL) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        SETUP_SHIFT;
        SETUP_ACC_BATCH;
        SETUP_VEC_BATCH;
        SETUP_MAT1;

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          LOAD_VEC_BATCH;
          LOAD_MAT1;
          KERNEL_MAT1_VEC_BATCH;
        }

        ADD_BIAS_ACC_BATCH;
        ADJUST_ACC_BATCH;
        STORE_ACC_BATCH;
      }

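      // Remaining rows (rows % ROW_UNROLL), processed one at a time.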
#pragma no_unroll
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_ROW_SETUP_ACC_BATCH(0);
        SETUP_VEC_BATCH;
        UNROLL_SETUP_MAT1(0);

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          LOAD_VEC_BATCH;
          UNROLL_LOAD_ROW_MAT1(0);
          UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(0);
        }

        UNROLL_ROW_ADD_BIAS_ACC(0);
        UNROLL_ROW_ADJUST_ACC(0);
        UNROLL_ROW_STORE_ACC(0);
      }
    }
    /* Tail loop for vec unroll */
    for (; vec_itr < vec_count; vec_itr++) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        SETUP_SHIFT;
        SETUP_ACC_BATCH_TAIL;
        UNROLL_SETUP_VEC_BATCH(0);
        SETUP_MAT1;

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          UNROLL_LOAD_VEC_BATCH(0);
          LOAD_MAT1;
          KERNEL_MAT1_VEC_BATCH_TAIL;
        }

        ADD_BIAS_ACC_BATCH_TAIL;
        ADJUST_ACC_BATCH_TAIL;
        STORE_ACC_BATCH_TAIL;
      }

#pragma no_unroll
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_VEC_BATCH(0);
        UNROLL_SETUP_MAT1(0);

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          UNROLL_LOAD_VEC_BATCH(0);
          UNROLL_LOAD_ROW_MAT1(0);
          UNROLL_KERNEL_MAT1_VEC_BATCH(0, 0);
        }

        LOAD_BIAS;
        UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
        UNROLL_ADJUST_ACC_BATCH(0, 0);
        UNROLL_STORE_ACC_BATCH(0, 0);
      }
    }

  /* Undefining the defined macros to make them available for reuse */
#undef UNROLL_ROW_SETUP_ACC_BATCH
#undef UNROLL_SETUP_ACC_BATCH
#undef UNROLL_SETUP_MAT1
#undef UNROLL_SETUP_VEC_BATCH
#undef SETUP_BIAS
#undef SETUP_SHIFT
#undef UNROLL_LOAD_VEC_BATCH
#undef UNROLL_LOAD_ROW_MAT1
#undef LOAD_BIAS
#undef UNROLL_ROW_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_ROW_ADD_BIAS_ACC
#undef UNROLL_ADD_BIAS_ACC_BATCH
#undef UNROLL_ROW_ADJUST_ACC
#undef UNROLL_ADJUST_ACC_BATCH
#undef UNROLL_ROW_STORE_ACC
#undef UNROLL_STORE_ACC_BATCH
#undef VEC_UNROLL
#undef ROW_UNROLL
  } else if (p_mat1 && p_vec1) {
#define ROW_UNROLL 2
#define VEC_UNROLL 2
#define UNROLL_ROW_SETUP_ACC_BATCH SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_ACC_BATCH SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b
#define SETUP_BIAS SETUP_BIAS_ASYM8b
#define LOAD_BIAS LOAD_BIAS_ASYM8b_MATMUL
#define UNROLL_ROW_ADD_BIAS_ACC \
  ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ADD_BIAS_ACC_BATCH \
  ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ROW_ADJUST_ACC ADJUST_ACC_BATCH_ROW_ASYM8b
#define UNROLL_ADJUST_ACC_BATCH ADJUST_ACC_BATCH_ASYM8b
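    // Unaligned case: both p_mat1 and p_vec1 are unrolled by 2, and loads go
    // through the *_UNALIGNED macros (register priming), as described in the
    // comment at the top of this file.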
    for (vec_itr = 0; vec_itr < (vec_count & ~(VEC_UNROLL - 1));
         vec_itr += VEC_UNROLL) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_ROW_SETUP_SHIFT(1);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_ACC_BATCH(0, 1);
        UNROLL_SETUP_ACC_BATCH(1, 0);
        UNROLL_SETUP_ACC_BATCH(1, 1);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(1);
        SETUP_MAT1_ASYM8b_UNALIGNED(0);
        SETUP_MAT1_ASYM8b_UNALIGNED(1);

        int cols1_count = cols1 - cols1 % 4;
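        // Columns are processed four at a time; the leftover (cols1 % 4)
        // columns are handled one at a time in the single-element loop below.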
        for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 1);
        }
#pragma no_unroll
        for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 1);
        }

        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(0);
        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(1);
        ADJUST_ACC_BATCH_ROW_ASYM8b(0);
        ADJUST_ACC_BATCH_ROW_ASYM8b(1);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 1);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 1);
      }
      // Remaining row
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_ACC_BATCH(0, 1);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(1);
        SETUP_MAT1_ASYM8b_UNALIGNED(0);
        int cols1_count = cols1 - cols1 % 4;

        for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 1);
        }
#pragma no_unroll
        for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 1);
        }
        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(0);
        ADJUST_ACC_BATCH_ROW_ASYM8b(0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 1);
      }
    }
    {
      /* Tail loop for vec unroll */
      for (; vec_itr < vec_count; vec_itr++) {
        SETUP_BIAS;
        for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1));
             m_itr += ROW_UNROLL) {
          UNROLL_ROW_SETUP_SHIFT(0);
          UNROLL_ROW_SETUP_SHIFT(1);
          UNROLL_SETUP_ACC_BATCH(0, 0);
          UNROLL_SETUP_ACC_BATCH(1, 0);
          SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(1);
          int cols1_count = cols1 - cols1 % 4;

          for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(1);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 0);
          }
#pragma no_unroll
          for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(1);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 0);
          }

          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
          UNROLL_ADJUST_ACC_BATCH(0, 0);
          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(1, 0);
          UNROLL_ADJUST_ACC_BATCH(1, 0);

          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 0);
        }

        for (; m_itr < rows; m_itr++) {
          UNROLL_ROW_SETUP_SHIFT(0);
          UNROLL_SETUP_ACC_BATCH(0, 0);
          SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(0);
          int cols1_count = cols1 - cols1 % 4;

          for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          }
#pragma no_unroll
          for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          }

          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
          UNROLL_ADJUST_ACC_BATCH(0, 0);
          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        }
      }
    }
  } else {
    return -1;
  }

#undef UNROLL_ROW_SETUP_ACC_BATCH
#undef UNROLL_SETUP_ACC_BATCH
#undef UNROLL_SETUP_MAT1
#undef UNROLL_SETUP_VEC_BATCH
#undef SETUP_BIAS
#undef SETUP_SHIFT
#undef UNROLL_LOAD_VEC_BATCH
#undef UNROLL_LOAD_ROW_MAT1
#undef LOAD_BIAS
#undef UNROLL_ROW_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_ROW_ADD_BIAS_ACC
#undef UNROLL_ADD_BIAS_ACC_BATCH
#undef UNROLL_ROW_ADJUST_ACC
#undef UNROLL_ADJUST_ACC_BATCH
#undef UNROLL_ROW_STORE_ACC
#undef UNROLL_STORE_ACC_BATCH
#undef VEC_UNROLL
#undef ROW_UNROLL

  return 0;
}

}; // namespace kernels
}; // namespace HiFi
}; // namespace impl
}; // namespace cadence