/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/*******************************************************************************
* Copyright (c) 2018-2023 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to use this Software with Cadence processor cores only and
* not with any other processors and platforms, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

******************************************************************************/

#include "xa_nnlib_common.h"
#include "xa_type_def.h"

#ifdef ROW_UNROLL
#undef ROW_UNROLL
#endif
#define ROW_UNROLL 4

#include "xa_nnlib_common_macros.h"

/* Include the asym8uxasym8u_asym8u macros */
#include "matmul_asym8uxasym8u_asym8u_macros.h"

/*----------------------------Main function---------------------------------*/

namespace cadence {
namespace impl {
namespace HiFi {
namespace kernels {
/*
The following function is copied from xa_nn_matmul_asym8xasym8_asym8 in
xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matmul_asym8xasym8.c.

xa_nn_matmul_asym8xasym8_asym8 multiplies two quint8 matrices, and requantizes
the result to produce a quint8 output. However, it has two limitations:
1. It only works for per-tensor quantized weights.
2. It forces the weight rows to be unrolled by 4 when both input and weight are
aligned to an 8-byte boundary.

We modify xa_nn_matmul_asym8xasym8_asym8 to also allow per-channel quantized
weights. To do so, we make the following two changes:
1. out_multiplier and out_shift are now arrays instead of scalars. Apart from
the function args, we add a new macro (UNROLL_ROW_SETUP_SHIFT) which computes
the right out_shift for each channel (i.e., unrolled row of the weight), and
stores it in the appropriate index of left_shift[ROW_UNROLL] and
right_shift[ROW_UNROLL].
2. We modify the ADJUST_ACC_BATCH_ASYM8b macro so that it picks up the right
out_multiplier and out_shift for the accumulation corresponding to each channel
(i.e., unrolled row of the weight); see the sketch below.
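
To make this concrete, here is a rough scalar sketch of what
UNROLL_ROW_SETUP_SHIFT and the modified ADJUST_ACC_BATCH_ASYM8b amount to for
the accumulator of unrolled row i (hypothetical reference code, not the actual
HiFi4 intrinsic implementation; the per-channel indexing is the assumption
being illustrated):

  WORD32 ch      = per_channel_quantized ? (m_itr + i) : 0;
  WORD32 mult    = out_multiplier[ch];
  WORD32 shift   = out_shift[ch];
  left_shift[i]  = shift > 0 ? shift : 0;   // applied before the multiply
  right_shift[i] = shift > 0 ? 0 : -shift;  // applied, with rounding, after

The accumulator for row i is then scaled by mult (rounding fixed-point
multiply), shifted by left_shift[i]/right_shift[i], offset by out_zero_bias,
and saturated to the quint8 range.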

Through experimentation, we observe that the kernel performs better when the
weight matrix is uniformly unrolled by a factor of 2 instead of 4 in the
8-byte-aligned case. We add a case for ROW_UNROLL=2 and VEC_UNROLL=2 in
xa_nnlib_matmul_unroll_macros.h. This code is similar to the ROW_UNROLL=4 and
VEC_UNROLL=2 code in
nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h.

General information about the code:
The HiFi4 xa_nn_matmul_asym8xasym8_asym8 kernel writes the code using macros,
which are expanded to HiFi4 intrinsics in
nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h.
The code handles the following cases:
1. When the two input matrices (p_mat1 and p_vec1) are aligned to an 8-byte
boundary, we do not need unaligned loads. In that case, 'chk_align' is true,
and the code unrolls p_mat1 by 4 and p_vec1 by 2.
2. If chk_align is false, the code unrolls both p_mat1 and p_vec1 by a factor
of 2, and uses macros that expand to unaligned loads via register priming
(e.g., LOAD_VEC_BATCH_ASYM8b_UNALIGNED).
3. If either p_mat1 or p_vec1 is nullptr, the code returns -1.

The choice of unrolling factors in the NNLib kernel is not controlled by the
user: it sets ROW_UNROLL to 4 by default. This choice is not governed by any
heuristics, and it is not clear that the performance degradation due to
unaligned loads/stores warrants the two branches in the code (the if/else on
chk_align).

Future modifications: In the future, if Tensilica provides a new version of the
xa_nn_matmul_asym8xasym8_asym8 kernel, the changes to this file should be
minimal: we just copy the entire function here, change the args for
out_multiplier and out_shift, and add the SETUP_SHIFT/UNROLL_ROW_SETUP_SHIFT
macros to get the right out_shift for each unrolled row.
*/

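/*
 * Matrix-times-multiple-vectors multiply for quint8 inputs with a quint8
 * output, following the NNLib xa_nn_matmul conventions for the
 * p_mat1/p_vec1/p_out layouts (row_stride1, vec_offset, out_stride,
 * out_offset). out_multiplier and out_shift are arrays so that the
 * requantization can be per output channel when per_channel_quantized is
 * true (see the comment above). Returns 0 on success, -1 if p_mat1 or
 * p_vec1 is nullptr.
 */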
WORD32 matmul_asym8uxasym8u_asym8u(
    UWORD8* __restrict__ p_out,
    const UWORD8* __restrict__ p_mat1,
    const UWORD8* __restrict__ p_vec1,
    const WORD32* __restrict__ p_bias,
    WORD32 rows,
    WORD32 cols1,
    WORD32 row_stride1,
    WORD32 vec_count,
    WORD32 vec_offset,
    WORD32 out_offset,
    WORD32 out_stride,
    WORD32 mat1_zero_bias,
    WORD32 vec1_zero_bias,
    const WORD32* __restrict__ out_multiplier,
    const WORD32* __restrict__ out_shift,
    WORD32 out_zero_bias,
    bool per_channel_quantized) {
  /* Iterators used in for loops */
  int m_itr, c_itr, vec_itr;
  /* Assign initial value so this value will be used in trailing loop */
  m_itr = 0;
  /* Shifts to match with Tensorflow */
  int left_shift[ROW_UNROLL] = {0}, right_shift[ROW_UNROLL] = {0};

#define UNROLL_ROW_SETUP_ACC_BATCH SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_ACC_BATCH SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_MAT1 SETUP_MAT1_ASYM8b
#define UNROLL_SETUP_VEC_BATCH SETUP_VEC_OFFSET_BATCH_ASYM8b
#define SETUP_BIAS SETUP_BIAS_ASYM8b
#define UNROLL_LOAD_VEC_BATCH LOAD_VEC_BATCH_ASYM8b
#define UNROLL_LOAD_ROW_MAT1 LOAD_ROW_MAT1_ASYM8b
#define LOAD_BIAS LOAD_BIAS_ASYM8b_MATMUL
#define UNROLL_ROW_KERNEL_MAT1_VEC_BATCH KERNEL_MAT1_VEC_BATCH_ROW_ASYM8b_ASYM8b
#define UNROLL_KERNEL_MAT1_VEC_BATCH KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b
#define UNROLL_ROW_ADD_BIAS_ACC \
  ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ADD_BIAS_ACC_BATCH \
  ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ROW_ADJUST_ACC ADJUST_ACC_BATCH_ROW_ASYM8b
#define UNROLL_ADJUST_ACC_BATCH ADJUST_ACC_BATCH_ASYM8b
#define UNROLL_ROW_STORE_ACC STORE_ACC_BATCH_ROW_ASYM8bxASYM8b_AT_OUT_ASYM8b
#define UNROLL_STORE_ACC_BATCH \
  STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b

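  /* Check whether p_mat1, p_vec1 and the corresponding strides satisfy the
   * alignment requirement of the fast (aligned-load) path. */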
  int chk_align = 0;
  CHK_MATMUL_ALIGN(
      chk_align, p_mat1, 1, p_vec1, 1, cols1, row_stride1, vec_offset, 4);

  if (chk_align) {
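    /* Aligned path: process VEC_UNROLL vectors at a time, with the rows of
     * p_mat1 unrolled by ROW_UNROLL. */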
    for (vec_itr = 0; vec_itr < (vec_count & ~(VEC_UNROLL - 1));
         vec_itr += VEC_UNROLL) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        SETUP_SHIFT;
        SETUP_ACC_BATCH;
        SETUP_VEC_BATCH;
        SETUP_MAT1;

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          LOAD_VEC_BATCH;
          LOAD_MAT1;
          KERNEL_MAT1_VEC_BATCH;
        }

        ADD_BIAS_ACC_BATCH;
        ADJUST_ACC_BATCH;
        STORE_ACC_BATCH;
      }

#pragma no_unroll
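      /* Tail loop for the rows not covered by ROW_UNROLL */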
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_ROW_SETUP_ACC_BATCH(0);
        SETUP_VEC_BATCH;
        UNROLL_SETUP_MAT1(0);

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          LOAD_VEC_BATCH;
          UNROLL_LOAD_ROW_MAT1(0);
          UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(0);
        }

        UNROLL_ROW_ADD_BIAS_ACC(0);
        UNROLL_ROW_ADJUST_ACC(0);
        UNROLL_ROW_STORE_ACC(0);
      }
    }
    /* Tail loop for vec unroll */
    for (; vec_itr < vec_count; vec_itr++) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        SETUP_SHIFT;
        SETUP_ACC_BATCH_TAIL;
        UNROLL_SETUP_VEC_BATCH(0);
        SETUP_MAT1;

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          UNROLL_LOAD_VEC_BATCH(0);
          LOAD_MAT1;
          KERNEL_MAT1_VEC_BATCH_TAIL;
        }

        ADD_BIAS_ACC_BATCH_TAIL;
        ADJUST_ACC_BATCH_TAIL;
        STORE_ACC_BATCH_TAIL;
      }

#pragma no_unroll
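      /* Tail loop for the remaining rows (single vector) */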
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_VEC_BATCH(0);
        UNROLL_SETUP_MAT1(0);

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          UNROLL_LOAD_VEC_BATCH(0);
          UNROLL_LOAD_ROW_MAT1(0);
          UNROLL_KERNEL_MAT1_VEC_BATCH(0, 0);
        }

        LOAD_BIAS;
        UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
        UNROLL_ADJUST_ACC_BATCH(0, 0);
        UNROLL_STORE_ACC_BATCH(0, 0);
      }
    }

/* Undefining the defined macros to make them available for reuse */
#undef UNROLL_ROW_SETUP_ACC_BATCH
#undef UNROLL_SETUP_ACC_BATCH
#undef UNROLL_SETUP_MAT1
#undef UNROLL_SETUP_VEC_BATCH
#undef SETUP_BIAS
#undef SETUP_SHIFT
#undef UNROLL_LOAD_VEC_BATCH
#undef UNROLL_LOAD_ROW_MAT1
#undef LOAD_BIAS
#undef UNROLL_ROW_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_ROW_ADD_BIAS_ACC
#undef UNROLL_ADD_BIAS_ACC_BATCH
#undef UNROLL_ROW_ADJUST_ACC
#undef UNROLL_ADJUST_ACC_BATCH
#undef UNROLL_ROW_STORE_ACC
#undef UNROLL_STORE_ACC_BATCH
#undef VEC_UNROLL
#undef ROW_UNROLL
  } else if (p_mat1 && p_vec1) {
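    /* Unaligned path: unroll both p_mat1 rows and p_vec1 vectors by 2 and use
     * unaligned loads via register priming. */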
#define ROW_UNROLL 2
#define VEC_UNROLL 2
#define UNROLL_ROW_SETUP_ACC_BATCH SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_ACC_BATCH SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b
#define SETUP_BIAS SETUP_BIAS_ASYM8b
#define LOAD_BIAS LOAD_BIAS_ASYM8b_MATMUL
#define UNROLL_ROW_ADD_BIAS_ACC \
  ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ADD_BIAS_ACC_BATCH \
  ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ROW_ADJUST_ACC ADJUST_ACC_BATCH_ROW_ASYM8b
#define UNROLL_ADJUST_ACC_BATCH ADJUST_ACC_BATCH_ASYM8b
    for (vec_itr = 0; vec_itr < (vec_count & ~(VEC_UNROLL - 1));
         vec_itr += VEC_UNROLL) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_ROW_SETUP_SHIFT(1);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_ACC_BATCH(0, 1);
        UNROLL_SETUP_ACC_BATCH(1, 0);
        UNROLL_SETUP_ACC_BATCH(1, 1);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(1);
        SETUP_MAT1_ASYM8b_UNALIGNED(0);
        SETUP_MAT1_ASYM8b_UNALIGNED(1);

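        /* Multiply-accumulate over the columns in groups of 4; leftover
         * columns are handled one element at a time below. */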
        int cols1_count = cols1 - cols1 % 4;
        for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 1);
        }
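        /* Remainder columns (cols1 not a multiple of 4) */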
#pragma no_unroll
        for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 1);
        }

        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(0);
        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(1);
        ADJUST_ACC_BATCH_ROW_ASYM8b(0);
        ADJUST_ACC_BATCH_ROW_ASYM8b(1);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 1);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 1);
      }
      // Remaining row
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_ACC_BATCH(0, 1);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(1);
        SETUP_MAT1_ASYM8b_UNALIGNED(0);
        int cols1_count = cols1 - cols1 % 4;

        for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 1);
        }
#pragma no_unroll
        for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 1);
        }
        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(0);
        ADJUST_ACC_BATCH_ROW_ASYM8b(0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 1);
      }
    }
    {
      /* Tail loop for vec unroll */
      for (; vec_itr < vec_count; vec_itr++) {
        SETUP_BIAS;
        for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1));
             m_itr += ROW_UNROLL) {
          UNROLL_ROW_SETUP_SHIFT(0);
          UNROLL_ROW_SETUP_SHIFT(1);
          UNROLL_SETUP_ACC_BATCH(0, 0);
          UNROLL_SETUP_ACC_BATCH(1, 0);
          SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(1);
          int cols1_count = cols1 - cols1 % 4;

          for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(1);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 0);
          }
#pragma no_unroll
          for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(1);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 0);
          }

          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
          UNROLL_ADJUST_ACC_BATCH(0, 0);
          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(1, 0);
          UNROLL_ADJUST_ACC_BATCH(1, 0);

          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 0);
        }

        for (; m_itr < rows; m_itr++) {
          UNROLL_ROW_SETUP_SHIFT(0);
          UNROLL_SETUP_ACC_BATCH(0, 0);
          SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(0);
          int cols1_count = cols1 - cols1 % 4;

          for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          }
#pragma no_unroll
          for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          }

          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
          UNROLL_ADJUST_ACC_BATCH(0, 0);
          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        }
      }
    }
  } else {
    return -1;
  }

#undef UNROLL_ROW_SETUP_ACC_BATCH
#undef UNROLL_SETUP_ACC_BATCH
#undef UNROLL_SETUP_MAT1
#undef UNROLL_SETUP_VEC_BATCH
#undef SETUP_BIAS
#undef SETUP_SHIFT
#undef UNROLL_LOAD_VEC_BATCH
#undef UNROLL_LOAD_ROW_MAT1
#undef LOAD_BIAS
#undef UNROLL_ROW_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_ROW_ADD_BIAS_ACC
#undef UNROLL_ADD_BIAS_ACC_BATCH
#undef UNROLL_ROW_ADJUST_ACC
#undef UNROLL_ADJUST_ACC_BATCH
#undef UNROLL_ROW_STORE_ACC
#undef UNROLL_STORE_ACC_BATCH
#undef VEC_UNROLL
#undef ROW_UNROLL

  return 0;
}

}; // namespace kernels
}; // namespace HiFi
}; // namespace impl
}; // namespace cadence