/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/*******************************************************************************
* Copyright (c) 2018-2023 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to use this Software with Cadence processor cores only and
* not with any other processors and platforms, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

******************************************************************************/

#include "xa_nnlib_common.h"
#include "xa_type_def.h"

#ifdef ROW_UNROLL
#undef ROW_UNROLL
#endif
#define ROW_UNROLL 4

#include "xa_nnlib_common_macros.h"

/* Include the asym8uxasym8u_asym8u macros */
#include "matmul_asym8uxasym8u_asym8u_macros.h"

/*----------------------------Main function---------------------------------*/

namespace cadence {
namespace impl {
namespace HiFi {
namespace kernels {
/*
The following function is copied from xa_nn_matmul_asym8xasym8_asym8 in
xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matmul_asym8xasym8.c.

xa_nn_matmul_asym8xasym8_asym8 multiplies two quint8 matrices, and requantizes
the result to produce a quint8 output. However, it has two limitations:
1. It only works for per-tensor quantized weights.
2. It forces the weight rows to be unrolled by 4 when both input and weight are
aligned to an 8-byte boundary.

We modify xa_nn_matmul_asym8xasym8_asym8 to also allow per-channel quantized
weights. To do so, we make the following two changes:
1. out_multiplier and out_shift are now arrays instead of scalars. Apart from
the function args, we add a new macro (UNROLL_ROW_SETUP_SHIFT) which computes
the right out_shift for each channel (i.e., unrolled row of the weight), and
stores it in the appropriate index of left_shift[ROW_UNROLL] and
right_shift[ROW_UNROLL].
2. We modify the ADJUST_ACC_BATCH_ASYM8b macro so that it picks up the right
out_multiplier and out_shift for the accumulation corresponding to each channel
(i.e., unrolled row of the weight); see the sketch below.
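
To make this concrete, here is a rough scalar sketch of what
UNROLL_ROW_SETUP_SHIFT and the modified ADJUST_ACC_BATCH_ASYM8b amount to for
the accumulator of unrolled row i (hypothetical reference code, not the actual
HiFi4 intrinsic implementation; the per-channel indexing is the assumption
being illustrated):

  WORD32 ch      = per_channel_quantized ? (m_itr + i) : 0;
  WORD32 mult    = out_multiplier[ch];
  WORD32 shift   = out_shift[ch];
  left_shift[i]  = shift > 0 ? shift : 0;   // applied before the multiply
  right_shift[i] = shift > 0 ? 0 : -shift;  // applied, with rounding, after

The accumulator for row i is then scaled by mult (rounding fixed-point
multiply), shifted by left_shift[i]/right_shift[i], offset by out_zero_bias,
and saturated to the quint8 range.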

Through experimentation, we observe that the kernel performs better when the
weight matrix is uniformly unrolled by a factor of 2 instead of 4 in the
8-byte-aligned case. We add a case for ROW_UNROLL=2 and VEC_UNROLL=2 in
xa_nnlib_matmul_unroll_macros.h. This code is similar to the ROW_UNROLL=4 and
VEC_UNROLL=2 code in
nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h.

General information about the code:
The HiFi4 xa_nn_matmul_asym8xasym8_asym8 kernel writes the code using macros,
which are expanded to HiFi4 intrinsics in
nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h.
The code handles the following cases:
1. When the two input matrices (p_mat1 and p_vec1) are aligned to an 8-byte
boundary, we do not need unaligned loads. In that case, 'chk_align' is true,
and the code unrolls p_mat1 by 4 and p_vec1 by 2.
2. If chk_align is false, the code unrolls both p_mat1 and p_vec1 by a factor
of 2, and uses macros that expand to unaligned loads via register priming
(e.g., LOAD_VEC_BATCH_ASYM8b_UNALIGNED).
3. If either p_mat1 or p_vec1 is nullptr, the code returns -1.

The choice of unrolling factors in the NNLib kernel is not controlled by the
user: it sets ROW_UNROLL to 4 by default. This choice is not governed by any
heuristics, and it is not clear that the performance degradation due to
unaligned loads/stores warrants the two branches in the code (the if/else on
chk_align).

Future modifications: In the future, if Tensilica provides a new version of the
xa_nn_matmul_asym8xasym8_asym8 kernel, the changes to this file should be
minimal: we just copy the entire function here, change the args for
out_multiplier and out_shift, and add the SETUP_SHIFT/UNROLL_ROW_SETUP_SHIFT
macros to get the right out_shift for each unrolled row.
*/

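/*
 * Matrix-times-multiple-vectors multiply for quint8 inputs with a quint8
 * output, following the NNLib xa_nn_matmul conventions for the
 * p_mat1/p_vec1/p_out layouts (row_stride1, vec_offset, out_stride,
 * out_offset). out_multiplier and out_shift are arrays so that the
 * requantization can be per output channel when per_channel_quantized is
 * true (see the comment above). Returns 0 on success, -1 if p_mat1 or
 * p_vec1 is nullptr.
 */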
WORD32 matmul_asym8uxasym8u_asym8u(
    UWORD8* __restrict__ p_out,
    const UWORD8* __restrict__ p_mat1,
    const UWORD8* __restrict__ p_vec1,
    const WORD32* __restrict__ p_bias,
    WORD32 rows,
    WORD32 cols1,
    WORD32 row_stride1,
    WORD32 vec_count,
    WORD32 vec_offset,
    WORD32 out_offset,
    WORD32 out_stride,
    WORD32 mat1_zero_bias,
    WORD32 vec1_zero_bias,
    const WORD32* __restrict__ out_multiplier,
    const WORD32* __restrict__ out_shift,
    WORD32 out_zero_bias,
    bool per_channel_quantized) {
  /* Iterators used in for loops */
  int m_itr, c_itr, vec_itr;
  /* Assign initial value so this value will be used in trailing loop */
  m_itr = 0;
  /* Shifts to match with Tensorflow */
  int left_shift[ROW_UNROLL] = {0}, right_shift[ROW_UNROLL] = {0};

#define UNROLL_ROW_SETUP_ACC_BATCH SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_ACC_BATCH SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_MAT1 SETUP_MAT1_ASYM8b
#define UNROLL_SETUP_VEC_BATCH SETUP_VEC_OFFSET_BATCH_ASYM8b
#define SETUP_BIAS SETUP_BIAS_ASYM8b
#define UNROLL_LOAD_VEC_BATCH LOAD_VEC_BATCH_ASYM8b
#define UNROLL_LOAD_ROW_MAT1 LOAD_ROW_MAT1_ASYM8b
#define LOAD_BIAS LOAD_BIAS_ASYM8b_MATMUL
#define UNROLL_ROW_KERNEL_MAT1_VEC_BATCH KERNEL_MAT1_VEC_BATCH_ROW_ASYM8b_ASYM8b
#define UNROLL_KERNEL_MAT1_VEC_BATCH KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b
#define UNROLL_ROW_ADD_BIAS_ACC \
  ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ADD_BIAS_ACC_BATCH \
  ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ROW_ADJUST_ACC ADJUST_ACC_BATCH_ROW_ASYM8b
#define UNROLL_ADJUST_ACC_BATCH ADJUST_ACC_BATCH_ASYM8b
#define UNROLL_ROW_STORE_ACC STORE_ACC_BATCH_ROW_ASYM8bxASYM8b_AT_OUT_ASYM8b
#define UNROLL_STORE_ACC_BATCH \
  STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b

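  /* Check whether p_mat1, p_vec1 and the corresponding strides satisfy the
   * alignment requirement of the fast (aligned-load) path. */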
  int chk_align = 0;
  CHK_MATMUL_ALIGN(
      chk_align, p_mat1, 1, p_vec1, 1, cols1, row_stride1, vec_offset, 4);

  if (chk_align) {
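    /* Aligned path: process VEC_UNROLL vectors at a time, with the rows of
     * p_mat1 unrolled by ROW_UNROLL. */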
    for (vec_itr = 0; vec_itr < (vec_count & ~(VEC_UNROLL - 1));
         vec_itr += VEC_UNROLL) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        SETUP_SHIFT;
        SETUP_ACC_BATCH;
        SETUP_VEC_BATCH;
        SETUP_MAT1;

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          LOAD_VEC_BATCH;
          LOAD_MAT1;
          KERNEL_MAT1_VEC_BATCH;
        }

        ADD_BIAS_ACC_BATCH;
        ADJUST_ACC_BATCH;
        STORE_ACC_BATCH;
      }

#pragma no_unroll
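      /* Tail loop for the rows not covered by ROW_UNROLL */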
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_ROW_SETUP_ACC_BATCH(0);
        SETUP_VEC_BATCH;
        UNROLL_SETUP_MAT1(0);

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          LOAD_VEC_BATCH;
          UNROLL_LOAD_ROW_MAT1(0);
          UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(0);
        }

        UNROLL_ROW_ADD_BIAS_ACC(0);
        UNROLL_ROW_ADJUST_ACC(0);
        UNROLL_ROW_STORE_ACC(0);
      }
    }
    /* Tail loop for vec unroll */
    for (; vec_itr < vec_count; vec_itr++) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        SETUP_SHIFT;
        SETUP_ACC_BATCH_TAIL;
        UNROLL_SETUP_VEC_BATCH(0);
        SETUP_MAT1;

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          UNROLL_LOAD_VEC_BATCH(0);
          LOAD_MAT1;
          KERNEL_MAT1_VEC_BATCH_TAIL;
        }

        ADD_BIAS_ACC_BATCH_TAIL;
        ADJUST_ACC_BATCH_TAIL;
        STORE_ACC_BATCH_TAIL;
      }

#pragma no_unroll
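      /* Tail loop for the remaining rows (single vector) */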
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_VEC_BATCH(0);
        UNROLL_SETUP_MAT1(0);

        for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
          UNROLL_LOAD_VEC_BATCH(0);
          UNROLL_LOAD_ROW_MAT1(0);
          UNROLL_KERNEL_MAT1_VEC_BATCH(0, 0);
        }

        LOAD_BIAS;
        UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
        UNROLL_ADJUST_ACC_BATCH(0, 0);
        UNROLL_STORE_ACC_BATCH(0, 0);
      }
    }

/* Undefining the defined macros to make them available for reuse */
#undef UNROLL_ROW_SETUP_ACC_BATCH
#undef UNROLL_SETUP_ACC_BATCH
#undef UNROLL_SETUP_MAT1
#undef UNROLL_SETUP_VEC_BATCH
#undef SETUP_BIAS
#undef SETUP_SHIFT
#undef UNROLL_LOAD_VEC_BATCH
#undef UNROLL_LOAD_ROW_MAT1
#undef LOAD_BIAS
#undef UNROLL_ROW_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_ROW_ADD_BIAS_ACC
#undef UNROLL_ADD_BIAS_ACC_BATCH
#undef UNROLL_ROW_ADJUST_ACC
#undef UNROLL_ADJUST_ACC_BATCH
#undef UNROLL_ROW_STORE_ACC
#undef UNROLL_STORE_ACC_BATCH
#undef VEC_UNROLL
#undef ROW_UNROLL
  } else if (p_mat1 && p_vec1) {
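    /* Unaligned path: unroll both p_mat1 rows and p_vec1 vectors by 2 and use
     * unaligned loads via register priming. */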
#define ROW_UNROLL 2
#define VEC_UNROLL 2
#define UNROLL_ROW_SETUP_ACC_BATCH SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b
#define UNROLL_SETUP_ACC_BATCH SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b
#define SETUP_BIAS SETUP_BIAS_ASYM8b
#define LOAD_BIAS LOAD_BIAS_ASYM8b_MATMUL
#define UNROLL_ROW_ADD_BIAS_ACC \
  ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ADD_BIAS_ACC_BATCH \
  ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b_MATMUL
#define UNROLL_ROW_ADJUST_ACC ADJUST_ACC_BATCH_ROW_ASYM8b
#define UNROLL_ADJUST_ACC_BATCH ADJUST_ACC_BATCH_ASYM8b
    for (vec_itr = 0; vec_itr < (vec_count & ~(VEC_UNROLL - 1));
         vec_itr += VEC_UNROLL) {
      SETUP_BIAS;
      for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_ROW_SETUP_SHIFT(1);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_ACC_BATCH(0, 1);
        UNROLL_SETUP_ACC_BATCH(1, 0);
        UNROLL_SETUP_ACC_BATCH(1, 1);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(1);
        SETUP_MAT1_ASYM8b_UNALIGNED(0);
        SETUP_MAT1_ASYM8b_UNALIGNED(1);

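        /* Multiply-accumulate over the columns in groups of 4; leftover
         * columns are handled one element at a time below. */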
        int cols1_count = cols1 - cols1 % 4;
        for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 1);
        }
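        /* Remainder columns (cols1 not a multiple of 4) */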
#pragma no_unroll
        for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 1);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 1);
        }

        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(0);
        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(1);
        ADJUST_ACC_BATCH_ROW_ASYM8b(0);
        ADJUST_ACC_BATCH_ROW_ASYM8b(1);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 1);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 1);
      }
      // Remaining row
      for (; m_itr < rows; m_itr++) {
        UNROLL_ROW_SETUP_SHIFT(0);
        UNROLL_SETUP_ACC_BATCH(0, 0);
        UNROLL_SETUP_ACC_BATCH(0, 1);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
        SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(1);
        SETUP_MAT1_ASYM8b_UNALIGNED(0);
        int cols1_count = cols1 - cols1 % 4;

        for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 1);
        }
#pragma no_unroll
        for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
          LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(1);
          LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 1);
        }
        ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(0);
        ADJUST_ACC_BATCH_ROW_ASYM8b(0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 1);
      }
    }
    {
      /* Tail loop for vec unroll */
      for (; vec_itr < vec_count; vec_itr++) {
        SETUP_BIAS;
        for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1));
             m_itr += ROW_UNROLL) {
          UNROLL_ROW_SETUP_SHIFT(0);
          UNROLL_ROW_SETUP_SHIFT(1);
          UNROLL_SETUP_ACC_BATCH(0, 0);
          UNROLL_SETUP_ACC_BATCH(1, 0);
          SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(1);
          int cols1_count = cols1 - cols1 % 4;

          for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(1);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(1, 0);
          }
#pragma no_unroll
          for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(1);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(1, 0);
          }

          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
          UNROLL_ADJUST_ACC_BATCH(0, 0);
          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(1, 0);
          UNROLL_ADJUST_ACC_BATCH(1, 0);

          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(1, 0);
        }

        for (; m_itr < rows; m_itr++) {
          UNROLL_ROW_SETUP_SHIFT(0);
          UNROLL_SETUP_ACC_BATCH(0, 0);
          SETUP_VEC_OFFSET_BATCH_ASYM8b_UNALIGNED(0);
          SETUP_MAT1_ASYM8b_UNALIGNED(0);
          int cols1_count = cols1 - cols1 % 4;

          for (c_itr = 0; c_itr < (cols1_count >> 2); c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_UNALIGNED(0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(0, 0);
          }
#pragma no_unroll
          for (c_itr = cols1_count; c_itr < cols1; c_itr++) {
            LOAD_VEC_BATCH_ASYM8b_SINGLE_UNALIGNED(0);
            LOAD_ROW_MAT1_ASYM8b_SINGLE_UNALIGNED(0);
            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b_SINGLE_UNALIGNED(0, 0);
          }

          LOAD_BIAS;
          UNROLL_ADD_BIAS_ACC_BATCH(0, 0);
          UNROLL_ADJUST_ACC_BATCH(0, 0);
          STORE_STRIDE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(0, 0);
        }
      }
    }
  } else {
    return -1;
  }

#undef UNROLL_ROW_SETUP_ACC_BATCH
#undef UNROLL_SETUP_ACC_BATCH
#undef UNROLL_SETUP_MAT1
#undef UNROLL_SETUP_VEC_BATCH
#undef SETUP_BIAS
#undef SETUP_SHIFT
#undef UNROLL_LOAD_VEC_BATCH
#undef UNROLL_LOAD_ROW_MAT1
#undef LOAD_BIAS
#undef UNROLL_ROW_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_KERNEL_MAT1_VEC_BATCH
#undef UNROLL_ROW_ADD_BIAS_ACC
#undef UNROLL_ADD_BIAS_ACC_BATCH
#undef UNROLL_ROW_ADJUST_ACC
#undef UNROLL_ADJUST_ACC_BATCH
#undef UNROLL_ROW_STORE_ACC
#undef UNROLL_STORE_ACC_BATCH
#undef VEC_UNROLL
#undef ROW_UNROLL

  return 0;
}

}; // namespace kernels
}; // namespace HiFi
}; // namespace impl
}; // namespace cadence