1 /*
2  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_convolve_HWC_q15_basic.c
22  * Description:  Q15 version of convolution
23  *
24  * $Date:        17. January 2018
25  * $Revision:    V.1.0.0
26  *
27  * Target Processor:  Cortex-M cores
28  *
29  * -------------------------------------------------------------------- */
30 
31 #include "arm_math.h"
32 #include "arm_nnfunctions.h"
33 
34 /**
35  *  @ingroup groupNN
36  */
37 
38 /**
39  * @addtogroup NNConv
40  * @{
41  */
42 
43   /**
44    * @brief Basic Q15 convolution function
45    * @param[in]       Im_in       pointer to input tensor
46    * @param[in]       dim_im_in   input tensor dimension
47    * @param[in]       ch_im_in    number of input tensor channels
48    * @param[in]       wt          pointer to kernel weights
49    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
50    * @param[in]       dim_kernel  filter kernel size
51    * @param[in]       padding     padding sizes
52    * @param[in]       stride      convolution stride
53    * @param[in]       bias        pointer to bias
54    * @param[in]       bias_shift  amount of left-shift for bias
55    * @param[in]       out_shift   amount of right-shift for output
56    * @param[in,out]   Im_out      pointer to output tensor
57    * @param[in]       dim_im_out  output tensor dimension
58    * @param[in,out]   bufferA     pointer to buffer space for input
59    * @param[in,out]   bufferB     pointer to buffer space for output
60    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
61    *
62    * @details
63    *
64    * <b>Buffer size:</b>
65    *
66    * bufferA size: ch_im_in*dim_kernel*dim_kernel
67    *
68    * bufferB size: 0
69    *
70    * This basic version is designed to work for any input tensor and weight
71    * dimension.
72    */
73 
74 arm_status
arm_convolve_HWC_q15_basic(const q15_t * Im_in,const uint16_t dim_im_in,const uint16_t ch_im_in,const q15_t * wt,const uint16_t ch_im_out,const uint16_t dim_kernel,const uint16_t padding,const uint16_t stride,const q15_t * bias,const uint16_t bias_shift,const uint16_t out_shift,q15_t * Im_out,const uint16_t dim_im_out,q15_t * bufferA,q7_t * bufferB)75 arm_convolve_HWC_q15_basic(const q15_t * Im_in,
76                            const uint16_t dim_im_in,
77                            const uint16_t ch_im_in,
78                            const q15_t * wt,
79                            const uint16_t ch_im_out,
80                            const uint16_t dim_kernel,
81                            const uint16_t padding,
82                            const uint16_t stride,
83                            const q15_t * bias,
84                            const uint16_t bias_shift,
85                            const uint16_t out_shift,
86                            q15_t * Im_out,
87                            const uint16_t dim_im_out,
88                            q15_t * bufferA,
89                            q7_t * bufferB)
90 {
91 
92 #if defined (ARM_MATH_DSP)
93     /* Run the following code for Cortex-M4 and Cortex-M7 */
94 
95     int16_t   i_out_y, i_out_x, i_ker_y, i_ker_x;
96 
97     uint16_t  im2col_out_pixel_index = 0;
98     q15_t    *pBuffer = bufferA;
99     q15_t    *pOut = Im_out;
100     q15_t    *im_buffer = bufferA;
101     const q15_t *pA;
102     int       i;
103 
104     /* This part implements the im2col function */
105     for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
106     {
107         for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
108         {
109             for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
110             {
111                 for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
112                 {
113                     if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
114                     {
115                         /* Filling 0 for out-of-bound paddings */
116                         /* arm_fill_q15(0, pBuffer, ch_im_in); */
117                         memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
118                     } else
119                     {
120                         /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */
121                         memcpy(pBuffer, (q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, sizeof(q15_t)*ch_im_in);
122                     }
123                     pBuffer += ch_im_in;
124                 }
125             }
126 
127             pA = wt;
128             for (i = 0; i < ch_im_out; i++)
129             {
130                 q31_t     sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
131                 q15_t    *pB = im_buffer;
132                 uint16_t  colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
133                 while (colCnt)
134                 {
135                     q31_t     inA1 = *__SIMD32(pA)++;
136                     q31_t     inB1 = *__SIMD32(pB)++;
137                     q31_t     inA2 = *__SIMD32(pA)++;
138                     q31_t     inB2 = *__SIMD32(pB)++;
139 
140                     sum = __SMLAD(inA1, inB1, sum);
141                     sum = __SMLAD(inA2, inB2, sum);
142 
143                     colCnt--;
144                 }
145                 colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
146                 while (colCnt)
147                 {
148                     q15_t     inA1 = *pA++;
149                     q15_t     inB1 = *pB++;
150                     sum += inA1 * inB1;
151                     colCnt--;
152                 }
153                 *pOut = (q15_t) __SSAT((sum >> out_shift), 16);
154                 pOut++;
155             }
156 
157             /* counter reset */
158             pBuffer = im_buffer;
159             im2col_out_pixel_index++;
160         }
161     }
162 
163 #else
164     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
165     uint16_t  i, j, k, l, m, n;
166     int       conv_out;
167     signed char in_row, in_col;
168 
169     for (i = 0; i < ch_im_out; i++)
170     {
171         for (j = 0; j < dim_im_out; j++)
172         {
173             for (k = 0; k < dim_im_out; k++)
174             {
175                 conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
176                 for (m = 0; m < dim_kernel; m++)
177                 {
178                     for (n = 0; n < dim_kernel; n++)
179                     {
180                         in_row = stride * j + m - padding;
181                         in_col = stride * k + n - padding;
182                         if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
183                         {
184                             for (l = 0; l < ch_im_in; l++)
185                             {
186                                 conv_out +=
187                                     Im_in[(in_row * dim_im_in + in_col) * ch_im_in +
188                                           l] * wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel +
189                                                                                             n) * ch_im_in + l];
190                             }
191                         }
192                     }
193                 }
194                 Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t) __SSAT((conv_out >> out_shift), 16);
195             }
196         }
197     }
198 
199 #endif                          /* ARM_MATH_DSP */
200 
201     /* Return to application */
202     return ARM_MATH_SUCCESS;
203 }
204 
205 /**
206  * @} end of NNConv group
207  */
208