/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q7_fast_nonsquare.c
 * Description:  Fast Q7 version of convolution (non-square shape)
 *
 * $Date:        17. January 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_math.h"
#include "arm_nnfunctions.h"

/**
 *  @ingroup groupNN
 */

/**
 * @addtogroup NNConv
 * @{
 */

/**
 * @brief Fast Q7 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 4
 *   ch_im_out is a multiple of 2
 */

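/*
 * Usage sketch (illustrative only; all sizes below are assumed values, not
 * from the original source). The caller must size bufferA for at least
 * 2 * ch_im_in * dim_kernel_x * dim_kernel_y q15_t entries; bufferB is not
 * used by this implementation and may be NULL.
 *
 *   #define IM_X   32
 *   #define IM_Y   16
 *   #define CH_IN   4         // must be a multiple of 4
 *   #define CH_OUT  8         // must be a multiple of 2
 *   #define KER_X   3
 *   #define KER_Y   3
 *
 *   static q7_t  im_in[IM_X * IM_Y * CH_IN];
 *   static q7_t  weights[CH_OUT * CH_IN * KER_X * KER_Y];
 *   static q7_t  conv_bias[CH_OUT];
 *   static q7_t  im_out[IM_X * IM_Y * CH_OUT];
 *   static q15_t buffer_a[2 * CH_IN * KER_X * KER_Y];
 *
 *   // stride 1 with padding 1 keeps the output the same size as the input
 *   arm_status status = arm_convolve_HWC_q7_fast_nonsquare(
 *       im_in, IM_X, IM_Y, CH_IN, weights, CH_OUT, KER_X, KER_Y,
 *       1, 1, 1, 1, conv_bias, 0, 7, im_out, IM_X, IM_Y, buffer_a, NULL);
 */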
arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
                                              const uint16_t dim_im_in_x,
                                              const uint16_t dim_im_in_y,
                                              const uint16_t ch_im_in,
                                              const q7_t * wt,
                                              const uint16_t ch_im_out,
                                              const uint16_t dim_kernel_x,
                                              const uint16_t dim_kernel_y,
                                              const uint16_t padding_x,
                                              const uint16_t padding_y,
                                              const uint16_t stride_x,
                                              const uint16_t stride_y,
                                              const q7_t * bias,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              q7_t * Im_out,
                                              const uint16_t dim_im_out_x,
                                              const uint16_t dim_im_out_y,
                                              q15_t * bufferA,
                                              q7_t * bufferB)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    int16_t   i_out_y, i_out_x, i_ker_y, i_ker_x;

    /* -----------------------
     *  Here bufferA is used as a q15_t buffer internally, since the computation
     *  is done at q15_t precision. im2col expands the q7_t input into q15_t
     *  format; the "reordered" variants interleave the entries so that the
     *  SIMD multiply-accumulate (__SMLAD) can consume them directly.
     */

    q15_t    *pBuffer = bufferA;
    q7_t     *pOut = Im_out;

    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimensions meet the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

    /*
     *  Here we split the output rows into three regions depending on the padding situation:
     *    Top: i_out_y from 0 to padding_y - 1
     * Middle: i_out_y from padding_y to dim_im_out_y - padding_y - 1
     * Bottom: i_out_y from dim_im_out_y - padding_y to dim_im_out_y - 1
     */
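    /*
     * For example (values assumed for illustration): with dim_im_out_y = 8
     * and padding_y = 1, top covers row 0, middle covers rows 1..6, and
     * bottom covers row 7.
     */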
    /* top part */
    for (i_out_y = 0; i_out_y < padding_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
                                                         pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

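            /* when bufferA is full it holds the im2col data for two output
             * pixels; the matrix-multiplication kernel then computes two
             * output columns in a single call */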
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut =
                    arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
                                                  bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

    /* middle part, here we also divide the x dimension into left, mid and right */
    for (; i_out_y < dim_im_out_y - padding_y; i_out_y++)
    {

        /* left part */
        for (i_out_x = 0; i_out_x < padding_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
                                                         pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut =
                    arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
                                                  bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }

        /* mid part */
        for (; i_out_x < dim_im_out_x - padding_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in +
                                                 (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in,
                                                 pBuffer, ch_im_in * dim_kernel_x);
                pBuffer += ch_im_in * dim_kernel_x;
            }

            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut =
                    arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
                                                  bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }

        /* right part */
        for (; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
                                                         pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut =
                    arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
                                                  bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

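    /* bottom part */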
    for (; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
                                                         pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut =
                    arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
                                                  bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

    /* check if there is a leftover im2col column to compute */
    if (pBuffer != bufferA)
    {
        const q7_t *pA = wt;
        int       i;
        for (i = 0; i < ch_im_out; i++)
        {
            q31_t     sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
            q15_t    *pB = bufferA;
            /* each iteration of the loop below processes 4 entries */
            uint16_t  colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;

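            /* each __SMLAD below performs two 16-bit multiply-accumulates,
             * so one pass of the loop consumes four q15_t entries */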
            while (colCnt)
            {

                q31_t     inA1, inA2;
                q31_t     inB1, inB2;

                pA = (const q7_t *)read_and_pad_reordered((void *)pA, &inA1, &inA2);

                inB1 = *__SIMD32(pB)++;
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = *__SIMD32(pB)++;
                sum = __SMLAD(inA2, inB2, sum);

                colCnt--;
            }
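            /* process the remaining 0..3 entries with scalar multiply-accumulates */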
            colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3;
            while (colCnt)
            {
                q7_t      inA1 = *pA++;
                q15_t     inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }
            *pOut = (q7_t) __SSAT((sum >> out_shift), 8);
            pOut++;

        }

    }

#else
    /* Run the following code as the reference implementation for Cortex-M0 and Cortex-M3 */
    int       i, j, k, l, m, n;
    int       conv_out;
    int       in_row, in_col;

    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimensions meet the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

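    /* straightforward nested-loop convolution: for each output channel and
     * each output pixel, accumulate products over the kernel window and the
     * input channels, then shift and saturate the result back to q7_t */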
    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out_y; j++)
        {
            for (k = 0; k < dim_im_out_x; k++)
            {
                conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel_y; m++)
                {
                    for (n = 0; n < dim_kernel_x; n++)
                    {
                        /* if-for implementation */
                        in_row = stride_y * j + m - padding_y;
                        in_col = stride_x * k + n - padding_x;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
            }
        }
    }

#endif                          /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}

/**
 * @} end of NNConv group
 */