/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_HWC_q7_fast.c
 * Description:  Fast Q7 version of convolution
 *
 * $Date:        17. January 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor: Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_math.h"
#include "arm_nnfunctions.h"

/**
 * @ingroup groupNN
 */

/**
 * @addtogroup NNConv
 * @{
 */

/**
 * @brief Fast Q7 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in is a multiple of 4 (because of the SIMD32 read and swap)
 *
 * ch_im_out is a multiple of 2 (because of the 2x2 mat_mult kernel)
 *
 * The im2col step converts the Q7 tensor input into Q15 columns, which are stored in
 * bufferA. Reordering happens during this im2col process via
 * arm_q7_to_q15_reordered_no_shift: within every group of four elements, the second and
 * third elements are swapped.
 *
 * The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered then performs the
 * GEMM computation on the reordered columns.
 *
 * To speed up the evaluation of the padding condition, the computation is split
 * into 3x3 parts, i.e., {top, mid, bottom} x {left, mid, right}.
 * This reduces the total number of boundary condition checks and improves
 * the data copying performance.
 */
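/*
 * Illustrative usage sketch (not part of the library). The names IN_DIM, CH_IN,
 * CH_OUT, KERNEL, PADDING, STRIDE, OUT_DIM, BIAS_SHIFT, OUT_SHIFT and the
 * input/weights/biases arrays are placeholders assumed for this example.
 * bufferA must hold two im2col columns, i.e. 2*ch_im_in*dim_kernel*dim_kernel
 * q15_t entries; bufferB is not used by this function and may be NULL.
 *
 *   static q15_t col_buffer[2 * CH_IN * KERNEL * KERNEL];
 *   static q7_t  output[OUT_DIM * OUT_DIM * CH_OUT];
 *
 *   arm_status status = arm_convolve_HWC_q7_fast(input, IN_DIM, CH_IN,
 *                                                weights, CH_OUT, KERNEL,
 *                                                PADDING, STRIDE,
 *                                                biases, BIAS_SHIFT, OUT_SHIFT,
 *                                                output, OUT_DIM,
 *                                                col_buffer, NULL);
 *
 * ARM_MATH_SIZE_MISMATCH is returned when CH_IN is not a multiple of 4 or
 * CH_OUT is not a multiple of 2; otherwise ARM_MATH_SUCCESS.
 */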

arm_status
arm_convolve_HWC_q7_fast(const q7_t * Im_in,
                         const uint16_t dim_im_in,
                         const uint16_t ch_im_in,
                         const q7_t * wt,
                         const uint16_t ch_im_out,
                         const uint16_t dim_kernel,
                         const uint16_t padding,
                         const uint16_t stride,
                         const q7_t * bias,
                         const uint16_t bias_shift,
                         const uint16_t out_shift,
                         q7_t * Im_out,
                         const uint16_t dim_im_out,
                         q15_t * bufferA,
                         q7_t * bufferB)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    int16_t   i_out_y, i_out_x, i_ker_y, i_ker_x;

    /*
     *  Here we use bufferA as q15_t internally, as computations are done at the q15_t level;
     *  im2col writes its output in q15_t format from the q7_t input
     */

    q15_t    *pBuffer = bufferA;
    q7_t     *pOut = Im_out;

    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

    /*
     *  Here we split the entire matrix into three regions depending on the padding situation
     *    Top:    i_out_y from 0 to padding - 1
     *    Middle: i_out_y from padding to dim_im_out-padding-1
     *    Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
     */
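    /*
     *  Example with illustrative values: for dim_im_out = 16 and padding = 1,
     *  the top region is row 0, the middle region covers rows 1..14, and the
     *  bottom region is row 15; only the top and bottom regions perform the
     *  y-boundary check in the im2col copy below.
     */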

    /* top part */
    for (i_out_y = 0; i_out_y < padding; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
                                                         pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

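            /* bufferA holds two im2col columns (2 * ch_im_in * dim_kernel * dim_kernel
               entries); once both are filled, the 2x2 GEMM kernel consumes them and
               writes two output pixels */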
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out,
                                                               ch_im_in * dim_kernel * dim_kernel,
                                                               bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

    /* middle part, here we also divide the x into left, mid and right */
    for (; i_out_y < dim_im_out - padding; i_out_y++)
    {

        /* left part */
        for (i_out_x = 0; i_out_x < padding; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
                                                         pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out,
                                                               ch_im_in * dim_kernel * dim_kernel,
                                                               bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }

        /* mid part */
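        /* no x-boundary check is needed in this region, so a whole kernel row
           (ch_im_in * dim_kernel entries) is copied per i_ker_y iteration */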
        for (; i_out_x < dim_im_out - padding; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in +
                                                 (i_ker_y * dim_im_in + i_out_x * stride - padding) * ch_im_in,
                                                 pBuffer, ch_im_in * dim_kernel);
                pBuffer += ch_im_in * dim_kernel;
            }

            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out,
                                                               ch_im_in * dim_kernel * dim_kernel,
                                                               bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }

        /* right part */
        for (; i_out_x < dim_im_out; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
                                                         pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out,
                                                               ch_im_in * dim_kernel * dim_kernel,
                                                               bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

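    /* bottom part */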
    for (; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            /* This part implements the im2col function */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
                    } else
                    {
                        arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
                                                         pBuffer, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out,
                                                               ch_im_in * dim_kernel * dim_kernel,
                                                               bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

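    /* If the total number of output pixels is odd, one im2col column is still
       pending in bufferA; it is multiplied below with a plain MAC loop instead
       of the 2x2 GEMM kernel. */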
    /* check if there is left-over for compute */
    if (pBuffer != bufferA)
    {
        const q7_t *pA = wt;
        int       i;

        for (i = 0; i < ch_im_out; i++)
        {
            q31_t     sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
            q15_t    *pB = bufferA;
            /* four entries are processed per iteration */
            uint16_t  colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;

            while (colCnt)
            {
                q31_t     inA1, inA2;
                q31_t     inB1, inB2;

                /* expand four q7_t weights into two packed q15x2 words,
                   matching the reordered layout of bufferA */
                pA = (q7_t *) read_and_pad_reordered((void *)pA, &inA1, &inA2);

                inB1 = *__SIMD32(pB)++;
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = *__SIMD32(pB)++;
                sum = __SMLAD(inA2, inB2, sum);

                colCnt--;
            }
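            /* left-over entries (none in practice here, since ch_im_in is a
               multiple of 4 as enforced above) */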
            colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
            while (colCnt)
            {
                q7_t      inA1 = *pA++;
                q15_t     inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }
            *pOut = (q7_t) __SSAT((sum >> out_shift), 8);
            pOut++;
        }
    }
#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    uint16_t  i, j, k, l, m, n;
    int       conv_out;
    /* use 16-bit indices so larger image dimensions do not overflow */
    int16_t   in_row, in_col;

    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out; j++)
        {
            for (k = 0; k < dim_im_out; k++)
            {
                conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel; m++)
                {
                    for (n = 0; n < dim_kernel; n++)
                    {
                        // if-for implementation
                        in_row = stride * j + m - padding;
                        in_col = stride * k + n - padding;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out +=
                                    Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
            }
        }
    }

#endif /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}

/**
 * @} end of NNConv group
 */