// xref: /btstack/port/stm32-f4discovery-usb/Drivers/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp (revision a8f7f3fcbcd51f8d2e92aca076b6a9f812db358c)
/* ----------------------------------------------------------------------
* Copyright (C) 2010-2018 Arm Limited. All rights reserved.
*
*
* Project:       CMSIS NN Library
* Title:         arm_nnexamples_nn_test.cpp
*
* Description:   Example code for NN kernel testing.
*
* Target Processor: Cortex-M cores
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*   - Redistributions of source code must retain the above copyright
*     notice, this list of conditions and the following disclaimer.
*   - Redistributions in binary form must reproduce the above copyright
*     notice, this list of conditions and the following disclaimer in
*     the documentation and/or other materials provided with the
*     distribution.
*   - Neither the name of ARM LIMITED nor the names of its contributors
*     may be used to endorse or promote products derived from this
*     software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* -------------------------------------------------------------------- */

#include "arm_nnexamples_nn_test.h"

#include <stdio.h>
#include <stdlib.h>

/*
 * Compile-time switches selecting which kernel groups main() exercises.
 * Each group is wrapped in a matching #ifdef inside main(); comment a line
 * out to skip that group.
 *
 * TEST_SIGMOID and TEST_TANH are disabled here. Those two groups only print
 * input/output pairs and do not compare against a reference.
 */
//#define TEST_SIGMOID
//#define TEST_TANH
#define TEST_POOL
#define TEST_RELU
#define TEST_IP
#define TEST_CONV
#define TEST_NONSQUARE
#define TEST_NNMULT

50 int test_index = 0;
51 q7_t test_flags[50];
52 bool test_pass;
53 
main()54 int main()
55 {
56     printf("start tests\n");
57 
58     srand(1);
59 
60     // common pointers for testing data
61     q7_t     *test1;
62     q15_t    *test2;
63     q7_t     *test3;
64     q15_t    *test4;
65 
66     for (test_index = 0; test_index<50; test_index++) {
67         test_flags[test_index] = -1;
68     }
69     test_index = 0;
70 
71 #ifdef TEST_NNMULT
72 #define NNMULT_DIM 128
73     test1 = new q7_t[NNMULT_DIM*2];
74     test2 = new q15_t[NNMULT_DIM*2];
75     test3 = new q7_t[NNMULT_DIM*2];
76     test4 = new q15_t[NNMULT_DIM*2];
77 
78     q7_t * mult_out_q7 = test3;
79     q7_t * mult_ref_q7 = test3 + NNMULT_DIM;
80     q15_t * mult_out_q15 = test4;
81     q15_t * mult_ref_q15 = test4 + NNMULT_DIM;
82 
83     for (int i=0;i<NNMULT_DIM*2;i++) {
84         test1[i] = (rand() % 256 - 128);
85         test2[i] = (rand() % 65536 - 32768);
86     }
87 
88     // Test q7
89     arm_nn_mult_q7(test1, test1+NNMULT_DIM, mult_out_q7, 5, NNMULT_DIM);
90 
91     arm_nn_mult_q7_ref(test1, test1+NNMULT_DIM, mult_ref_q7, 5, NNMULT_DIM);
92 
93     verify_results_q7(mult_out_q7, mult_ref_q7, NNMULT_DIM);
94 
95     arm_nn_mult_q7(test1, test1+NNMULT_DIM, mult_out_q7, 9, NNMULT_DIM);
96 
97     arm_nn_mult_q7_ref(test1, test1+NNMULT_DIM, mult_ref_q7, 9, NNMULT_DIM);
98 
99     verify_results_q7(mult_out_q7, mult_ref_q7, NNMULT_DIM);
100 
101     // Test q15
102     arm_nn_mult_q15(test2, test2+NNMULT_DIM, mult_out_q15, 13, NNMULT_DIM);
103 
104     arm_nn_mult_q15_ref(test2, test2+NNMULT_DIM, mult_ref_q15, 13, NNMULT_DIM);
105 
106     verify_results_q15(mult_out_q15, mult_ref_q15, NNMULT_DIM);
107 
108     arm_nn_mult_q15(test2, test2+NNMULT_DIM, mult_out_q15, 18, NNMULT_DIM);
109 
110     arm_nn_mult_q15_ref(test2, test2+NNMULT_DIM, mult_ref_q15, 18, NNMULT_DIM);
111 
112     verify_results_q15(mult_out_q15, mult_ref_q15, NNMULT_DIM);
113 
114 #endif
115 
116 #ifdef TEST_SIGMOID
117 
118 #define SIGMOID_DIM 128
119 
120     /* This part tests the running of sigmoid functions */
121 
122     test1 = new q7_t[SIGMOID_DIM];
123     test2 = new q15_t[SIGMOID_DIM];
124     test3 = new q7_t[SIGMOID_DIM];
125     test4 = new q15_t[SIGMOID_DIM];
126 
127     srand(1);
128 
129     for (int i = 0; i < SIGMOID_DIM; i++)
130     {
131         test1[i] = (rand() % 256 - 128);
132         test2[i] = (rand() % 65536 - 32768);
133         test3[i] = test1[i];
134         test4[i] = test2[i];
135     }
136 
137     arm_nn_activations_direct_q7(test3, SIGMOID_DIM, 3, ARM_SIGMOID);
138 
139     for (int i = 0; i < SIGMOID_DIM; i++)
140     {
141         printf("in: %d  out: %d\n", test1[i], test3[i]);
142     }
143 
144     printf("start testing q15_t sigmoid\n\n");
145 
146     arm_nn_activations_direct_q15(test4, SIGMOID_DIM, 3, ARM_SIGMOID);
147 
148     for (int i = 0; i < SIGMOID_DIM; i++)
149     {
150         printf("in: %d  out: %d\n", test2[i], test4[i]);
151     }
152 
153     delete[]test1;
154     delete[]test2;
155     delete[]test3;
156     delete[]test4;
157 
158 #endif
159 
160 #ifdef TEST_TANH
161 
162 #define TANH_DIM 128
163 
164     /* This part tests the running of sigmoid functions */
165 
166     test1 = new q7_t[TANH_DIM];
167     test2 = new q15_t[TANH_DIM];
168     test3 = new q7_t[TANH_DIM];
169     test4 = new q15_t[TANH_DIM];
170 
171     srand(1);
172 
173     for (int i = 0; i < TANH_DIM; i++)
174     {
175         test1[i] = (rand() % 256 - 128);
176         test2[i] = (rand() % 65536 - 32768);
177         test3[i] = test1[i];
178         test4[i] = test2[i];
179     }
180 
181     arm_nn_activations_direct_q7(test3, TANH_DIM, 3, ARM_TANH);
182 
183     printf("start testing q7_t tanh\n\n");
184 
185     for (int i = 0; i < TANH_DIM; i++)
186     {
187         printf("in: %d  out: %d\n", test1[i], test3[i]);
188     }
189 
190     printf("start testing q15_t tanh\n\n");
191 
192     arm_nn_activations_direct_q15(test4, TANH_DIM, 3, ARM_TANH);
193 
194     for (int i = 0; i < TANH_DIM; i++)
195     {
196         printf("in: %d  out: %d\n", test2[i], test4[i]);
197     }
198 
199     delete[]test1;
200     delete[]test2;
201     delete[]test3;
202     delete[]test4;
203 
204 #endif
205 
206 #ifdef TEST_POOL
207 
208 #define POOL_IM_DIM 32
209 #define POOL_IM_CH 8
210 
211     test1 = new q7_t[POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH * 2];
212     test2 = new q15_t[POOL_IM_DIM * POOL_IM_CH];
213     test3 = new q7_t[POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH];
214 
215     for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
216     {
217         test1[i] = (rand() % 256 - 128);
218     }
219 
220     q7_t     *img_in = test1 + POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH;
221     q7_t     *pool_out_ref = test3;
222     q7_t     *pool_out_opt = test3 + POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH / 2;
223 
224     for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
225     {
226         test3[i] = 0;
227     }
228 
229     // copy over the img input
230     for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
231     {
232         img_in[i] = test1[i];
233     }
234 
235     initialize_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH);
236 
237     printf("Start maxpool reference implementation\n");
238 
239     arm_maxpool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref);
240 
241     // copy over the img input
242     for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
243     {
244         img_in[i] = test1[i];
245     }
246 
247     printf("Start maxpool opt implementation\n");
248 
249     arm_maxpool_q7_HWC(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_opt);
250 
251     verify_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH);
252 
253     // copy over the img input
254     for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
255     {
256         img_in[i] = test1[i];
257     }
258 
259     // copy over the img input
260     for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
261     {
262         img_in[i] = test1[i];
263     }
264 
265     printf("Start avepool ref implementation\n");
266 
267     arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref);
268 
269     // copy over the img input
270     for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
271     {
272         img_in[i] = test1[i];
273     }
274 
275     printf("Start avepool opt implementation\n");
276 
277     arm_avepool_q7_HWC(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_opt);
278 
279     // special check here
280     bool      if_ave_pool_match = true;
281     for (int i = 0; i < POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH; i++)
282     {
283         // we tolerate at most difference of 1 here because of rounding errors
284         if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
285         {
286             printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]);
287             if_ave_pool_match = false;
288         }
289     }
290     if (if_ave_pool_match == true)
291     {
292         printf("Outputs match.\n");
293     }
294 
295     delete[]test1;
296     delete[]test2;
297     delete[]test3;
298 
299 #endif
300 
301 #ifdef TEST_RELU
302 
303 #define RELU_DIM 127
304 
305     test1 = new q7_t[RELU_DIM];
306     test2 = new q15_t[RELU_DIM];
307     test3 = new q7_t[RELU_DIM];
308     test4 = new q15_t[RELU_DIM];
309 
310     for (int i = 0; i < RELU_DIM; i++)
311     {
312         test1[i] = (rand() % 256 - 128);
313         test2[i] = (rand() % 65536 - 32768);
314         test3[i] = test1[i];
315         test4[i] = test2[i];
316     }
317 
318     q7_t     *relu_ref_data_q7 = test1;
319     q7_t     *relu_opt_data_q7 = test3;
320     q15_t    *relu_ref_data_q15 = test2;
321     q15_t    *relu_opt_data_q15 = test4;
322 
323     printf("Start ref relu q7 implementation\n");
324 
325     arm_relu_q7_ref(relu_ref_data_q7, RELU_DIM);
326 
327     printf("Start opt relu q7 implementation\n");
328 
329     arm_relu_q7(relu_opt_data_q7, RELU_DIM);
330 
331     verify_results_q7(relu_ref_data_q7, relu_opt_data_q7, RELU_DIM);
332 
333     printf("Start ref relu q15 implementation\n");
334 
335     arm_relu_q15_ref(relu_ref_data_q15, RELU_DIM);
336 
337     printf("Start opt relu q15 implementation\n");
338 
339     arm_relu_q15(relu_opt_data_q15, RELU_DIM);
340 
341     verify_results_q15(relu_ref_data_q15, relu_opt_data_q15, RELU_DIM);
342 
343     delete[]test1;
344     delete[]test2;
345     delete[]test3;
346     delete[]test4;
347 
348 #endif
349 
350 #ifdef TEST_IP
351 
352 #define IP_ROW_DIM 127
353 #define IP_COL_DIM 127
354 
355     q7_t      ip_weights[IP_ROW_DIM * IP_COL_DIM] = IP2_WEIGHT;
356     q7_t      ip_q7_opt_weights[IP_ROW_DIM * IP_COL_DIM] = IP4_WEIGHT;
357     q7_t      ip_q7_q15_opt_weights[IP_ROW_DIM * IP_COL_DIM] = IP4_q7_q15_WEIGHT;
358     q15_t     ip_q15_weights[IP_ROW_DIM * IP_COL_DIM] = IP2_WEIGHT;
359     q15_t     ip_q15_opt_weights[IP_ROW_DIM * IP_COL_DIM] = IP4_WEIGHT_Q15;
360 
361     test1 = new q7_t[IP_COL_DIM + IP_ROW_DIM];
362     test2 = new q15_t[IP_COL_DIM];
363     test3 = new q7_t[IP_ROW_DIM * 3];
364     test4 = new q15_t[IP_COL_DIM + IP_ROW_DIM * 2];
365 
366     for (int i = 0; i < IP_ROW_DIM + IP_COL_DIM; i++)
367     {
368         test1[i] = rand() % 256 - 100;
369     }
370     for (int i = 0; i < IP_ROW_DIM * 3; i++)
371     {
372         test3[i] = 0;
373     }
374 
375     q7_t     *ip_bias_q7 = test1 + IP_COL_DIM;
376 
377     q7_t     *ip_out_q7_ref = test3;
378     q7_t     *ip_out_q7_opt = test3 + IP_ROW_DIM;
379     q7_t     *ip_out_q7_opt_fast = test3 + 2 * IP_ROW_DIM;
380     q15_t    *ip_out_q15_ref = test4 + IP_COL_DIM;
381     q15_t    *ip_out_q15_opt = test4 + IP_COL_DIM + IP_ROW_DIM;
382 
383     initialize_results_q7(ip_out_q7_ref, ip_out_q7_opt, IP_ROW_DIM);
384     initialize_results_q7(ip_out_q7_ref, ip_out_q7_opt_fast, IP_ROW_DIM);
385     initialize_results_q7(ip_out_q7_ref, ip_out_q7_opt_fast, IP_ROW_DIM);
386 
387     printf("Start ref q7 implementation\n");
388 
389     arm_fully_connected_q7_ref(test1, ip_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, ip_bias_q7, ip_out_q7_ref, test2);
390 
391     printf("Start q7 implementation\n");
392 
393     arm_fully_connected_q7(test1, ip_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, ip_bias_q7, ip_out_q7_opt, test2);
394 
395     verify_results_q7(ip_out_q7_ref, ip_out_q7_opt, IP_ROW_DIM);
396 
397     printf("Start q7 ref opt implementation\n");
398 
399     arm_fully_connected_q7_opt_ref(test1, ip_q7_opt_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, ip_bias_q7,
400                                    ip_out_q7_opt_fast, test2);
401 
402     verify_results_q7(ip_out_q7_ref, ip_out_q7_opt_fast, IP_ROW_DIM);
403 
404     printf("Start q7 opt implementation\n");
405 
406     arm_fully_connected_q7_opt(test1, ip_q7_opt_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, ip_bias_q7, ip_out_q7_opt_fast,
407                                test2);
408 
409     verify_results_q7(ip_out_q7_ref, ip_out_q7_opt_fast, IP_ROW_DIM);
410 
411     for (int i = 0; i < IP_ROW_DIM + IP_COL_DIM; i++)
412     {
413         test4[i] = (rand() % 65536 - 32768);
414     }
415 
416     initialize_results_q15(ip_out_q15_ref, ip_out_q15_opt, IP_ROW_DIM);
417 
418     printf("Start ref q15 implementation\n");
419 
420     arm_fully_connected_q15_ref(test4, ip_q15_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, test2, ip_out_q15_ref, NULL);
421 
422     printf("Start q15 implementation\n");
423 
424     arm_fully_connected_q15(test4, ip_q15_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, test2, ip_out_q15_opt, NULL);
425 
426     verify_results_q15(ip_out_q15_ref, ip_out_q15_opt, IP_ROW_DIM);
427 
428     printf("Start ref opt q15 implementation\n");
429 
430     arm_fully_connected_q15_opt_ref(test4, ip_q15_opt_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, test2, ip_out_q15_opt,
431                                     NULL);
432 
433     verify_results_q15(ip_out_q15_ref, ip_out_q15_opt, IP_ROW_DIM);
434 
435     printf("Start opt q15 implementation\n");
436 
437     arm_fully_connected_q15_opt(test4, ip_q15_opt_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, test2, ip_out_q15_opt, NULL);
438 
439     verify_results_q15(ip_out_q15_ref, ip_out_q15_opt, IP_ROW_DIM);
440 
441     initialize_results_q15(ip_out_q15_ref, ip_out_q15_opt, IP_ROW_DIM);
442 
443     printf("Start ref q7_q15 implementation\n");
444 
445     arm_fully_connected_mat_q7_vec_q15_ref(test4, ip_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, ip_bias_q7, ip_out_q15_ref,
446                                            test2);
447 
448     printf("Start q7_q15 implementation\n");
449 
450     arm_fully_connected_mat_q7_vec_q15(test4, ip_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, ip_bias_q7, ip_out_q15_opt,
451                                        test2);
452 
453     verify_results_q15(ip_out_q15_ref, ip_out_q15_opt, IP_ROW_DIM);
454 
455     printf("Start ref opt q7_q15 implementation\n");
456 
457     arm_fully_connected_mat_q7_vec_q15_opt_ref(test4, ip_q7_q15_opt_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, ip_bias_q7,
458                                                ip_out_q15_opt, test2);
459 
460     verify_results_q15(ip_out_q15_ref, ip_out_q15_opt, IP_ROW_DIM);
461 
462     printf("Start opt q7_q15 implementation\n");
463 
464     arm_fully_connected_mat_q7_vec_q15_opt(test4, ip_q7_q15_opt_weights, IP_COL_DIM, IP_ROW_DIM, 1, 7, ip_bias_q7,
465                                            ip_out_q15_opt, test2);
466 
467     verify_results_q15(ip_out_q15_ref, ip_out_q15_opt, IP_ROW_DIM);
468 
469     delete[]test1;
470     delete[]test2;
471     delete[]test3;
472     delete[]test4;
473 
474 #endif
475 
476 #ifdef TEST_NONSQUARE
477 
478 /* Use RCONV to differential with square CONV */
479 
480 #define RCONV_IM_DIM_X 10
481 #define RCONV_IM_DIM_Y 8
482 #define RCONV_IM_CH 4
483 #define RCONV_KER_DIM_X 5
484 #define RCONV_KER_DIM_Y 3
485 #define RCONV_STRIDE_X 1
486 #define RCONV_STRIDE_Y 1
487 #define RCONV_PADDING_X 2
488 #define RCONV_PADDING_Y 1
489 #define RCONV_OUT_CH 4
490 #define RCONV_OUT_DIM_X 10
491 #define RCONV_OUT_DIM_Y 8
492 
493     test1 = new q7_t[RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH * RCONV_OUT_CH + RCONV_OUT_CH];
494     test2 = new q15_t[2 * RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH];
495     test3 =
496         new q7_t[RCONV_IM_DIM_Y * RCONV_IM_DIM_X * RCONV_IM_CH + 2 * RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH];
497 
498     for (int i = 0; i < RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH * RCONV_OUT_CH + RCONV_OUT_CH; i++)
499     {
500         test1[i] = rand() % 256 - 100;
501     }
502 
503     for (int i = 0;
504          i < RCONV_IM_DIM_Y * RCONV_IM_DIM_X * RCONV_IM_CH + 2 * RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH; i++)
505     {
506         test3[i] = rand() % 256 - 100;
507     }
508 
509     q7_t     *rconv_weight_q7 = test1;
510     q7_t     *rconv_bias_q7 = test1 + RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH * RCONV_OUT_CH;
511 
512     q15_t    *rconv_buf = test2;
513 
514     q7_t     *rconv_im_in_q7 = test3;
515     q7_t     *rconv_im_out_ref_q7 = test3 + RCONV_IM_DIM_Y * RCONV_IM_DIM_X * RCONV_IM_CH;
516     q7_t     *rconv_im_out_opt_q7 =
517         test3 + RCONV_IM_DIM_Y * RCONV_IM_DIM_X * RCONV_IM_CH + RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH;
518 
519     initialize_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
520 
521     printf("start conv q7 nonsquare ref implementation\n");
522     arm_convolve_HWC_q7_ref_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
523                                       RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
524                                       RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_ref_q7,
525                                       RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
526 
527     printf("start conv q7 nonsquare opt implementation\n");
528     arm_convolve_HWC_q7_fast_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
529                                        RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
530                                        RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_opt_q7,
531                                        RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
532 
533     verify_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
534 
535     initialize_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
536 
537     printf("start conv q7 nonsquare ref implementation\n");
538     arm_convolve_HWC_q7_ref_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
539                                       RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
540                                       RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_ref_q7,
541                                       RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
542 
543     printf("start conv q7 nonsquare basic implementation\n");
544     arm_convolve_HWC_q7_basic_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
545                                        RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
546                                        RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_opt_q7,
547                                        RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
548 
549     verify_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
550 
551     initialize_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
552 
553     printf("start 1x1 conv q7 nonsquare fast implementation\n");
554     arm_convolve_HWC_q7_fast_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
555                                        RCONV_OUT_CH, 1, 1, 0, 0, RCONV_STRIDE_X,
556                                        RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_ref_q7, RCONV_OUT_DIM_X,
557                                        RCONV_OUT_DIM_Y, rconv_buf, NULL);
558 
559     printf("start 1x1 conv q7 nonsquare dedicated function implementation\n");
560     arm_convolve_1x1_HWC_q7_fast_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
561                                            RCONV_OUT_CH, 1, 1, 0, 0, RCONV_STRIDE_X,
562                                            RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_opt_q7, RCONV_OUT_DIM_X,
563                                            RCONV_OUT_DIM_Y, rconv_buf, NULL);
564 
565     verify_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
566 
567     printf("start depthwise separable conv q7 nonsquare ref implementation\n");
568     arm_depthwise_separable_conv_HWC_q7_ref_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH,
569                                                       rconv_weight_q7, RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y,
570                                                       RCONV_PADDING_X, RCONV_PADDING_Y, RCONV_STRIDE_X, RCONV_STRIDE_Y,
571                                                       rconv_bias_q7, 1, 7, rconv_im_out_ref_q7, RCONV_OUT_DIM_X,
572                                                       RCONV_OUT_DIM_Y, rconv_buf, NULL);
573 
574     printf("start depthwise separable conv q7 nonsquare opt implementation\n");
575     arm_depthwise_separable_conv_HWC_q7_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH,
576                                                   rconv_weight_q7, RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y,
577                                                   RCONV_PADDING_X, RCONV_PADDING_Y, RCONV_STRIDE_X, RCONV_STRIDE_Y,
578                                                   rconv_bias_q7, 1, 7, rconv_im_out_opt_q7, RCONV_OUT_DIM_X,
579                                                   RCONV_OUT_DIM_Y, rconv_buf, NULL);
580 
581     verify_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
582 
583     delete[]test1;
584     delete[]test2;
585     delete[]test3;
586 
587 	test2 = new q15_t[RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH * RCONV_OUT_CH + RCONV_OUT_CH]; // weights + bias
588 	test4 = new q15_t[2 * RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH   //buffer
589 	         + RCONV_IM_DIM_Y * RCONV_IM_DIM_X * RCONV_IM_CH + 2 * RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH]; // i/o
590 
591     for (int i = 0; i < RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH * RCONV_OUT_CH + RCONV_OUT_CH; i++)
592     {
593         test2[i] = rand() % 256 - 100;
594     }
595 
596     for (int i = 0;
597          i < 2 * RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH
598          + RCONV_IM_DIM_Y * RCONV_IM_DIM_X * RCONV_IM_CH + 2 * RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH;
599         i++)
600     {
601         test4[i] = rand() % 256 - 100;
602     }
603 
604     q15_t     *rconv_weight_q15 = test2;
605     q15_t     *rconv_bias_q15 = test2 + RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH * RCONV_OUT_CH;
606 
607     rconv_buf = test4;
608 
609     q15_t     *rconv_im_in_q15 = test4 + 2 * RCONV_KER_DIM_Y * RCONV_KER_DIM_X * RCONV_IM_CH;
610     q15_t     *rconv_im_out_ref_q15 = rconv_im_in_q15 + RCONV_IM_DIM_Y * RCONV_IM_DIM_X * RCONV_IM_CH;
611     q15_t     *rconv_im_out_opt_q15 = rconv_im_out_ref_q15 + RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH;
612 
613     initialize_results_q15(rconv_im_out_ref_q15, rconv_im_out_opt_q15, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
614 
615     printf("start conv q15 nonsquare ref implementation\n");
616     arm_convolve_HWC_q15_nonsquare_ref(rconv_im_in_q15, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q15,
617                                       RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
618                                       RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q15, 1, 7, rconv_im_out_ref_q15,
619                                       RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
620 
621     printf("start conv q5 nonsquare opt implementation\n");
622     arm_convolve_HWC_q15_fast_nonsquare(rconv_im_in_q15, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q15,
623                                        RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
624                                        RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q15, 1, 7, rconv_im_out_opt_q15,
625                                        RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
626 
627     verify_results_q15(rconv_im_out_ref_q15, rconv_im_out_opt_q15, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
628 
629     delete [] test2;
630     delete [] test4;
631 #endif
632 
633 #ifdef TEST_CONV
634 
635 #define CONV_IM_DIM 16
636 #define CONV_IM_CH 16
637 #define CONV_KER_DIM 5
638 #define CONV_OUT_CH 16
639 #define CONV_OUT_DIM 16
640 
641     test1 = new q7_t[CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH + CONV_OUT_CH];
642     test2 =
643         new q15_t[CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH +
644                   2 * CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH + CONV_OUT_CH];
645     test3 = new q7_t[CONV_IM_DIM * CONV_IM_DIM * CONV_IM_CH + 2 * CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH];
646     test4 = new q15_t[CONV_IM_DIM * CONV_IM_DIM * CONV_IM_CH + 2 * CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH];
647 
648     for (int i = 0; i < CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH + CONV_OUT_CH; i++)
649     {
650         test1[i] = rand() % 256 - 100;
651     }
652 
653     for (int i = 0;
654          i <
655          CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH +
656          2 * CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH + CONV_OUT_CH; i++)
657     {
658         test2[i] = (rand() % 65536 - 32768);
659     }
660 
661     for (int i = 0; i < CONV_IM_DIM * CONV_IM_DIM * CONV_IM_CH + 2 * CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH; i++)
662     {
663         test3[i] = rand() % 256 - 100;
664     }
665 
666     for (int i = 0; i < CONV_IM_DIM * CONV_IM_DIM * CONV_IM_CH + 2 * CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH; i++)
667     {
668         test4[i] = (rand() % 65536 - 32768);
669     }
670 
671     q7_t     *conv_weight_q7 = test1;
672     q7_t     *conv_bias_q7 = test1 + CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH;
673 
674     q15_t    *conv_weight_q15 = test2;
675     q15_t    *conv_buf = test2 + CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH;
676     q15_t    *conv_bias_q15 =
677         test2 + CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH +
678         2 * CONV_KER_DIM * CONV_KER_DIM * CONV_IM_CH * CONV_OUT_CH;
679 
680     q7_t     *conv_im_in_q7 = test3;
681     q7_t     *conv_im_out_ref_q7 = test3 + CONV_IM_DIM * CONV_IM_DIM * CONV_IM_CH;
682     q7_t     *conv_im_out_opt_q7 =
683         test3 + CONV_IM_DIM * CONV_IM_DIM * CONV_IM_CH + CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH;
684 
685     q15_t    *conv_im_in_q15 = test4;
686     q15_t    *conv_im_out_ref_q15 = test4 + CONV_IM_DIM * CONV_IM_DIM * CONV_IM_CH;
687     q15_t    *conv_im_out_opt_q15 =
688         test4 + CONV_IM_DIM * CONV_IM_DIM * CONV_IM_CH + CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH;
689 
690     initialize_results_q7(conv_im_out_ref_q7, conv_im_out_opt_q7, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
691 
692     printf("start q7 ref implementation\n");
693 
694     arm_convolve_HWC_q7_ref(conv_im_in_q7, CONV_IM_DIM, CONV_IM_CH, conv_weight_q7,
695                             CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q7, 1, 7, conv_im_out_ref_q7,
696                             CONV_OUT_DIM, conv_buf, NULL);
697 
698     printf("start q7 basic implementation\n");
699 
700     arm_convolve_HWC_q7_basic(conv_im_in_q7, CONV_IM_DIM, CONV_IM_CH, conv_weight_q7,
701                               CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q7, 1, 7, conv_im_out_opt_q7,
702                               CONV_OUT_DIM, conv_buf, NULL);
703 
704     verify_results_q7(conv_im_out_ref_q7, conv_im_out_opt_q7, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
705 
706     printf("start q7 fast implementation\n");
707 
708     arm_convolve_HWC_q7_fast(conv_im_in_q7, CONV_IM_DIM, CONV_IM_CH, conv_weight_q7,
709                              CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q7, 1, 7, conv_im_out_opt_q7,
710                              CONV_OUT_DIM, conv_buf, NULL);
711 
712     verify_results_q7(conv_im_out_ref_q7, conv_im_out_opt_q7, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
713 
714     // testing with RGB
715     printf("start q7 ref implementation for RGB\n");
716 
717     arm_convolve_HWC_q7_ref(conv_im_in_q7, CONV_IM_DIM, 3, conv_weight_q7,
718                             CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q7, 1, 7, conv_im_out_ref_q7,
719                             CONV_OUT_DIM, conv_buf, NULL);
720 
721     printf("start q7 basic implementation for RGB\n");
722 
723     arm_convolve_HWC_q7_basic(conv_im_in_q7, CONV_IM_DIM, 3, conv_weight_q7,
724                               CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q7, 1, 7, conv_im_out_opt_q7,
725                               CONV_OUT_DIM, conv_buf, NULL);
726 
727     verify_results_q7(conv_im_out_ref_q7, conv_im_out_opt_q7, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
728 
729     printf("start q7 RGB implementation for RGB\n");
730 
731     arm_convolve_HWC_q7_RGB(conv_im_in_q7, CONV_IM_DIM, 3, conv_weight_q7,
732                             CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q7, 1, 7, conv_im_out_opt_q7,
733                             CONV_OUT_DIM, conv_buf, NULL);
734 
735     verify_results_q7(conv_im_out_ref_q7, conv_im_out_opt_q7, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
736 
737     // testing q15
738     initialize_results_q15(conv_im_out_ref_q15, conv_im_out_opt_q15, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
739 
740     printf("start q15 ref implementation\n");
741 
742     arm_convolve_HWC_q15_ref(conv_im_in_q15, CONV_IM_DIM, CONV_IM_CH, conv_weight_q15,
743                              CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q15, 0, 15, conv_im_out_ref_q15,
744                              CONV_OUT_DIM, conv_buf, NULL);
745 
746     printf("start q15 basic implementation\n");
747 
748     arm_convolve_HWC_q15_basic(conv_im_in_q15, CONV_IM_DIM, CONV_IM_CH, conv_weight_q15,
749                                CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q15, 0, 15, conv_im_out_opt_q15,
750                                CONV_OUT_DIM, conv_buf, NULL);
751 
752     verify_results_q15(conv_im_out_ref_q15, conv_im_out_opt_q15, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
753 
754     printf("start q15 fast implementation\n");
755 
756     arm_convolve_HWC_q15_fast(conv_im_in_q15, CONV_IM_DIM, CONV_IM_CH, conv_weight_q15,
757                               CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q15, 0, 15, conv_im_out_opt_q15,
758                               CONV_OUT_DIM, conv_buf, NULL);
759 
760     verify_results_q15(conv_im_out_ref_q15, conv_im_out_opt_q15, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
761 
762     // depthwise separable conv
763     initialize_results_q7(conv_im_out_ref_q7, conv_im_out_opt_q7, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
764 
765     printf("start q7 depthwise_separable_conv ref implementation\n");
766 
767     arm_depthwise_separable_conv_HWC_q7_ref(conv_im_in_q7, CONV_IM_DIM, CONV_IM_CH, conv_weight_q7,
768                                             CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q7, 1, 7, conv_im_out_ref_q7,
769                                             CONV_OUT_DIM, conv_buf, NULL);
770 
771     printf("start q7 depthwise_separable_conv implementation\n");
772 
773     arm_depthwise_separable_conv_HWC_q7(conv_im_in_q7, CONV_IM_DIM, CONV_IM_CH, conv_weight_q7,
774                                         CONV_OUT_CH, CONV_KER_DIM, 2, 1, conv_bias_q7, 1, 7, conv_im_out_opt_q7,
775                                         CONV_OUT_DIM, conv_buf, NULL);
776 
777     verify_results_q7(conv_im_out_ref_q7, conv_im_out_opt_q7, CONV_OUT_DIM * CONV_OUT_DIM * CONV_OUT_CH);
778 
779     delete[]test1;
780     delete[]test2;
781     delete[]test3;
782     delete[]test4;
783 
784 #endif
785 
786     test_pass = true;
787     test_index = 0;
788     while (test_flags[test_index] != -1) {
789         if (test_flags[test_index]) {
790              test_pass = false;
791         }
792         test_index ++;
793     }
794     if (test_pass) {
795         printf("All tests passed\n");
796     } else {
797         printf("Test failed passed\n");
798     }
799 
800     return 0;
801 }
802