1 /*
2  * Copyright (c) 2008-2012 Travis Geiselbrecht
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files
6  * (the "Software"), to deal in the Software without restriction,
7  * including without limitation the rights to use, copy, modify, merge,
8  * publish, distribute, sublicense, and/or sell copies of the Software,
9  * and to permit persons to whom the Software is furnished to do so,
10  * subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22  */
23 #include <sys/types.h>
24 #include <stdio.h>
25 #include <rand.h>
26 #include <err.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <app/tests.h>
30 #include <kernel/thread.h>
31 #include <kernel/mutex.h>
32 #include <kernel/semaphore.h>
33 #include <kernel/event.h>
34 #include <platform.h>
35 
36 const size_t BUFSIZE = (1024*1024);
37 const uint ITER = 1024;
38 
bench_set_overhead(void)39 __NO_INLINE static void bench_set_overhead(void)
40 {
41     uint32_t *buf = malloc(BUFSIZE);
42     if (!buf) {
43         printf("failed to allocate buffer\n");
44         return;
45     }
46 
47     uint count = arch_cycle_count();
48     for (uint i = 0; i < ITER; i++) {
49         __asm__ volatile("");
50     }
51     count = arch_cycle_count() - count;
52 
53     printf("took %u cycles overhead to loop %u times\n",
54            count, ITER);
55 
56     free(buf);
57 }
58 
bench_memset(void)59 __NO_INLINE static void bench_memset(void)
60 {
61     void *buf = malloc(BUFSIZE);
62     if (!buf) {
63         printf("failed to allocate buffer\n");
64         return;
65     }
66 
67     uint count = arch_cycle_count();
68     for (uint i = 0; i < ITER; i++) {
69         memset(buf, 0, BUFSIZE);
70     }
71     count = arch_cycle_count() - count;
72 
73     printf("took %u cycles to memset a buffer of size %u %d times (%u bytes), %f bytes/cycle\n",
74            count, BUFSIZE, ITER, BUFSIZE * ITER, (BUFSIZE * ITER) / (float)count);
75 
76     free(buf);
77 }
78 
79 #define bench_cset(type) \
80 __NO_INLINE static void bench_cset_##type(void) \
81 { \
82     type *buf = malloc(BUFSIZE); \
83     if (!buf) { \
84         printf("failed to allocate buffer\n"); \
85         return; \
86     } \
87  \
88     uint count = arch_cycle_count(); \
89     for (uint i = 0; i < ITER; i++) { \
90         for (uint j = 0; j < BUFSIZE / sizeof(*buf); j++) { \
91             buf[j] = 0; \
92         } \
93     } \
94     count = arch_cycle_count() - count; \
95  \
96     printf("took %u cycles to manually clear a buffer using wordsize %d of size %u %d times (%u bytes), %f bytes/cycle\n", \
97            count, sizeof(*buf), BUFSIZE, ITER, BUFSIZE * ITER, (BUFSIZE * ITER) / (float)count); \
98  \
99     free(buf); \
100 }
101 
102 bench_cset(uint8_t)
bench_cset(uint16_t)103 bench_cset(uint16_t)
104 bench_cset(uint32_t)
105 bench_cset(uint64_t)
106 
107 __NO_INLINE static void bench_cset_wide(void)
108 {
109     uint32_t *buf = malloc(BUFSIZE);
110     if (!buf) {
111         printf("failed to allocate buffer\n");
112         return;
113     }
114 
115     uint count = arch_cycle_count();
116     for (uint i = 0; i < ITER; i++) {
117         for (uint j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
118             buf[j*8] = 0;
119             buf[j*8+1] = 0;
120             buf[j*8+2] = 0;
121             buf[j*8+3] = 0;
122             buf[j*8+4] = 0;
123             buf[j*8+5] = 0;
124             buf[j*8+6] = 0;
125             buf[j*8+7] = 0;
126         }
127     }
128     count = arch_cycle_count() - count;
129 
130     printf("took %u cycles to manually clear a buffer of size %u %d times 8 words at a time (%u bytes), %f bytes/cycle\n",
131            count, BUFSIZE, ITER, BUFSIZE * ITER, (BUFSIZE * ITER) / (float)count);
132 
133     free(buf);
134 }
135 
bench_memcpy(void)136 __NO_INLINE static void bench_memcpy(void)
137 {
138     uint8_t *buf = malloc(BUFSIZE);
139     if (!buf) {
140         printf("failed to allocate buffer\n");
141         return;
142     }
143 
144     uint count = arch_cycle_count();
145     for (uint i = 0; i < ITER; i++) {
146         memcpy(buf, buf + BUFSIZE / 2, BUFSIZE / 2);
147     }
148     count = arch_cycle_count() - count;
149 
150     printf("took %u cycles to memcpy a buffer of size %u %d times (%u source bytes), %f source bytes/cycle\n",
151            count, BUFSIZE / 2, ITER, BUFSIZE / 2 * ITER, (BUFSIZE / 2 * ITER) / (float)count);
152 
153     free(buf);
154 }
155 
156 #if ARCH_ARM
arm_bench_cset_stm(void)157 __NO_INLINE static void arm_bench_cset_stm(void)
158 {
159     uint32_t *buf = malloc(BUFSIZE);
160     if (!buf) {
161         printf("failed to allocate buffer\n");
162         return;
163     }
164 
165     uint count = arch_cycle_count();
166     for (uint i = 0; i < ITER; i++) {
167         for (uint j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
168             __asm__ volatile(
169                 "stm    %0, {r0-r7};"
170                 :: "r" (&buf[j*8])
171             );
172         }
173     }
174     count = arch_cycle_count() - count;
175 
176     printf("took %u cycles to manually clear a buffer of size %u %d times 8 words at a time using stm (%u bytes), %f bytes/cycle\n",
177            count, BUFSIZE, ITER, BUFSIZE * ITER, (BUFSIZE * ITER) / (float)count);
178 
179     free(buf);
180 }
181 
182 #if       (__CORTEX_M >= 0x03)
arm_bench_multi_issue(void)183 __NO_INLINE static void arm_bench_multi_issue(void)
184 {
185     uint32_t cycles;
186     uint32_t a = 0, b = 0, c = 0, d = 0, e = 0, f = 0, g = 0, h = 0;
187 #define ITER 1000000
188     uint count = ITER;
189     cycles = arch_cycle_count();
190     while (count--) {
191         asm volatile ("");
192         asm volatile ("add %0, %0, %0" : "=r" (a) : "r" (a));
193         asm volatile ("add %0, %0, %0" : "=r" (b) : "r" (b));
194         asm volatile ("and %0, %0, %0" : "=r" (c) : "r" (c));
195         asm volatile ("mov %0, %0" : "=r" (d) : "r" (d));
196         asm volatile ("orr %0, %0, %0" : "=r" (e) : "r" (e));
197         asm volatile ("add %0, %0, %0" : "=r" (f) : "r" (f));
198         asm volatile ("and %0, %0, %0" : "=r" (g) : "r" (g));
199         asm volatile ("mov %0, %0" : "=r" (h) : "r" (h));
200     }
201     cycles = arch_cycle_count() - cycles;
202 
203     printf("took %u cycles to issue 8 integer ops (%f cycles/iteration)\n", cycles, (float)cycles / ITER);
204 #undef ITER
205 }
206 #endif // __CORTEX_M
207 #endif // ARCH_ARM
208 
209 #if WITH_LIB_LIBM
210 #include <math.h>
211 
bench_sincos(void)212 __NO_INLINE static void bench_sincos(void)
213 {
214     printf("touching the floating point unit\n");
215     __UNUSED volatile double _hole = sin(0);
216 
217     uint count = arch_cycle_count();
218     __UNUSED double a = sin(2.0);
219     count = arch_cycle_count() - count;
220     printf("took %u cycles for sin()\n", count);
221 
222     count = arch_cycle_count();
223     a = cos(2.0);
224     count = arch_cycle_count() - count;
225     printf("took %u cycles for cos()\n", count);
226 
227     count = arch_cycle_count();
228     a = sinf(2.0);
229     count = arch_cycle_count() - count;
230     printf("took %u cycles for sinf()\n", count);
231 
232     count = arch_cycle_count();
233     a = cosf(2.0);
234     count = arch_cycle_count() - count;
235     printf("took %u cycles for cosf()\n", count);
236 
237     count = arch_cycle_count();
238     a = sqrt(1234567.0);
239     count = arch_cycle_count() - count;
240     printf("took %u cycles for sqrt()\n", count);
241 
242     count = arch_cycle_count();
243     a = sqrtf(1234567.0f);
244     count = arch_cycle_count() - count;
245     printf("took %u cycles for sqrtf()\n", count);
246 }
247 
248 #endif // WITH_LIB_LIBM
249 
benchmarks(void)250 void benchmarks(void)
251 {
252     bench_set_overhead();
253     bench_memset();
254     bench_memcpy();
255 
256     bench_cset_uint8_t();
257     bench_cset_uint16_t();
258     bench_cset_uint32_t();
259     bench_cset_uint64_t();
260     bench_cset_wide();
261 
262 #if ARCH_ARM
263     arm_bench_cset_stm();
264 
265 #if       (__CORTEX_M >= 0x03)
266     arm_bench_multi_issue();
267 #endif
268 #endif
269 #if WITH_LIB_LIBM
270     bench_sincos();
271 #endif
272 }
273 
274