1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifdef __APPLE__
16 #include <sys/time.h>
17 #endif
18
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
25 #ifdef __APPLE__
26 #include <TargetConditionals.h>
27 #endif
28
29 #include "test.h"
30
31 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
32 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
33 #endif
34
35 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
36 #warning "Building without NEON support on ARM, check your compiler setup!"
37 #endif
38
39 #if defined(__mips) && !defined(GEMMLOWP_MSA)
40 #warning "Building without MSA support on MIPS, check your compiler setup!"
41 #endif
42
43 #if defined(__AVX2__) && !defined(GEMMLOWP_AVX2)
44 #warning \
45 "Building without AVX2 support on AVX2 enabled machine, check your compiler setup!"
46 #endif
47
48 #if defined(__SSE4_2__) && !defined(GEMMLOWP_AVX2) && !defined(GEMMLOWP_SSE4)
49 #warning \
50 "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
51 #endif
52
53 namespace gemmlowp {
54
55 const double min_accurate_duration = 1e-1;
56 const std::size_t min_working_set_size = 16 * 1024 * 1024;
57
// Shape of one GEMM workload: an (rows x depth) LHS times a (depth x cols)
// RHS producing an (rows x cols) result.
struct gemm_t {
  int rows = 0;
  int depth = 0;
  int cols = 0;
  gemm_t() {}
  gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
};
63
operator <(const gemm_t & a,const gemm_t & b)64 bool operator<(const gemm_t& a, const gemm_t& b) {
65 return a.rows < b.rows ||
66 (a.rows <= b.rows &&
67 (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
68 }
69
70 template <typename LhsType, typename RhsType, typename ResultType>
time_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)71 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
72 typedef std::uint8_t Scalar;
73
74 // set up the matrix pool
75
76 std::size_t combined_gemm_sizes = 0;
77 for (auto gemm : gemms) {
78 int rows = gemm.rows;
79 int depth = gemm.depth;
80 int cols = gemm.cols;
81 combined_gemm_sizes +=
82 sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
83 }
84
85 const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
86
87 std::vector<LhsType> lhs(pool_size * gemms.size());
88 std::vector<RhsType> rhs(pool_size * gemms.size());
89 std::vector<ResultType> result(pool_size * gemms.size());
90
91 for (std::size_t i = 0; i < pool_size; i++) {
92 for (std::size_t j = 0; j < gemms.size(); j++) {
93 int k = i * gemms.size() + j;
94 lhs[k].Resize(gemms[j].rows, gemms[j].depth);
95 MakeConstant(&lhs[k], 0);
96 rhs[k].Resize(gemms[j].depth, gemms[j].cols);
97 MakeConstant(&rhs[k], 0);
98 result[k].Resize(gemms[j].rows, gemms[j].cols);
99 MakeConstant(&result[k], 0);
100 }
101 }
102
103 // main benchmark loop
104
105 int iters_at_a_time = 1;
106 float time_per_iter = 0.0f;
107 std::size_t pool_index = 0;
108
109 while (true) {
110 double starttime = real_time_in_seconds();
111 for (int i = 0; i < iters_at_a_time; i++) {
112 for (size_t j = 0; j < gemms.size(); j++) {
113 size_t k = pool_index * gemms.size() + j;
114 Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
115 context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
116 -75, -91, 74980, 123, 20);
117 }
118 pool_index++;
119 if (pool_index == pool_size) {
120 pool_index = 0;
121 }
122 }
123 double endtime = real_time_in_seconds();
124
125 const float timing = static_cast<float>(endtime - starttime);
126
127 if (timing >= min_accurate_duration) {
128 time_per_iter = timing / iters_at_a_time;
129 break;
130 }
131
132 iters_at_a_time *= 2;
133 }
134
135 return time_per_iter;
136 }
137
138 template <typename LhsType, typename RhsType, typename ResultType>
gflops_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)139 double gflops_for_gemms(GemmContext* context,
140 const std::vector<gemm_t>& gemms) {
141 const double time_per_iter =
142 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
143 double ops = 0;
144 for (auto gemm : gemms) {
145 ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
146 }
147 return 1e-9 * ops / time_per_iter;
148 }
149
benchmark(GemmContext * context)150 void benchmark(GemmContext* context) {
151 std::map<gemm_t, std::vector<double>> benchmark_results;
152
153 std::vector<gemm_t> benchmark_gemms;
154 benchmark_gemms.emplace_back(10, 10, 10);
155 benchmark_gemms.emplace_back(20, 20, 20);
156 benchmark_gemms.emplace_back(30, 30, 30);
157 benchmark_gemms.emplace_back(40, 40, 40);
158 benchmark_gemms.emplace_back(50, 50, 50);
159 benchmark_gemms.emplace_back(60, 60, 60);
160 benchmark_gemms.emplace_back(64, 256, 147);
161 benchmark_gemms.emplace_back(100, 100, 1);
162 benchmark_gemms.emplace_back(100, 100, 100);
163 benchmark_gemms.emplace_back(100, 1000, 100);
164 benchmark_gemms.emplace_back(1000, 1000, 1);
165 benchmark_gemms.emplace_back(1000, 1000, 10);
166 benchmark_gemms.emplace_back(1000, 1000, 100);
167 benchmark_gemms.emplace_back(1000, 1000, 1000);
168
169 const int repeat = 2;
170
171 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
172 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
173 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
174
175 #ifdef GEMMLOWP_TEST_PROFILE
176 gemmlowp::RegisterCurrentThreadForProfiling();
177 gemmlowp::StartProfiling();
178 #endif
179
180 // We don't record the first repetition, it's just warm-up.
181 for (int r = 0; r < repeat + 1; r++) {
182 std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
183 << std::flush;
184 for (auto gemm : benchmark_gemms) {
185 double gflops = 0;
186 std::vector<gemm_t> unique_gemm;
187 unique_gemm.push_back(gemm);
188 gflops =
189 gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
190 if (r > 0) {
191 benchmark_results[gemm].emplace_back(gflops);
192 }
193 }
194 }
195
196 #ifdef GEMMLOWP_TEST_PROFILE
197 gemmlowp::FinishProfiling();
198 #endif
199
200 std::cout << " \r"
201 << std::flush;
202
203 std::cout.precision(4);
204
205 for (auto b : benchmark_results) {
206 sort(b.second.begin(), b.second.end());
207 std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
208 << " : " << b.second.back() << " GFlops/s" << std::endl;
209 }
210 std::cout << std::endl;
211 }
212
benchmark_gemm_sizes(GemmContext * context,const std::vector<gemm_t> & gemms,double mintime)213 void benchmark_gemm_sizes(GemmContext* context,
214 const std::vector<gemm_t>& gemms, double mintime) {
215 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
216 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
217 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
218
219 std::vector<float> gemm_times;
220 std::cout << "running for " << mintime << " seconds..." << std::endl;
221
222 #ifdef GEMMLOWP_TEST_PROFILE
223 gemmlowp::RegisterCurrentThreadForProfiling();
224 gemmlowp::StartProfiling();
225 #endif
226
227 double starttime = real_time_in_seconds();
228 while (real_time_in_seconds() < starttime + mintime) {
229 gemm_times.push_back(
230 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
231 }
232
233 #ifdef GEMMLOWP_TEST_PROFILE
234 gemmlowp::FinishProfiling();
235 #endif
236
237 std::sort(gemm_times.begin(), gemm_times.end());
238
239 double sum_gemm_times = 0;
240 double sum_gemm_times_trimmed = 0;
241 int count_gemm_times_trimmed = 0;
242 const float trim_ratio = 0.25;
243 const size_t count_trimmed = gemm_times.size() * trim_ratio;
244 double sum_gemm_times_best = 0;
245 int count_gemm_times_best = 0;
246 const float best_ratio = 0.1;
247 const size_t count_best = gemm_times.size() * best_ratio;
248
249 for (size_t i = 0; i < gemm_times.size(); i++) {
250 sum_gemm_times += gemm_times[i];
251 if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
252 sum_gemm_times_trimmed += gemm_times[i];
253 count_gemm_times_trimmed++;
254 }
255 if (i < count_best) {
256 sum_gemm_times_best += gemm_times[i];
257 count_gemm_times_best++;
258 }
259 }
260
261 const double min_latency = gemm_times.front();
262 const double max_latency = gemm_times.back();
263 const double mean_latency = sum_gemm_times / gemm_times.size();
264 const double trimmed_mean_latency =
265 sum_gemm_times_trimmed / count_gemm_times_trimmed;
266 const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
267
268 std::cout << "Graph latency (over " << gemm_times.size()
269 << " iterations):" << std::endl;
270 std::cout << " Best: " << min_latency << "s" << std::endl;
271 std::cout << " Worst: " << max_latency << "s" << std::endl;
272 std::cout << " Mean: " << mean_latency << "s" << std::endl;
273 std::cout << " " << 100 * trim_ratio
274 << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
275 std::cout << " Mean of " << 100 * best_ratio
276 << "% best: " << best_mean_latency << "s" << std::endl;
277 }
278
benchmark_googlenet(GemmContext * context)279 void benchmark_googlenet(GemmContext* context) {
280 // These are the m, n, k sizes for a typical GoogLeNet.
281 const int googlenet_gemm_sizes[] = {
282 12544, 64, 147, 3136, 64, 64, 3136, 192, 576, 784, 64, 192,
283 784, 96, 192, 784, 128, 864, 784, 16, 192, 784, 32, 400,
284 784, 32, 192, 784, 128, 256, 784, 128, 256, 784, 192, 1152,
285 784, 32, 256, 784, 96, 800, 784, 64, 256, 196, 192, 480,
286 196, 96, 480, 196, 204, 864, 196, 16, 480, 196, 48, 400,
287 196, 64, 480, 196, 160, 508, 196, 112, 508, 196, 224, 1008,
288 196, 24, 508, 196, 64, 600, 196, 64, 508, 196, 128, 512,
289 196, 128, 512, 196, 256, 1152, 196, 24, 512, 196, 64, 600,
290 196, 64, 512, 196, 112, 512, 196, 144, 512, 196, 288, 1296,
291 196, 32, 512, 196, 64, 800, 196, 64, 512, 196, 256, 528,
292 196, 160, 528, 196, 320, 1440, 196, 32, 528, 196, 128, 800,
293 196, 128, 528, 49, 256, 832, 49, 160, 832, 49, 320, 1440,
294 49, 48, 832, 49, 128, 1200, 49, 128, 832, 49, 384, 832,
295 49, 192, 832, 49, 384, 1728, 49, 48, 832, 49, 128, 1200,
296 49, 128, 832, 16, 128, 508, 1, 1024, 2048, 1, 1008, 1024,
297 16, 128, 528, 1, 1024, 2048, 1, 1008, 1024, 1, 1008, 1024,
298 };
299 assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
300 0);
301 const std::size_t num_googlenet_gemms =
302 sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));
303
304 std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
305 for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
306 googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
307 googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
308 googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
309 }
310
311 const double mintime = 20.0;
312 benchmark_gemm_sizes(context, googlenet_gemms, mintime);
313 }
314
benchmark_small_model(GemmContext * context)315 void benchmark_small_model(GemmContext* context) {
316 // These are the m, n, k sizes for a small model with large batches.
317 const int small_model_gemm_sizes[] = {
318 29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
319 };
320 assert(sizeof(small_model_gemm_sizes) %
321 (3 * sizeof(small_model_gemm_sizes[0])) ==
322 0);
323 const std::size_t num_small_model_gemms =
324 sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
325
326 std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
327 for (std::size_t i = 0; i < num_small_model_gemms; i++) {
328 small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
329 small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
330 small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
331 }
332
333 const double mintime = 10.0;
334 benchmark_gemm_sizes(context, small_model_gemms, mintime);
335 }
336
benchmark_all()337 void benchmark_all() {
338 {
339 gemmlowp::GemmContext context;
340 std::cout << "Benchmarking small model GEMMs..." << std::endl;
341 gemmlowp::benchmark_small_model(&context);
342 }
343
344 {
345 gemmlowp::GemmContext context;
346 std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
347 gemmlowp::benchmark_googlenet(&context);
348 }
349
350 {
351 gemmlowp::GemmContext context;
352 context.set_max_num_threads(0);
353 std::cout << "Benchmarking multi-threaded mode..." << std::endl;
354 gemmlowp::benchmark(&context);
355 }
356
357 {
358 gemmlowp::GemmContext context;
359 context.set_max_num_threads(1);
360 std::cout << "Benchmarking single-threaded mode..." << std::endl;
361 gemmlowp::benchmark(&context);
362 }
363 }
364
365 } // end namespace gemmlowp
366
367 // For iOS, we need to define our own main(), so skip it here.
368 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
main()369 int main() { gemmlowp::benchmark_all(); }
370 #endif
371