xref: /aosp_15_r20/external/gemmlowp/test/benchmark.cc (revision 5f39d1b313f0528e11bae88b3029b54b9e1033e7)
1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifdef __APPLE__
16 #include <sys/time.h>
17 #endif
18 
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
25 #ifdef __APPLE__
26 #include <TargetConditionals.h>
27 #endif
28 
29 #include "test.h"
30 
31 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
32 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
33 #endif
34 
35 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
36 #warning "Building without NEON support on ARM, check your compiler setup!"
37 #endif
38 
39 #if defined(__mips) && !defined(GEMMLOWP_MSA)
40 #warning "Building without MSA support on MIPS, check your compiler setup!"
41 #endif
42 
43 #if defined(__AVX2__) && !defined(GEMMLOWP_AVX2)
44 #warning \
45     "Building without AVX2 support on AVX2 enabled machine, check your compiler setup!"
46 #endif
47 
48 #if defined(__SSE4_2__) && !defined(GEMMLOWP_AVX2) && !defined(GEMMLOWP_SSE4)
49 #warning \
50     "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
51 #endif
52 
53 namespace gemmlowp {
54 
// Shortest wall-clock interval, in seconds, that a timed batch must last
// before its measurement is accepted.
const double min_accurate_duration = 0.1;
// Lower bound, in bytes (16 MiB), used when sizing the pool of matrices that
// the timing loop cycles through.
const std::size_t min_working_set_size = static_cast<std::size_t>(16) << 20;
57 
// Shape of a single matrix product: a (rows x depth) left-hand side times a
// (depth x cols) right-hand side, producing a (rows x cols) result.
struct gemm_t {
  int rows = 0;
  int depth = 0;
  int cols = 0;

  // All dimensions start at zero.
  gemm_t() {}

  // Builds a shape from explicit dimensions.
  gemm_t(int r, int d, int c) {
    rows = r;
    depth = d;
    cols = c;
  }
};
63 
operator <(const gemm_t & a,const gemm_t & b)64 bool operator<(const gemm_t& a, const gemm_t& b) {
65   return a.rows < b.rows ||
66          (a.rows <= b.rows &&
67           (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
68 }
69 
70 template <typename LhsType, typename RhsType, typename ResultType>
time_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)71 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
72   typedef std::uint8_t Scalar;
73 
74   // set up the matrix pool
75 
76   std::size_t combined_gemm_sizes = 0;
77   for (auto gemm : gemms) {
78     int rows = gemm.rows;
79     int depth = gemm.depth;
80     int cols = gemm.cols;
81     combined_gemm_sizes +=
82         sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
83   }
84 
85   const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
86 
87   std::vector<LhsType> lhs(pool_size * gemms.size());
88   std::vector<RhsType> rhs(pool_size * gemms.size());
89   std::vector<ResultType> result(pool_size * gemms.size());
90 
91   for (std::size_t i = 0; i < pool_size; i++) {
92     for (std::size_t j = 0; j < gemms.size(); j++) {
93       int k = i * gemms.size() + j;
94       lhs[k].Resize(gemms[j].rows, gemms[j].depth);
95       MakeConstant(&lhs[k], 0);
96       rhs[k].Resize(gemms[j].depth, gemms[j].cols);
97       MakeConstant(&rhs[k], 0);
98       result[k].Resize(gemms[j].rows, gemms[j].cols);
99       MakeConstant(&result[k], 0);
100     }
101   }
102 
103   // main benchmark loop
104 
105   int iters_at_a_time = 1;
106   float time_per_iter = 0.0f;
107   std::size_t pool_index = 0;
108 
109   while (true) {
110     double starttime = real_time_in_seconds();
111     for (int i = 0; i < iters_at_a_time; i++) {
112       for (size_t j = 0; j < gemms.size(); j++) {
113         size_t k = pool_index * gemms.size() + j;
114         Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
115             context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
116             -75, -91, 74980, 123, 20);
117       }
118       pool_index++;
119       if (pool_index == pool_size) {
120         pool_index = 0;
121       }
122     }
123     double endtime = real_time_in_seconds();
124 
125     const float timing = static_cast<float>(endtime - starttime);
126 
127     if (timing >= min_accurate_duration) {
128       time_per_iter = timing / iters_at_a_time;
129       break;
130     }
131 
132     iters_at_a_time *= 2;
133   }
134 
135   return time_per_iter;
136 }
137 
138 template <typename LhsType, typename RhsType, typename ResultType>
gflops_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)139 double gflops_for_gemms(GemmContext* context,
140                         const std::vector<gemm_t>& gemms) {
141   const double time_per_iter =
142       time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
143   double ops = 0;
144   for (auto gemm : gemms) {
145     ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
146   }
147   return 1e-9 * ops / time_per_iter;
148 }
149 
benchmark(GemmContext * context)150 void benchmark(GemmContext* context) {
151   std::map<gemm_t, std::vector<double>> benchmark_results;
152 
153   std::vector<gemm_t> benchmark_gemms;
154   benchmark_gemms.emplace_back(10, 10, 10);
155   benchmark_gemms.emplace_back(20, 20, 20);
156   benchmark_gemms.emplace_back(30, 30, 30);
157   benchmark_gemms.emplace_back(40, 40, 40);
158   benchmark_gemms.emplace_back(50, 50, 50);
159   benchmark_gemms.emplace_back(60, 60, 60);
160   benchmark_gemms.emplace_back(64, 256, 147);
161   benchmark_gemms.emplace_back(100, 100, 1);
162   benchmark_gemms.emplace_back(100, 100, 100);
163   benchmark_gemms.emplace_back(100, 1000, 100);
164   benchmark_gemms.emplace_back(1000, 1000, 1);
165   benchmark_gemms.emplace_back(1000, 1000, 10);
166   benchmark_gemms.emplace_back(1000, 1000, 100);
167   benchmark_gemms.emplace_back(1000, 1000, 1000);
168 
169   const int repeat = 2;
170 
171   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
172   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
173   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
174 
175 #ifdef GEMMLOWP_TEST_PROFILE
176   gemmlowp::RegisterCurrentThreadForProfiling();
177   gemmlowp::StartProfiling();
178 #endif
179 
180   // We don't record the first repetition, it's just warm-up.
181   for (int r = 0; r < repeat + 1; r++) {
182     std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
183               << std::flush;
184     for (auto gemm : benchmark_gemms) {
185       double gflops = 0;
186       std::vector<gemm_t> unique_gemm;
187       unique_gemm.push_back(gemm);
188       gflops =
189           gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
190       if (r > 0) {
191         benchmark_results[gemm].emplace_back(gflops);
192       }
193     }
194   }
195 
196 #ifdef GEMMLOWP_TEST_PROFILE
197   gemmlowp::FinishProfiling();
198 #endif
199 
200   std::cout << "                                                \r"
201             << std::flush;
202 
203   std::cout.precision(4);
204 
205   for (auto b : benchmark_results) {
206     sort(b.second.begin(), b.second.end());
207     std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
208               << " : " << b.second.back() << " GFlops/s" << std::endl;
209   }
210   std::cout << std::endl;
211 }
212 
benchmark_gemm_sizes(GemmContext * context,const std::vector<gemm_t> & gemms,double mintime)213 void benchmark_gemm_sizes(GemmContext* context,
214                           const std::vector<gemm_t>& gemms, double mintime) {
215   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
216   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
217   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
218 
219   std::vector<float> gemm_times;
220   std::cout << "running for " << mintime << " seconds..." << std::endl;
221 
222 #ifdef GEMMLOWP_TEST_PROFILE
223   gemmlowp::RegisterCurrentThreadForProfiling();
224   gemmlowp::StartProfiling();
225 #endif
226 
227   double starttime = real_time_in_seconds();
228   while (real_time_in_seconds() < starttime + mintime) {
229     gemm_times.push_back(
230         time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
231   }
232 
233 #ifdef GEMMLOWP_TEST_PROFILE
234   gemmlowp::FinishProfiling();
235 #endif
236 
237   std::sort(gemm_times.begin(), gemm_times.end());
238 
239   double sum_gemm_times = 0;
240   double sum_gemm_times_trimmed = 0;
241   int count_gemm_times_trimmed = 0;
242   const float trim_ratio = 0.25;
243   const size_t count_trimmed = gemm_times.size() * trim_ratio;
244   double sum_gemm_times_best = 0;
245   int count_gemm_times_best = 0;
246   const float best_ratio = 0.1;
247   const size_t count_best = gemm_times.size() * best_ratio;
248 
249   for (size_t i = 0; i < gemm_times.size(); i++) {
250     sum_gemm_times += gemm_times[i];
251     if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
252       sum_gemm_times_trimmed += gemm_times[i];
253       count_gemm_times_trimmed++;
254     }
255     if (i < count_best) {
256       sum_gemm_times_best += gemm_times[i];
257       count_gemm_times_best++;
258     }
259   }
260 
261   const double min_latency = gemm_times.front();
262   const double max_latency = gemm_times.back();
263   const double mean_latency = sum_gemm_times / gemm_times.size();
264   const double trimmed_mean_latency =
265       sum_gemm_times_trimmed / count_gemm_times_trimmed;
266   const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
267 
268   std::cout << "Graph latency (over " << gemm_times.size()
269             << " iterations):" << std::endl;
270   std::cout << "  Best:             " << min_latency << "s" << std::endl;
271   std::cout << "  Worst:            " << max_latency << "s" << std::endl;
272   std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
273   std::cout << "  " << 100 * trim_ratio
274             << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
275   std::cout << "  Mean of " << 100 * best_ratio
276             << "% best: " << best_mean_latency << "s" << std::endl;
277 }
278 
benchmark_googlenet(GemmContext * context)279 void benchmark_googlenet(GemmContext* context) {
280   // These are the m, n, k sizes for a typical GoogLeNet.
281   const int googlenet_gemm_sizes[] = {
282       12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
283       784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
284       784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
285       784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
286       196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
287       196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
288       196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
289       196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
290       196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
291       196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
292       196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
293       196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
294       49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
295       49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
296       49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
297       16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
298   };
299   assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
300          0);
301   const std::size_t num_googlenet_gemms =
302       sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));
303 
304   std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
305   for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
306     googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
307     googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
308     googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
309   }
310 
311   const double mintime = 20.0;
312   benchmark_gemm_sizes(context, googlenet_gemms, mintime);
313 }
314 
benchmark_small_model(GemmContext * context)315 void benchmark_small_model(GemmContext* context) {
316   // These are the m, n, k sizes for a small model with large batches.
317   const int small_model_gemm_sizes[] = {
318       29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
319   };
320   assert(sizeof(small_model_gemm_sizes) %
321              (3 * sizeof(small_model_gemm_sizes[0])) ==
322          0);
323   const std::size_t num_small_model_gemms =
324       sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
325 
326   std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
327   for (std::size_t i = 0; i < num_small_model_gemms; i++) {
328     small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
329     small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
330     small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
331   }
332 
333   const double mintime = 10.0;
334   benchmark_gemm_sizes(context, small_model_gemms, mintime);
335 }
336 
benchmark_all()337 void benchmark_all() {
338   {
339     gemmlowp::GemmContext context;
340     std::cout << "Benchmarking small model GEMMs..." << std::endl;
341     gemmlowp::benchmark_small_model(&context);
342   }
343 
344   {
345     gemmlowp::GemmContext context;
346     std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
347     gemmlowp::benchmark_googlenet(&context);
348   }
349 
350   {
351     gemmlowp::GemmContext context;
352     context.set_max_num_threads(0);
353     std::cout << "Benchmarking multi-threaded mode..." << std::endl;
354     gemmlowp::benchmark(&context);
355   }
356 
357   {
358     gemmlowp::GemmContext context;
359     context.set_max_num_threads(1);
360     std::cout << "Benchmarking single-threaded mode..." << std::endl;
361     gemmlowp::benchmark(&context);
362   }
363 }
364 
365 }  // end namespace gemmlowp
366 
367 // For iOS, we need to define our own main(), so skip it here.
368 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
main()369 int main() { gemmlowp::benchmark_all(); }
370 #endif
371