1*bf2c3715SXin Li // This file is part of Eigen, a lightweight C++ template library
2*bf2c3715SXin Li // for linear algebra.
3*bf2c3715SXin Li //
4*bf2c3715SXin Li // Copyright (C) 2015 Benoit Jacob <[email protected]>
5*bf2c3715SXin Li //
6*bf2c3715SXin Li // This Source Code Form is subject to the terms of the Mozilla
7*bf2c3715SXin Li // Public License v. 2.0. If a copy of the MPL was not distributed
8*bf2c3715SXin Li // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9*bf2c3715SXin Li
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <Eigen/Core>
23*bf2c3715SXin Li
24*bf2c3715SXin Li using namespace std;
25*bf2c3715SXin Li
// Default precision.
// NOTE(review): not referenced in the visible part of this file; presumably
// consumed by the reporting code further down -- confirm.
constexpr int default_precision = 4;

// see --only-cubic-sizes
// When true, the input parser drops every measurement whose product size is
// not cubic (k == m == n), and table dumping refuses to run.
bool only_cubic_sizes{false};

// see --dump-tables
// When true, print_partition() also emits a lookup table for each subset.
bool dump_tables{false};
33*bf2c3715SXin Li
// Returns the base-2 logarithm of x, rounded down: the number of times x can
// be shifted right by one bit before it becomes zero, minus the first shift.
// Exact when x is a power of two. Returns 0 for both x == 0 and x == 1.
uint8_t log2_pot(size_t x) {
  uint8_t shifts = 0;
  for (x >>= 1; x != 0; x >>= 1) {
    ++shifts;
  }
  return shifts;
}
39*bf2c3715SXin Li
compact_size_triple(size_t k,size_t m,size_t n)40*bf2c3715SXin Li uint16_t compact_size_triple(size_t k, size_t m, size_t n)
41*bf2c3715SXin Li {
42*bf2c3715SXin Li return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
43*bf2c3715SXin Li }
44*bf2c3715SXin Li
// Helper storing a triple of K, M, N sizes for a matrix product.
//
// A triple can be built from explicit sizes or decoded from the "compact"
// form, in which three consecutive nibbles hold log2(k), log2(m) and log2(n)
// (k in bits 8-11, m in bits 4-7, n in bits 0-3).
//
// No copy operations are declared on purpose: the compiler-generated
// member-wise copy is exactly what is needed. A user-provided copy
// constructor without a matching copy assignment would make the implicit
// copy assignment (used when a triple is assigned into an entry struct) a
// deprecated construct and would make the type non-trivially-copyable.
struct size_triple_t
{
  uint16_t k, m, n;

  // The all-zero triple.
  size_triple_t() : k(0), m(0), n(0) {}

  // Builds a triple from explicit sizes. Each size is stored in 16 bits, so
  // only the low 16 bits of every argument are kept.
  size_triple_t(size_t _k, size_t _m, size_t _n)
    : k(static_cast<uint16_t>(_k))
    , m(static_cast<uint16_t>(_m))
    , n(static_cast<uint16_t>(_n))
  {}

  // Decodes a compact triple: every nibble is the exponent of a power of two.
  // Bits 12-15 of `compact` are ignored.
  size_triple_t(uint16_t compact)
    : k(static_cast<uint16_t>(1u << ((compact & 0xf00) >> 8)))
    , m(static_cast<uint16_t>(1u << ((compact & 0x0f0) >> 4)))
    , n(static_cast<uint16_t>(1u << ((compact & 0x00f) >> 0)))
  {}

  // True when the three sizes are equal.
  bool is_cubic() const { return k == m && m == n; }
};
60*bf2c3715SXin Li
operator <<(ostream & s,const size_triple_t & t)61*bf2c3715SXin Li ostream& operator<<(ostream& s, const size_triple_t& t)
62*bf2c3715SXin Li {
63*bf2c3715SXin Li return s << "(" << t.k << ", " << t.m << ", " << t.n << ")";
64*bf2c3715SXin Li }
65*bf2c3715SXin Li
66*bf2c3715SXin Li struct inputfile_entry_t
67*bf2c3715SXin Li {
68*bf2c3715SXin Li uint16_t product_size;
69*bf2c3715SXin Li uint16_t pot_block_size;
70*bf2c3715SXin Li size_triple_t nonpot_block_size;
71*bf2c3715SXin Li float gflops;
72*bf2c3715SXin Li };
73*bf2c3715SXin Li
74*bf2c3715SXin Li struct inputfile_t
75*bf2c3715SXin Li {
76*bf2c3715SXin Li enum class type_t {
77*bf2c3715SXin Li unknown,
78*bf2c3715SXin Li all_pot_sizes,
79*bf2c3715SXin Li default_sizes
80*bf2c3715SXin Li };
81*bf2c3715SXin Li
82*bf2c3715SXin Li string filename;
83*bf2c3715SXin Li vector<inputfile_entry_t> entries;
84*bf2c3715SXin Li type_t type;
85*bf2c3715SXin Li
inputfile_tinputfile_t86*bf2c3715SXin Li inputfile_t(const string& fname)
87*bf2c3715SXin Li : filename(fname)
88*bf2c3715SXin Li , type(type_t::unknown)
89*bf2c3715SXin Li {
90*bf2c3715SXin Li ifstream stream(filename);
91*bf2c3715SXin Li if (!stream.is_open()) {
92*bf2c3715SXin Li cerr << "couldn't open input file: " << filename << endl;
93*bf2c3715SXin Li exit(1);
94*bf2c3715SXin Li }
95*bf2c3715SXin Li string line;
96*bf2c3715SXin Li while (getline(stream, line)) {
97*bf2c3715SXin Li if (line.empty()) continue;
98*bf2c3715SXin Li if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
99*bf2c3715SXin Li if (type != type_t::unknown) {
100*bf2c3715SXin Li cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
101*bf2c3715SXin Li exit(1);
102*bf2c3715SXin Li }
103*bf2c3715SXin Li type = type_t::all_pot_sizes;
104*bf2c3715SXin Li continue;
105*bf2c3715SXin Li }
106*bf2c3715SXin Li if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
107*bf2c3715SXin Li if (type != type_t::unknown) {
108*bf2c3715SXin Li cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
109*bf2c3715SXin Li exit(1);
110*bf2c3715SXin Li }
111*bf2c3715SXin Li type = type_t::default_sizes;
112*bf2c3715SXin Li continue;
113*bf2c3715SXin Li }
114*bf2c3715SXin Li
115*bf2c3715SXin Li
116*bf2c3715SXin Li if (type == type_t::unknown) {
117*bf2c3715SXin Li continue;
118*bf2c3715SXin Li }
119*bf2c3715SXin Li switch(type) {
120*bf2c3715SXin Li case type_t::all_pot_sizes: {
121*bf2c3715SXin Li unsigned int product_size, block_size;
122*bf2c3715SXin Li float gflops;
123*bf2c3715SXin Li int sscanf_result =
124*bf2c3715SXin Li sscanf(line.c_str(), "%x %x %f",
125*bf2c3715SXin Li &product_size,
126*bf2c3715SXin Li &block_size,
127*bf2c3715SXin Li &gflops);
128*bf2c3715SXin Li if (3 != sscanf_result ||
129*bf2c3715SXin Li !product_size ||
130*bf2c3715SXin Li product_size > 0xfff ||
131*bf2c3715SXin Li !block_size ||
132*bf2c3715SXin Li block_size > 0xfff ||
133*bf2c3715SXin Li !isfinite(gflops))
134*bf2c3715SXin Li {
135*bf2c3715SXin Li cerr << "ill-formed input file: " << filename << endl;
136*bf2c3715SXin Li cerr << "offending line:" << endl << line << endl;
137*bf2c3715SXin Li exit(1);
138*bf2c3715SXin Li }
139*bf2c3715SXin Li if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
140*bf2c3715SXin Li continue;
141*bf2c3715SXin Li }
142*bf2c3715SXin Li inputfile_entry_t entry;
143*bf2c3715SXin Li entry.product_size = uint16_t(product_size);
144*bf2c3715SXin Li entry.pot_block_size = uint16_t(block_size);
145*bf2c3715SXin Li entry.gflops = gflops;
146*bf2c3715SXin Li entries.push_back(entry);
147*bf2c3715SXin Li break;
148*bf2c3715SXin Li }
149*bf2c3715SXin Li case type_t::default_sizes: {
150*bf2c3715SXin Li unsigned int product_size;
151*bf2c3715SXin Li float gflops;
152*bf2c3715SXin Li int bk, bm, bn;
153*bf2c3715SXin Li int sscanf_result =
154*bf2c3715SXin Li sscanf(line.c_str(), "%x default(%d, %d, %d) %f",
155*bf2c3715SXin Li &product_size,
156*bf2c3715SXin Li &bk, &bm, &bn,
157*bf2c3715SXin Li &gflops);
158*bf2c3715SXin Li if (5 != sscanf_result ||
159*bf2c3715SXin Li !product_size ||
160*bf2c3715SXin Li product_size > 0xfff ||
161*bf2c3715SXin Li !isfinite(gflops))
162*bf2c3715SXin Li {
163*bf2c3715SXin Li cerr << "ill-formed input file: " << filename << endl;
164*bf2c3715SXin Li cerr << "offending line:" << endl << line << endl;
165*bf2c3715SXin Li exit(1);
166*bf2c3715SXin Li }
167*bf2c3715SXin Li if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
168*bf2c3715SXin Li continue;
169*bf2c3715SXin Li }
170*bf2c3715SXin Li inputfile_entry_t entry;
171*bf2c3715SXin Li entry.product_size = uint16_t(product_size);
172*bf2c3715SXin Li entry.pot_block_size = 0;
173*bf2c3715SXin Li entry.nonpot_block_size = size_triple_t(bk, bm, bn);
174*bf2c3715SXin Li entry.gflops = gflops;
175*bf2c3715SXin Li entries.push_back(entry);
176*bf2c3715SXin Li break;
177*bf2c3715SXin Li }
178*bf2c3715SXin Li
179*bf2c3715SXin Li default:
180*bf2c3715SXin Li break;
181*bf2c3715SXin Li }
182*bf2c3715SXin Li }
183*bf2c3715SXin Li stream.close();
184*bf2c3715SXin Li if (type == type_t::unknown) {
185*bf2c3715SXin Li cerr << "Unrecognized input file " << filename << endl;
186*bf2c3715SXin Li exit(1);
187*bf2c3715SXin Li }
188*bf2c3715SXin Li if (entries.empty()) {
189*bf2c3715SXin Li cerr << "didn't find any measurements in input file: " << filename << endl;
190*bf2c3715SXin Li exit(1);
191*bf2c3715SXin Li }
192*bf2c3715SXin Li }
193*bf2c3715SXin Li };
194*bf2c3715SXin Li
// One measurement after preprocessing: the raw gflops figure has been
// replaced by an efficiency relative to the best entry of the same
// product size.
struct preprocessed_inputfile_entry_t
{
  // Product size, copied unchanged from the input entry.
  uint16_t product_size;

  // Block size, copied from the input entry's pot_block_size.
  uint16_t block_size;

  // gflops of this entry divided by the highest gflops measured for the
  // same product size in the same file.
  float efficiency;
};
202*bf2c3715SXin Li
lower_efficiency(const preprocessed_inputfile_entry_t & e1,const preprocessed_inputfile_entry_t & e2)203*bf2c3715SXin Li bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2)
204*bf2c3715SXin Li {
205*bf2c3715SXin Li return e1.efficiency < e2.efficiency;
206*bf2c3715SXin Li }
207*bf2c3715SXin Li
208*bf2c3715SXin Li struct preprocessed_inputfile_t
209*bf2c3715SXin Li {
210*bf2c3715SXin Li string filename;
211*bf2c3715SXin Li vector<preprocessed_inputfile_entry_t> entries;
212*bf2c3715SXin Li
preprocessed_inputfile_tpreprocessed_inputfile_t213*bf2c3715SXin Li preprocessed_inputfile_t(const inputfile_t& inputfile)
214*bf2c3715SXin Li : filename(inputfile.filename)
215*bf2c3715SXin Li {
216*bf2c3715SXin Li if (inputfile.type != inputfile_t::type_t::all_pot_sizes) {
217*bf2c3715SXin Li abort();
218*bf2c3715SXin Li }
219*bf2c3715SXin Li auto it = inputfile.entries.begin();
220*bf2c3715SXin Li auto it_first_with_given_product_size = it;
221*bf2c3715SXin Li while (it != inputfile.entries.end()) {
222*bf2c3715SXin Li ++it;
223*bf2c3715SXin Li if (it == inputfile.entries.end() ||
224*bf2c3715SXin Li it->product_size != it_first_with_given_product_size->product_size)
225*bf2c3715SXin Li {
226*bf2c3715SXin Li import_input_file_range_one_product_size(it_first_with_given_product_size, it);
227*bf2c3715SXin Li it_first_with_given_product_size = it;
228*bf2c3715SXin Li }
229*bf2c3715SXin Li }
230*bf2c3715SXin Li }
231*bf2c3715SXin Li
232*bf2c3715SXin Li private:
import_input_file_range_one_product_sizepreprocessed_inputfile_t233*bf2c3715SXin Li void import_input_file_range_one_product_size(
234*bf2c3715SXin Li const vector<inputfile_entry_t>::const_iterator& begin,
235*bf2c3715SXin Li const vector<inputfile_entry_t>::const_iterator& end)
236*bf2c3715SXin Li {
237*bf2c3715SXin Li uint16_t product_size = begin->product_size;
238*bf2c3715SXin Li float max_gflops = 0.0f;
239*bf2c3715SXin Li for (auto it = begin; it != end; ++it) {
240*bf2c3715SXin Li if (it->product_size != product_size) {
241*bf2c3715SXin Li cerr << "Unexpected ordering of entries in " << filename << endl;
242*bf2c3715SXin Li cerr << "(Expected all entries for product size " << hex << product_size << dec << " to be grouped)" << endl;
243*bf2c3715SXin Li exit(1);
244*bf2c3715SXin Li }
245*bf2c3715SXin Li max_gflops = max(max_gflops, it->gflops);
246*bf2c3715SXin Li }
247*bf2c3715SXin Li for (auto it = begin; it != end; ++it) {
248*bf2c3715SXin Li preprocessed_inputfile_entry_t entry;
249*bf2c3715SXin Li entry.product_size = it->product_size;
250*bf2c3715SXin Li entry.block_size = it->pot_block_size;
251*bf2c3715SXin Li entry.efficiency = it->gflops / max_gflops;
252*bf2c3715SXin Li entries.push_back(entry);
253*bf2c3715SXin Li }
254*bf2c3715SXin Li }
255*bf2c3715SXin Li };
256*bf2c3715SXin Li
check_all_files_in_same_exact_order(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles)257*bf2c3715SXin Li void check_all_files_in_same_exact_order(
258*bf2c3715SXin Li const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
259*bf2c3715SXin Li {
260*bf2c3715SXin Li if (preprocessed_inputfiles.empty()) {
261*bf2c3715SXin Li return;
262*bf2c3715SXin Li }
263*bf2c3715SXin Li
264*bf2c3715SXin Li const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[0];
265*bf2c3715SXin Li const size_t num_entries = first_file.entries.size();
266*bf2c3715SXin Li
267*bf2c3715SXin Li for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
268*bf2c3715SXin Li if (preprocessed_inputfiles[i].entries.size() != num_entries) {
269*bf2c3715SXin Li cerr << "these files have different number of entries: "
270*bf2c3715SXin Li << preprocessed_inputfiles[i].filename
271*bf2c3715SXin Li << " and "
272*bf2c3715SXin Li << first_file.filename
273*bf2c3715SXin Li << endl;
274*bf2c3715SXin Li exit(1);
275*bf2c3715SXin Li }
276*bf2c3715SXin Li }
277*bf2c3715SXin Li
278*bf2c3715SXin Li for (size_t entry_index = 0; entry_index < num_entries; entry_index++) {
279*bf2c3715SXin Li const uint16_t entry_product_size = first_file.entries[entry_index].product_size;
280*bf2c3715SXin Li const uint16_t entry_block_size = first_file.entries[entry_index].block_size;
281*bf2c3715SXin Li for (size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) {
282*bf2c3715SXin Li const preprocessed_inputfile_t& cur_file = preprocessed_inputfiles[file_index];
283*bf2c3715SXin Li if (cur_file.entries[entry_index].product_size != entry_product_size ||
284*bf2c3715SXin Li cur_file.entries[entry_index].block_size != entry_block_size)
285*bf2c3715SXin Li {
286*bf2c3715SXin Li cerr << "entries not in same order between these files: "
287*bf2c3715SXin Li << first_file.filename
288*bf2c3715SXin Li << " and "
289*bf2c3715SXin Li << cur_file.filename
290*bf2c3715SXin Li << endl;
291*bf2c3715SXin Li exit(1);
292*bf2c3715SXin Li }
293*bf2c3715SXin Li }
294*bf2c3715SXin Li }
295*bf2c3715SXin Li }
296*bf2c3715SXin Li
efficiency_of_subset(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,const vector<size_t> & subset)297*bf2c3715SXin Li float efficiency_of_subset(
298*bf2c3715SXin Li const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
299*bf2c3715SXin Li const vector<size_t>& subset)
300*bf2c3715SXin Li {
301*bf2c3715SXin Li if (subset.size() <= 1) {
302*bf2c3715SXin Li return 1.0f;
303*bf2c3715SXin Li }
304*bf2c3715SXin Li const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
305*bf2c3715SXin Li const size_t num_entries = first_file.entries.size();
306*bf2c3715SXin Li float efficiency = 1.0f;
307*bf2c3715SXin Li size_t entry_index = 0;
308*bf2c3715SXin Li size_t first_entry_index_with_this_product_size = 0;
309*bf2c3715SXin Li uint16_t product_size = first_file.entries[0].product_size;
310*bf2c3715SXin Li while (entry_index < num_entries) {
311*bf2c3715SXin Li ++entry_index;
312*bf2c3715SXin Li if (entry_index == num_entries ||
313*bf2c3715SXin Li first_file.entries[entry_index].product_size != product_size)
314*bf2c3715SXin Li {
315*bf2c3715SXin Li float efficiency_this_product_size = 0.0f;
316*bf2c3715SXin Li for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
317*bf2c3715SXin Li float efficiency_this_entry = 1.0f;
318*bf2c3715SXin Li for (auto i = subset.begin(); i != subset.end(); ++i) {
319*bf2c3715SXin Li efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
320*bf2c3715SXin Li }
321*bf2c3715SXin Li efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry);
322*bf2c3715SXin Li }
323*bf2c3715SXin Li efficiency = min(efficiency, efficiency_this_product_size);
324*bf2c3715SXin Li if (entry_index < num_entries) {
325*bf2c3715SXin Li first_entry_index_with_this_product_size = entry_index;
326*bf2c3715SXin Li product_size = first_file.entries[entry_index].product_size;
327*bf2c3715SXin Li }
328*bf2c3715SXin Li }
329*bf2c3715SXin Li }
330*bf2c3715SXin Li
331*bf2c3715SXin Li return efficiency;
332*bf2c3715SXin Li }
333*bf2c3715SXin Li
dump_table_for_subset(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,const vector<size_t> & subset)334*bf2c3715SXin Li void dump_table_for_subset(
335*bf2c3715SXin Li const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
336*bf2c3715SXin Li const vector<size_t>& subset)
337*bf2c3715SXin Li {
338*bf2c3715SXin Li const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
339*bf2c3715SXin Li const size_t num_entries = first_file.entries.size();
340*bf2c3715SXin Li size_t entry_index = 0;
341*bf2c3715SXin Li size_t first_entry_index_with_this_product_size = 0;
342*bf2c3715SXin Li uint16_t product_size = first_file.entries[0].product_size;
343*bf2c3715SXin Li size_t i = 0;
344*bf2c3715SXin Li size_triple_t min_product_size(first_file.entries.front().product_size);
345*bf2c3715SXin Li size_triple_t max_product_size(first_file.entries.back().product_size);
346*bf2c3715SXin Li if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {
347*bf2c3715SXin Li abort();
348*bf2c3715SXin Li }
349*bf2c3715SXin Li if (only_cubic_sizes) {
350*bf2c3715SXin Li cerr << "Can't generate tables with --only-cubic-sizes." << endl;
351*bf2c3715SXin Li abort();
352*bf2c3715SXin Li }
353*bf2c3715SXin Li cout << "struct LookupTable {" << endl;
354*bf2c3715SXin Li cout << " static const size_t BaseSize = " << min_product_size.k << ";" << endl;
355*bf2c3715SXin Li const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1;
356*bf2c3715SXin Li const size_t TableSize = NumSizes * NumSizes * NumSizes;
357*bf2c3715SXin Li cout << " static const size_t NumSizes = " << NumSizes << ";" << endl;
358*bf2c3715SXin Li cout << " static const unsigned short* Data() {" << endl;
359*bf2c3715SXin Li cout << " static const unsigned short data[" << TableSize << "] = {";
360*bf2c3715SXin Li while (entry_index < num_entries) {
361*bf2c3715SXin Li ++entry_index;
362*bf2c3715SXin Li if (entry_index == num_entries ||
363*bf2c3715SXin Li first_file.entries[entry_index].product_size != product_size)
364*bf2c3715SXin Li {
365*bf2c3715SXin Li float best_efficiency_this_product_size = 0.0f;
366*bf2c3715SXin Li uint16_t best_block_size_this_product_size = 0;
367*bf2c3715SXin Li for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
368*bf2c3715SXin Li float efficiency_this_entry = 1.0f;
369*bf2c3715SXin Li for (auto i = subset.begin(); i != subset.end(); ++i) {
370*bf2c3715SXin Li efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
371*bf2c3715SXin Li }
372*bf2c3715SXin Li if (efficiency_this_entry > best_efficiency_this_product_size) {
373*bf2c3715SXin Li best_efficiency_this_product_size = efficiency_this_entry;
374*bf2c3715SXin Li best_block_size_this_product_size = first_file.entries[e].block_size;
375*bf2c3715SXin Li }
376*bf2c3715SXin Li }
377*bf2c3715SXin Li if ((i++) % NumSizes) {
378*bf2c3715SXin Li cout << " ";
379*bf2c3715SXin Li } else {
380*bf2c3715SXin Li cout << endl << " ";
381*bf2c3715SXin Li }
382*bf2c3715SXin Li cout << "0x" << hex << best_block_size_this_product_size << dec;
383*bf2c3715SXin Li if (entry_index < num_entries) {
384*bf2c3715SXin Li cout << ",";
385*bf2c3715SXin Li first_entry_index_with_this_product_size = entry_index;
386*bf2c3715SXin Li product_size = first_file.entries[entry_index].product_size;
387*bf2c3715SXin Li }
388*bf2c3715SXin Li }
389*bf2c3715SXin Li }
390*bf2c3715SXin Li if (i != TableSize) {
391*bf2c3715SXin Li cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl;
392*bf2c3715SXin Li abort();
393*bf2c3715SXin Li }
394*bf2c3715SXin Li cout << endl << " };" << endl;
395*bf2c3715SXin Li cout << " return data;" << endl;
396*bf2c3715SXin Li cout << " }" << endl;
397*bf2c3715SXin Li cout << "};" << endl;
398*bf2c3715SXin Li }
399*bf2c3715SXin Li
efficiency_of_partition(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,const vector<vector<size_t>> & partition)400*bf2c3715SXin Li float efficiency_of_partition(
401*bf2c3715SXin Li const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
402*bf2c3715SXin Li const vector<vector<size_t>>& partition)
403*bf2c3715SXin Li {
404*bf2c3715SXin Li float efficiency = 1.0f;
405*bf2c3715SXin Li for (auto s = partition.begin(); s != partition.end(); ++s) {
406*bf2c3715SXin Li efficiency = min(efficiency, efficiency_of_subset(preprocessed_inputfiles, *s));
407*bf2c3715SXin Li }
408*bf2c3715SXin Li return efficiency;
409*bf2c3715SXin Li }
410*bf2c3715SXin Li
// Initialises `out_subset` to the first `subset_size`-element subset of
// {0, ..., set_size - 1} in enumeration order, i.e. {0, 1, ..., subset_size - 1}.
// Requires 1 <= subset_size <= set_size (checked by assert).
void make_first_subset(size_t subset_size, std::vector<size_t>& out_subset, size_t set_size)
{
  assert(subset_size >= 1 && subset_size <= set_size);
  out_subset.resize(subset_size);
  size_t next_value = 0;
  for (size_t& slot : out_subset) {
    slot = next_value++;
  }
}
419*bf2c3715SXin Li
// True when `subset` is the last subset of its size in enumeration order,
// which is recognised by its first element being set_size - subset.size().
// `subset` must not be empty.
bool is_last_subset(const std::vector<size_t>& subset, size_t set_size)
{
  const size_t smallest_member = subset.front();
  return smallest_member + subset.size() == set_size;
}
424*bf2c3715SXin Li
next_subset(vector<size_t> & inout_subset,size_t set_size)425*bf2c3715SXin Li void next_subset(vector<size_t>& inout_subset, size_t set_size)
426*bf2c3715SXin Li {
427*bf2c3715SXin Li if (is_last_subset(inout_subset, set_size)) {
428*bf2c3715SXin Li cerr << "iterating past the last subset" << endl;
429*bf2c3715SXin Li abort();
430*bf2c3715SXin Li }
431*bf2c3715SXin Li size_t i = 1;
432*bf2c3715SXin Li while (inout_subset[inout_subset.size() - i] == set_size - i) {
433*bf2c3715SXin Li i++;
434*bf2c3715SXin Li assert(i <= inout_subset.size());
435*bf2c3715SXin Li }
436*bf2c3715SXin Li size_t first_index_to_change = inout_subset.size() - i;
437*bf2c3715SXin Li inout_subset[first_index_to_change]++;
438*bf2c3715SXin Li size_t p = inout_subset[first_index_to_change];
439*bf2c3715SXin Li for (size_t j = first_index_to_change + 1; j < inout_subset.size(); j++) {
440*bf2c3715SXin Li inout_subset[j] = ++p;
441*bf2c3715SXin Li }
442*bf2c3715SXin Li }
443*bf2c3715SXin Li
// Largest number of subsets of a given size that the exhaustive search
// accepts to enumerate (see is_number_of_subsets_feasible).
constexpr size_t number_of_subsets_limit = 100;

// Lower bound applied by max_feasible_subset_size: its result is never
// smaller than min(n - 1, this value).
constexpr size_t always_search_subsets_of_size_at_least = 2;
446*bf2c3715SXin Li
is_number_of_subsets_feasible(size_t n,size_t p)447*bf2c3715SXin Li bool is_number_of_subsets_feasible(size_t n, size_t p)
448*bf2c3715SXin Li {
449*bf2c3715SXin Li assert(n>0 && p>0 && p<=n);
450*bf2c3715SXin Li uint64_t numerator = 1, denominator = 1;
451*bf2c3715SXin Li for (size_t i = 0; i < p; i++) {
452*bf2c3715SXin Li numerator *= n - i;
453*bf2c3715SXin Li denominator *= i + 1;
454*bf2c3715SXin Li if (numerator > denominator * number_of_subsets_limit) {
455*bf2c3715SXin Li return false;
456*bf2c3715SXin Li }
457*bf2c3715SXin Li }
458*bf2c3715SXin Li return true;
459*bf2c3715SXin Li }
460*bf2c3715SXin Li
max_feasible_subset_size(size_t n)461*bf2c3715SXin Li size_t max_feasible_subset_size(size_t n)
462*bf2c3715SXin Li {
463*bf2c3715SXin Li assert(n > 0);
464*bf2c3715SXin Li const size_t minresult = min<size_t>(n-1, always_search_subsets_of_size_at_least);
465*bf2c3715SXin Li for (size_t p = 1; p <= n - 1; p++) {
466*bf2c3715SXin Li if (!is_number_of_subsets_feasible(n, p+1)) {
467*bf2c3715SXin Li return max(p, minresult);
468*bf2c3715SXin Li }
469*bf2c3715SXin Li }
470*bf2c3715SXin Li return n - 1;
471*bf2c3715SXin Li }
472*bf2c3715SXin Li
find_subset_with_efficiency_higher_than(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,float required_efficiency_to_beat,vector<size_t> & inout_remainder,vector<size_t> & out_subset)473*bf2c3715SXin Li void find_subset_with_efficiency_higher_than(
474*bf2c3715SXin Li const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
475*bf2c3715SXin Li float required_efficiency_to_beat,
476*bf2c3715SXin Li vector<size_t>& inout_remainder,
477*bf2c3715SXin Li vector<size_t>& out_subset)
478*bf2c3715SXin Li {
479*bf2c3715SXin Li out_subset.resize(0);
480*bf2c3715SXin Li
481*bf2c3715SXin Li if (required_efficiency_to_beat >= 1.0f) {
482*bf2c3715SXin Li cerr << "can't beat efficiency 1." << endl;
483*bf2c3715SXin Li abort();
484*bf2c3715SXin Li }
485*bf2c3715SXin Li
486*bf2c3715SXin Li while (!inout_remainder.empty()) {
487*bf2c3715SXin Li
488*bf2c3715SXin Li vector<size_t> candidate_indices(inout_remainder.size());
489*bf2c3715SXin Li for (size_t i = 0; i < candidate_indices.size(); i++) {
490*bf2c3715SXin Li candidate_indices[i] = i;
491*bf2c3715SXin Li }
492*bf2c3715SXin Li
493*bf2c3715SXin Li size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size());
494*bf2c3715SXin Li while (candidate_indices_subset_size >= 1) {
495*bf2c3715SXin Li vector<size_t> candidate_indices_subset;
496*bf2c3715SXin Li make_first_subset(candidate_indices_subset_size,
497*bf2c3715SXin Li candidate_indices_subset,
498*bf2c3715SXin Li candidate_indices.size());
499*bf2c3715SXin Li
500*bf2c3715SXin Li vector<size_t> best_candidate_indices_subset;
501*bf2c3715SXin Li float best_efficiency = 0.0f;
502*bf2c3715SXin Li vector<size_t> trial_subset = out_subset;
503*bf2c3715SXin Li trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
504*bf2c3715SXin Li while (true)
505*bf2c3715SXin Li {
506*bf2c3715SXin Li for (size_t i = 0; i < candidate_indices_subset_size; i++) {
507*bf2c3715SXin Li trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices_subset[i]];
508*bf2c3715SXin Li }
509*bf2c3715SXin Li
510*bf2c3715SXin Li float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
511*bf2c3715SXin Li if (trial_efficiency > best_efficiency) {
512*bf2c3715SXin Li best_efficiency = trial_efficiency;
513*bf2c3715SXin Li best_candidate_indices_subset = candidate_indices_subset;
514*bf2c3715SXin Li }
515*bf2c3715SXin Li if (is_last_subset(candidate_indices_subset, candidate_indices.size())) {
516*bf2c3715SXin Li break;
517*bf2c3715SXin Li }
518*bf2c3715SXin Li next_subset(candidate_indices_subset, candidate_indices.size());
519*bf2c3715SXin Li }
520*bf2c3715SXin Li
521*bf2c3715SXin Li if (best_efficiency > required_efficiency_to_beat) {
522*bf2c3715SXin Li for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) {
523*bf2c3715SXin Li candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]];
524*bf2c3715SXin Li }
525*bf2c3715SXin Li candidate_indices.resize(best_candidate_indices_subset.size());
526*bf2c3715SXin Li }
527*bf2c3715SXin Li candidate_indices_subset_size--;
528*bf2c3715SXin Li }
529*bf2c3715SXin Li
530*bf2c3715SXin Li size_t candidate_index = candidate_indices[0];
531*bf2c3715SXin Li auto candidate_iterator = inout_remainder.begin() + candidate_index;
532*bf2c3715SXin Li vector<size_t> trial_subset = out_subset;
533*bf2c3715SXin Li
534*bf2c3715SXin Li trial_subset.push_back(*candidate_iterator);
535*bf2c3715SXin Li float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
536*bf2c3715SXin Li if (trial_efficiency > required_efficiency_to_beat) {
537*bf2c3715SXin Li out_subset.push_back(*candidate_iterator);
538*bf2c3715SXin Li inout_remainder.erase(candidate_iterator);
539*bf2c3715SXin Li } else {
540*bf2c3715SXin Li break;
541*bf2c3715SXin Li }
542*bf2c3715SXin Li }
543*bf2c3715SXin Li }
544*bf2c3715SXin Li
find_partition_with_efficiency_higher_than(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,float required_efficiency_to_beat,vector<vector<size_t>> & out_partition)545*bf2c3715SXin Li void find_partition_with_efficiency_higher_than(
546*bf2c3715SXin Li const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
547*bf2c3715SXin Li float required_efficiency_to_beat,
548*bf2c3715SXin Li vector<vector<size_t>>& out_partition)
549*bf2c3715SXin Li {
550*bf2c3715SXin Li out_partition.resize(0);
551*bf2c3715SXin Li
552*bf2c3715SXin Li vector<size_t> remainder;
553*bf2c3715SXin Li for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
554*bf2c3715SXin Li remainder.push_back(i);
555*bf2c3715SXin Li }
556*bf2c3715SXin Li
557*bf2c3715SXin Li while (!remainder.empty()) {
558*bf2c3715SXin Li vector<size_t> new_subset;
559*bf2c3715SXin Li find_subset_with_efficiency_higher_than(
560*bf2c3715SXin Li preprocessed_inputfiles,
561*bf2c3715SXin Li required_efficiency_to_beat,
562*bf2c3715SXin Li remainder,
563*bf2c3715SXin Li new_subset);
564*bf2c3715SXin Li out_partition.push_back(new_subset);
565*bf2c3715SXin Li }
566*bf2c3715SXin Li }
567*bf2c3715SXin Li
print_partition(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,const vector<vector<size_t>> & partition)568*bf2c3715SXin Li void print_partition(
569*bf2c3715SXin Li const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
570*bf2c3715SXin Li const vector<vector<size_t>>& partition)
571*bf2c3715SXin Li {
572*bf2c3715SXin Li float efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
573*bf2c3715SXin Li cout << "Partition into " << partition.size() << " subsets for " << efficiency * 100.0f << "% efficiency" << endl;
574*bf2c3715SXin Li for (auto subset = partition.begin(); subset != partition.end(); ++subset) {
575*bf2c3715SXin Li cout << " Subset " << (subset - partition.begin())
576*bf2c3715SXin Li << ", efficiency " << efficiency_of_subset(preprocessed_inputfiles, *subset) * 100.0f << "%:"
577*bf2c3715SXin Li << endl;
578*bf2c3715SXin Li for (auto file = subset->begin(); file != subset->end(); ++file) {
579*bf2c3715SXin Li cout << " " << preprocessed_inputfiles[*file].filename << endl;
580*bf2c3715SXin Li }
581*bf2c3715SXin Li if (dump_tables) {
582*bf2c3715SXin Li cout << " Table:" << endl;
583*bf2c3715SXin Li dump_table_for_subset(preprocessed_inputfiles, *subset);
584*bf2c3715SXin Li }
585*bf2c3715SXin Li }
586*bf2c3715SXin Li cout << endl;
587*bf2c3715SXin Li }
588*bf2c3715SXin Li
// Base class of the actions this tool can perform. The default
// implementations abort, so a concrete action has to override both
// member functions.
struct action_t
{
  // Name under which the action is invoked.
  virtual const char* invokation_name() const
  {
    std::abort();
    return nullptr;
  }

  // Runs the action on the given list of strings (the derived action in
  // this file treats them as input file names).
  virtual void run(const std::vector<std::string>&) const
  {
    std::abort();
  }

  virtual ~action_t() = default;
};
595*bf2c3715SXin Li
596*bf2c3715SXin Li struct partition_action_t : action_t
597*bf2c3715SXin Li {
invokation_namepartition_action_t598*bf2c3715SXin Li virtual const char* invokation_name() const override { return "partition"; }
runpartition_action_t599*bf2c3715SXin Li virtual void run(const vector<string>& input_filenames) const override
600*bf2c3715SXin Li {
601*bf2c3715SXin Li vector<preprocessed_inputfile_t> preprocessed_inputfiles;
602*bf2c3715SXin Li
603*bf2c3715SXin Li if (input_filenames.empty()) {
604*bf2c3715SXin Li cerr << "The " << invokation_name() << " action needs a list of input files." << endl;
605*bf2c3715SXin Li exit(1);
606*bf2c3715SXin Li }
607*bf2c3715SXin Li
608*bf2c3715SXin Li for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
609*bf2c3715SXin Li inputfile_t inputfile(*it);
610*bf2c3715SXin Li switch (inputfile.type) {
611*bf2c3715SXin Li case inputfile_t::type_t::all_pot_sizes:
612*bf2c3715SXin Li preprocessed_inputfiles.emplace_back(inputfile);
613*bf2c3715SXin Li break;
614*bf2c3715SXin Li case inputfile_t::type_t::default_sizes:
615*bf2c3715SXin Li cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and "
616*bf2c3715SXin Li << "has no use for " << *it << " which contains measurements for default sizes." << endl;
617*bf2c3715SXin Li exit(1);
618*bf2c3715SXin Li break;
619*bf2c3715SXin Li default:
620*bf2c3715SXin Li cerr << "Unrecognized input file: " << *it << endl;
621*bf2c3715SXin Li exit(1);
622*bf2c3715SXin Li }
623*bf2c3715SXin Li }
624*bf2c3715SXin Li
625*bf2c3715SXin Li check_all_files_in_same_exact_order(preprocessed_inputfiles);
626*bf2c3715SXin Li
627*bf2c3715SXin Li float required_efficiency_to_beat = 0.0f;
628*bf2c3715SXin Li vector<vector<vector<size_t>>> partitions;
629*bf2c3715SXin Li cerr << "searching for partitions...\r" << flush;
630*bf2c3715SXin Li while (true)
631*bf2c3715SXin Li {
632*bf2c3715SXin Li vector<vector<size_t>> partition;
633*bf2c3715SXin Li find_partition_with_efficiency_higher_than(
634*bf2c3715SXin Li preprocessed_inputfiles,
635*bf2c3715SXin Li required_efficiency_to_beat,
636*bf2c3715SXin Li partition);
637*bf2c3715SXin Li float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
638*bf2c3715SXin Li cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
639*bf2c3715SXin Li << " subsets for " << 100.0f * actual_efficiency
640*bf2c3715SXin Li << " % efficiency"
641*bf2c3715SXin Li << " \r" << flush;
642*bf2c3715SXin Li partitions.push_back(partition);
643*bf2c3715SXin Li if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
644*bf2c3715SXin Li break;
645*bf2c3715SXin Li }
646*bf2c3715SXin Li required_efficiency_to_beat = actual_efficiency;
647*bf2c3715SXin Li }
648*bf2c3715SXin Li cerr << " " << endl;
649*bf2c3715SXin Li while (true) {
650*bf2c3715SXin Li bool repeat = false;
651*bf2c3715SXin Li for (size_t i = 0; i < partitions.size() - 1; i++) {
652*bf2c3715SXin Li if (partitions[i].size() >= partitions[i+1].size()) {
653*bf2c3715SXin Li partitions.erase(partitions.begin() + i);
654*bf2c3715SXin Li repeat = true;
655*bf2c3715SXin Li break;
656*bf2c3715SXin Li }
657*bf2c3715SXin Li }
658*bf2c3715SXin Li if (!repeat) {
659*bf2c3715SXin Li break;
660*bf2c3715SXin Li }
661*bf2c3715SXin Li }
662*bf2c3715SXin Li for (auto it = partitions.begin(); it != partitions.end(); ++it) {
663*bf2c3715SXin Li print_partition(preprocessed_inputfiles, *it);
664*bf2c3715SXin Li }
665*bf2c3715SXin Li }
666*bf2c3715SXin Li };
667*bf2c3715SXin Li
668*bf2c3715SXin Li struct evaluate_defaults_action_t : action_t
669*bf2c3715SXin Li {
670*bf2c3715SXin Li struct results_entry_t {
671*bf2c3715SXin Li uint16_t product_size;
672*bf2c3715SXin Li size_triple_t default_block_size;
673*bf2c3715SXin Li uint16_t best_pot_block_size;
674*bf2c3715SXin Li float default_gflops;
675*bf2c3715SXin Li float best_pot_gflops;
676*bf2c3715SXin Li float default_efficiency;
677*bf2c3715SXin Li };
operator <<(ostream & s,const results_entry_t & entry)678*bf2c3715SXin Li friend ostream& operator<<(ostream& s, const results_entry_t& entry)
679*bf2c3715SXin Li {
680*bf2c3715SXin Li return s
681*bf2c3715SXin Li << "Product size " << size_triple_t(entry.product_size)
682*bf2c3715SXin Li << ": default block size " << entry.default_block_size
683*bf2c3715SXin Li << " -> " << entry.default_gflops
684*bf2c3715SXin Li << " GFlop/s = " << entry.default_efficiency * 100.0f << " %"
685*bf2c3715SXin Li << " of best POT block size " << size_triple_t(entry.best_pot_block_size)
686*bf2c3715SXin Li << " -> " << entry.best_pot_gflops
687*bf2c3715SXin Li << " GFlop/s" << dec;
688*bf2c3715SXin Li }
lower_efficiencyevaluate_defaults_action_t689*bf2c3715SXin Li static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) {
690*bf2c3715SXin Li return e1.default_efficiency < e2.default_efficiency;
691*bf2c3715SXin Li }
invokation_nameevaluate_defaults_action_t692*bf2c3715SXin Li virtual const char* invokation_name() const override { return "evaluate-defaults"; }
show_usage_and_exitevaluate_defaults_action_t693*bf2c3715SXin Li void show_usage_and_exit() const
694*bf2c3715SXin Li {
695*bf2c3715SXin Li cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl;
696*bf2c3715SXin Li cerr << "checks how well the performance with default sizes compares to the best "
697*bf2c3715SXin Li << "performance measured over all POT sizes." << endl;
698*bf2c3715SXin Li exit(1);
699*bf2c3715SXin Li }
runevaluate_defaults_action_t700*bf2c3715SXin Li virtual void run(const vector<string>& input_filenames) const override
701*bf2c3715SXin Li {
702*bf2c3715SXin Li if (input_filenames.size() != 2) {
703*bf2c3715SXin Li show_usage_and_exit();
704*bf2c3715SXin Li }
705*bf2c3715SXin Li inputfile_t inputfile_default_sizes(input_filenames[0]);
706*bf2c3715SXin Li inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
707*bf2c3715SXin Li if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) {
708*bf2c3715SXin Li cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl;
709*bf2c3715SXin Li show_usage_and_exit();
710*bf2c3715SXin Li }
711*bf2c3715SXin Li if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) {
712*bf2c3715SXin Li cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl;
713*bf2c3715SXin Li show_usage_and_exit();
714*bf2c3715SXin Li }
715*bf2c3715SXin Li vector<results_entry_t> results;
716*bf2c3715SXin Li vector<results_entry_t> cubic_results;
717*bf2c3715SXin Li
718*bf2c3715SXin Li uint16_t product_size = 0;
719*bf2c3715SXin Li auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin();
720*bf2c3715SXin Li for (auto it_default_sizes = inputfile_default_sizes.entries.begin();
721*bf2c3715SXin Li it_default_sizes != inputfile_default_sizes.entries.end();
722*bf2c3715SXin Li ++it_default_sizes)
723*bf2c3715SXin Li {
724*bf2c3715SXin Li if (it_default_sizes->product_size == product_size) {
725*bf2c3715SXin Li continue;
726*bf2c3715SXin Li }
727*bf2c3715SXin Li product_size = it_default_sizes->product_size;
728*bf2c3715SXin Li while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() &&
729*bf2c3715SXin Li it_all_pot_sizes->product_size != product_size)
730*bf2c3715SXin Li {
731*bf2c3715SXin Li ++it_all_pot_sizes;
732*bf2c3715SXin Li }
733*bf2c3715SXin Li if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) {
734*bf2c3715SXin Li break;
735*bf2c3715SXin Li }
736*bf2c3715SXin Li uint16_t best_pot_block_size = 0;
737*bf2c3715SXin Li float best_pot_gflops = 0;
738*bf2c3715SXin Li for (auto it = it_all_pot_sizes;
739*bf2c3715SXin Li it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size;
740*bf2c3715SXin Li ++it)
741*bf2c3715SXin Li {
742*bf2c3715SXin Li if (it->gflops > best_pot_gflops) {
743*bf2c3715SXin Li best_pot_gflops = it->gflops;
744*bf2c3715SXin Li best_pot_block_size = it->pot_block_size;
745*bf2c3715SXin Li }
746*bf2c3715SXin Li }
747*bf2c3715SXin Li results_entry_t entry;
748*bf2c3715SXin Li entry.product_size = product_size;
749*bf2c3715SXin Li entry.default_block_size = it_default_sizes->nonpot_block_size;
750*bf2c3715SXin Li entry.best_pot_block_size = best_pot_block_size;
751*bf2c3715SXin Li entry.default_gflops = it_default_sizes->gflops;
752*bf2c3715SXin Li entry.best_pot_gflops = best_pot_gflops;
753*bf2c3715SXin Li entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops;
754*bf2c3715SXin Li results.push_back(entry);
755*bf2c3715SXin Li
756*bf2c3715SXin Li size_triple_t t(product_size);
757*bf2c3715SXin Li if (t.k == t.m && t.m == t.n) {
758*bf2c3715SXin Li cubic_results.push_back(entry);
759*bf2c3715SXin Li }
760*bf2c3715SXin Li }
761*bf2c3715SXin Li
762*bf2c3715SXin Li cout << "All results:" << endl;
763*bf2c3715SXin Li for (auto it = results.begin(); it != results.end(); ++it) {
764*bf2c3715SXin Li cout << *it << endl;
765*bf2c3715SXin Li }
766*bf2c3715SXin Li cout << endl;
767*bf2c3715SXin Li
768*bf2c3715SXin Li sort(results.begin(), results.end(), lower_efficiency);
769*bf2c3715SXin Li
770*bf2c3715SXin Li const size_t n = min<size_t>(20, results.size());
771*bf2c3715SXin Li cout << n << " worst results:" << endl;
772*bf2c3715SXin Li for (size_t i = 0; i < n; i++) {
773*bf2c3715SXin Li cout << results[i] << endl;
774*bf2c3715SXin Li }
775*bf2c3715SXin Li cout << endl;
776*bf2c3715SXin Li
777*bf2c3715SXin Li cout << "cubic results:" << endl;
778*bf2c3715SXin Li for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
779*bf2c3715SXin Li cout << *it << endl;
780*bf2c3715SXin Li }
781*bf2c3715SXin Li cout << endl;
782*bf2c3715SXin Li
783*bf2c3715SXin Li sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
784*bf2c3715SXin Li
785*bf2c3715SXin Li cout.precision(2);
786*bf2c3715SXin Li vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
787*bf2c3715SXin Li for (auto it = a.begin(); it != a.end(); ++it) {
788*bf2c3715SXin Li size_t n = min(results.size() - 1, size_t(*it * results.size()));
789*bf2c3715SXin Li cout << (100.0f * n / (results.size() - 1))
790*bf2c3715SXin Li << " % of product sizes have default efficiency <= "
791*bf2c3715SXin Li << 100.0f * results[n].default_efficiency << " %" << endl;
792*bf2c3715SXin Li }
793*bf2c3715SXin Li cout.precision(default_precision);
794*bf2c3715SXin Li }
795*bf2c3715SXin Li };
796*bf2c3715SXin Li
797*bf2c3715SXin Li
show_usage_and_exit(int argc,char * argv[],const vector<unique_ptr<action_t>> & available_actions)798*bf2c3715SXin Li void show_usage_and_exit(int argc, char* argv[],
799*bf2c3715SXin Li const vector<unique_ptr<action_t>>& available_actions)
800*bf2c3715SXin Li {
801*bf2c3715SXin Li cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl;
802*bf2c3715SXin Li cerr << "available actions:" << endl;
803*bf2c3715SXin Li for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
804*bf2c3715SXin Li cerr << " " << (*it)->invokation_name() << endl;
805*bf2c3715SXin Li }
806*bf2c3715SXin Li cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
807*bf2c3715SXin Li exit(1);
808*bf2c3715SXin Li }
809*bf2c3715SXin Li
main(int argc,char * argv[])810*bf2c3715SXin Li int main(int argc, char* argv[])
811*bf2c3715SXin Li {
812*bf2c3715SXin Li cout.precision(default_precision);
813*bf2c3715SXin Li cerr.precision(default_precision);
814*bf2c3715SXin Li
815*bf2c3715SXin Li vector<unique_ptr<action_t>> available_actions;
816*bf2c3715SXin Li available_actions.emplace_back(new partition_action_t);
817*bf2c3715SXin Li available_actions.emplace_back(new evaluate_defaults_action_t);
818*bf2c3715SXin Li
819*bf2c3715SXin Li vector<string> input_filenames;
820*bf2c3715SXin Li
821*bf2c3715SXin Li action_t* action = nullptr;
822*bf2c3715SXin Li
823*bf2c3715SXin Li if (argc < 2) {
824*bf2c3715SXin Li show_usage_and_exit(argc, argv, available_actions);
825*bf2c3715SXin Li }
826*bf2c3715SXin Li for (int i = 1; i < argc; i++) {
827*bf2c3715SXin Li bool arg_handled = false;
828*bf2c3715SXin Li // Step 1. Try to match action invocation names.
829*bf2c3715SXin Li for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
830*bf2c3715SXin Li if (!strcmp(argv[i], (*it)->invokation_name())) {
831*bf2c3715SXin Li if (!action) {
832*bf2c3715SXin Li action = it->get();
833*bf2c3715SXin Li arg_handled = true;
834*bf2c3715SXin Li break;
835*bf2c3715SXin Li } else {
836*bf2c3715SXin Li cerr << "can't specify more than one action!" << endl;
837*bf2c3715SXin Li show_usage_and_exit(argc, argv, available_actions);
838*bf2c3715SXin Li }
839*bf2c3715SXin Li }
840*bf2c3715SXin Li }
841*bf2c3715SXin Li if (arg_handled) {
842*bf2c3715SXin Li continue;
843*bf2c3715SXin Li }
844*bf2c3715SXin Li // Step 2. Try to match option names.
845*bf2c3715SXin Li if (argv[i][0] == '-') {
846*bf2c3715SXin Li if (!strcmp(argv[i], "--only-cubic-sizes")) {
847*bf2c3715SXin Li only_cubic_sizes = true;
848*bf2c3715SXin Li arg_handled = true;
849*bf2c3715SXin Li }
850*bf2c3715SXin Li if (!strcmp(argv[i], "--dump-tables")) {
851*bf2c3715SXin Li dump_tables = true;
852*bf2c3715SXin Li arg_handled = true;
853*bf2c3715SXin Li }
854*bf2c3715SXin Li if (!arg_handled) {
855*bf2c3715SXin Li cerr << "Unrecognized option: " << argv[i] << endl;
856*bf2c3715SXin Li show_usage_and_exit(argc, argv, available_actions);
857*bf2c3715SXin Li }
858*bf2c3715SXin Li }
859*bf2c3715SXin Li if (arg_handled) {
860*bf2c3715SXin Li continue;
861*bf2c3715SXin Li }
862*bf2c3715SXin Li // Step 3. Default to interpreting args as input filenames.
863*bf2c3715SXin Li input_filenames.emplace_back(argv[i]);
864*bf2c3715SXin Li }
865*bf2c3715SXin Li
866*bf2c3715SXin Li if (dump_tables && only_cubic_sizes) {
867*bf2c3715SXin Li cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
868*bf2c3715SXin Li show_usage_and_exit(argc, argv, available_actions);
869*bf2c3715SXin Li }
870*bf2c3715SXin Li
871*bf2c3715SXin Li if (!action) {
872*bf2c3715SXin Li show_usage_and_exit(argc, argv, available_actions);
873*bf2c3715SXin Li }
874*bf2c3715SXin Li
875*bf2c3715SXin Li action->run(input_filenames);
876*bf2c3715SXin Li }
877