/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// A C++ binary to benchmark a compute graph and its individual operators,
// both on desktop machines and on Android.
//
// See README.md for usage instructions.
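//
// Example invocation (illustrative values only; the binary name and paths
// will vary by build and device; all flags are defined in Main() below):
//
//   benchmark_model --graph=/tmp/tensorflow_inception_graph.pb \
//     --input_layer="input:0" --input_layer_shape="1,224,224,3" \
//     --input_layer_type="float" --output_layer="output:0" \
//     --max_num_runs=100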

#include "tensorflow/tools/benchmark/benchmark_model.h"

#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "tensorflow/core/common_runtime/graph_constructor.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/platform.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow/core/util/reporter.h"
#include "tensorflow/core/util/stat_summarizer.h"

namespace tensorflow {
namespace benchmark_model {

namespace {

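// Runs each of the supplied init ops once so that any graph variables hold
// values before benchmarking begins.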
Status InitializeVariables(Session* session,
                           const std::vector<string>& init_ops) {
  LOG(INFO) << "Initializing graph variables";
  for (const string& init_op : init_ops) {
    TF_RETURN_IF_ERROR(session->Run({}, {}, {init_op}, nullptr));
  }
  return OkStatus();
}

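// Zero-fills `input_tensor` and then copies any user-supplied initialization
// values into it element by element. The values are assumed to fit within the
// tensor; elements beyond the supplied values stay zero.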
template <class T>
void InitializeTensor(const std::vector<float>& initialization_values,
                      Tensor* input_tensor) {
  auto type_tensor = input_tensor->flat<T>();
  type_tensor = type_tensor.constant(0);
  if (!initialization_values.empty()) {
    for (int i = 0; i < initialization_values.size(); ++i) {
      type_tensor(i) = static_cast<T>(initialization_values[i]);
    }
  }
}

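// Builds one synthetic tensor per input layer description, dispatching on the
// declared data type. String tensors are filled with empty strings and do not
// accept initialization values.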
void CreateTensorsFromInputInfo(
    const std::vector<InputLayerInfo>& inputs,
    std::vector<std::pair<string, tensorflow::Tensor> >* input_tensors) {
  for (const InputLayerInfo& input : inputs) {
    Tensor input_tensor(input.data_type, input.shape);
    switch (input.data_type) {
      case DT_INT32: {
        InitializeTensor<int32>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_INT64: {
        InitializeTensor<int64>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_FLOAT: {
        InitializeTensor<float>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_QUINT8: {
        InitializeTensor<quint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_UINT8: {
        InitializeTensor<uint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_BOOL: {
        InitializeTensor<bool>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_STRING: {
        if (!input.initialization_values.empty()) {
          LOG(FATAL) << "Initialization values are not supported for strings";
        }
        auto type_tensor = input_tensor.flat<tstring>();
        type_tensor = type_tensor.constant("");
        break;
      }
      default:
        LOG(FATAL) << "Unsupported input type: "
                   << DataTypeString(input.data_type);
    }
    input_tensors->push_back({input.name, input_tensor});
  }
}

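// Runs the graph once with synthetic inputs to discover the concrete shapes of
// the nodes named in `wanted_shapes`. Shapes of the input layers themselves
// are taken directly from the supplied input info rather than from the run.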
Status GetOutputShapes(const std::vector<InputLayerInfo>& inputs,
                       const std::set<string>& wanted_shapes, Session* session,
                       std::unordered_map<string, TensorShape>* node_shapes) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);
  std::vector<tensorflow::Tensor> output_tensors;
  std::vector<string> output_tensor_names;
  for (const string& wanted_shape : wanted_shapes) {
    bool is_input = false;
    for (const std::pair<string, tensorflow::Tensor>& input_tensor :
         input_tensors) {
      if (input_tensor.first == wanted_shape) {
        (*node_shapes)[wanted_shape] = input_tensor.second.shape();
        is_input = true;
        break;
      }
    }
    if (!is_input) {
      output_tensor_names.push_back(wanted_shape);
    }
  }
  TF_RETURN_IF_ERROR(
      session->Run(input_tensors, output_tensor_names, {}, &output_tensors));
  CHECK_EQ(output_tensors.size(), output_tensor_names.size());
  for (int i = 0; i < output_tensor_names.size(); ++i) {
    const string& wanted_shape_name = output_tensor_names[i];
    const TensorShape& found_shape = output_tensors[i].shape();
    (*node_shapes)[wanted_shape_name] = found_shape;
  }
  return OkStatus();
}

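// Estimates the graph's total FLOPs (plus a per-op-type breakdown) by fetching
// the shapes of the tensors flowing into and out of the handful of op types
// that dominate the compute in most models.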
Status CalculateFlops(const GraphDef& graph,
                      const std::vector<InputLayerInfo>& inputs,
                      Session* session, int64_t* total_flops,
                      std::unordered_map<string, int64_t>* flops_by_op) {
  std::unordered_set<string> floppable_ops = {
      "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul",
      "DepthwiseConv2dNative"};

  std::set<string> wanted_shapes;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      for (const string& input : node.input()) {
        wanted_shapes.insert(input);
      }
      wanted_shapes.insert(node.name());
    }
  }
  std::unordered_map<string, TensorShape> found_shapes;
  TF_RETURN_IF_ERROR(
      GetOutputShapes(inputs, wanted_shapes, session, &found_shapes));

  *total_flops = 0;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      int64_t current_flops = 0;
      // This is a very crude approximation to FLOPs that only looks at a few
      // op types that commonly form the bulk of the computation for many
      // models. It's included here because getting even an approximate value
      // for FLOPs is still very useful for estimating utilization, versus a
      // device's theoretical maximum FLOPs/second.
      if ((node.op() == "Conv2D") || (node.op() == "QuantizedConv2D")) {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64_t filter_height = filter_shape.dim_size(0);
        int64_t filter_width = filter_shape.dim_size(1);
        int64_t filter_in_depth = filter_shape.dim_size(2);
        int64_t output_count = output_shape.num_elements();
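        // Each output element is the result of a filter_height * filter_width
        // * filter_in_depth dot product, counting a multiply-add as two FLOPs.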
        current_flops =
            output_count * filter_in_depth * filter_height * filter_width * 2;
      } else if ((node.op() == "MatMul") || (node.op() == "QuantizedMatMul")) {
        const bool transpose_a = node.attr().at("transpose_a").b();
        const TensorShape& a_shape = found_shapes[node.input(0)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64_t k;
        if (transpose_a) {
          k = a_shape.dim_size(0);
        } else {
          k = a_shape.dim_size(1);
        }
        int64_t output_count = output_shape.num_elements();
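        // Every output element is a dot product along the shared inner
        // dimension k, again at two FLOPs per multiply-add.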
        current_flops = k * output_count * 2;
      } else if (node.op() == "DepthwiseConv2dNative") {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64_t filter_height = filter_shape.dim_size(0);
        int64_t filter_width = filter_shape.dim_size(1);
        int64_t output_count = output_shape.num_elements();
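        // Depthwise convolution applies a single filter_height * filter_width
        // kernel per channel, so the input depth drops out of the product.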
        current_flops = output_count * filter_height * filter_width * 2;
      }
      (*flops_by_op)[node.op()] += current_flops;
      *total_flops += current_flops;
    }
  }
  return OkStatus();
}

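// Writes a single benchmark result through TestReporter, naming the entry
// "<benchmark_name>_<postfix>" when a postfix is given.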
void RecordBenchmarkEntry(const string& output_prefix,
                          const string& benchmark_name, const string& postfix,
                          int num_runs, double total_time_s,
                          double throughput = -1.0) {
  std::stringstream stream;
  stream << benchmark_name;
  if (!postfix.empty()) {
    stream << "_" << postfix;
  }

  TestReporter node_reporter(output_prefix, stream.str());
  TF_QCHECK_OK(node_reporter.Initialize());
  TF_QCHECK_OK(
      node_reporter.Benchmark(num_runs, -1.0, total_time_s, throughput));
  TF_QCHECK_OK(node_reporter.Close());
}

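// Sleeps for the given (possibly fractional) number of seconds; non-positive
// values are a no-op.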
void SleepSeconds(double sleep_seconds) {
  if (sleep_seconds <= 0.0) {
    return;
  }
#ifdef PLATFORM_WINDOWS
  Env::Default()->SleepForMicroseconds(sleep_seconds * 1000 * 1000);
#else
  // Split the fractional seconds into the whole-second and nanosecond parts
  // of a timespec.
  timespec req;
  req.tv_sec = static_cast<time_t>(sleep_seconds);
  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
  nanosleep(&req, nullptr);
#endif
}

}  // namespace

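// Creates a session configured with the requested thread count and loads the
// GraphDef from the `graph` path, trying the binary protobuf format first and
// falling back to the text format.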
Status InitializeSession(int num_threads, const string& graph,
                         std::unique_ptr<Session>* session,
                         std::unique_ptr<GraphDef>* graph_def) {
  LOG(INFO) << "Loading TensorFlow.";

  tensorflow::SessionOptions options;
  tensorflow::ConfigProto& config = options.config;
  if (num_threads > 0) {
    config.set_intra_op_parallelism_threads(num_threads);
    config.set_inter_op_parallelism_threads(num_threads);
  }
  LOG(INFO) << "Got config, " << config.device_count_size() << " devices";

  session->reset(tensorflow::NewSession(options));
  graph_def->reset(new GraphDef());
  Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
  if (!s.ok()) {
    s = ReadTextProto(Env::Default(), graph, graph_def->get());
  }

  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
    return s;
  }

  s = (*session)->Create(*(graph_def->get()));
  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Session: " << s;
    return s;
  }

  return OkStatus();
}

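// Runs the graph once with synthetic inputs and records the wall-clock time in
// `inference_time_us`. When `stats` is non-null, a full trace is requested and
// the resulting step stats are fed to the StatSummarizer.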
Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
                    const std::vector<string>& outputs,
                    const std::vector<string>& targets, Session* session,
                    StatSummarizer* stats, int64_t* inference_time_us) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);

  std::vector<tensorflow::Tensor> output_tensors;

  tensorflow::Status s;

  RunOptions run_options;
  if (stats != nullptr) {
    run_options.set_trace_level(RunOptions::FULL_TRACE);
  }

  RunMetadata run_metadata;
  const int64_t start_time = Env::Default()->NowMicros();
  s = session->Run(run_options, input_tensors, outputs, targets,
                   &output_tensors, &run_metadata);
  const int64_t end_time = Env::Default()->NowMicros();
  *inference_time_us = end_time - start_time;

  if (!s.ok()) {
    LOG(ERROR) << "Error during inference: " << s;
    return s;
  }

  if (stats != nullptr) {
    assert(run_metadata.has_step_stats());
    const StepStats& step_stats = run_metadata.step_stats();
    stats->ProcessStepStats(step_stats);
  }

  return s;
}

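// Calls RunBenchmark repeatedly until either `num_runs` iterations have
// completed or `max_time_s` seconds have elapsed; a non-positive `num_runs`
// means "run until the time limit". Accumulates the total time and the actual
// run count for the caller.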
Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                        const std::vector<InputLayerInfo>& inputs,
                        const std::vector<string>& outputs,
                        const std::vector<string>& targets, Session* session,
                        StatSummarizer* stats, int64_t* total_time_us,
                        int64_t* actual_num_runs) {
  *total_time_us = 0;

  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
            << max_time_s << " seconds "
            << (stats != nullptr ? "with" : "without")
            << " detailed stat logging, with " << sleep_seconds
            << "s sleep between inferences";

  Stat<int64_t> stat;
  const bool until_max_time = num_runs <= 0;
  for (int i = 0; until_max_time || i < num_runs; ++i) {
    int64_t time;
    Status run_status =
        RunBenchmark(inputs, outputs, targets, session, stats, &time);
    stat.UpdateStat(time);
    (*total_time_us) += time;
    ++(*actual_num_runs);

    if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
      break;
    }

    if (!run_status.ok()) {
      LOG(INFO) << "Failed on run " << i;
      return run_status;
    }

    // If requested, sleep between runs for an arbitrary amount of time.
    // This can be helpful to determine the effect of mobile processor
    // scaling and thermal throttling.
    if (sleep_seconds > 0.0) {
      SleepSeconds(sleep_seconds);
    }
  }
  std::stringstream stream;
  stat.OutputToStream(&stream);
  LOG(INFO) << stream.str() << std::endl;

  return OkStatus();
}

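// Entry point: parses the command-line flags, loads the graph, then runs three
// passes (warmup, a timed pass without stat logging, and an instrumented pass)
// before printing summaries and optional TestReporter entries.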
int Main(int argc, char** argv) {
  string graph = "/data/local/tmp/tensorflow_inception_graph.pb";
  string init_ops_string = "";
  string input_layer_string = "input:0";
  string input_layer_shape_string = "1,224,224,3";
  string input_layer_type_string = "float";
  string input_layer_values_string = "";
  string output_layer_string = "output:0";
  string target_layer_string = "";
  int max_num_runs = 1000;
  string max_time = "10.0";
  string inference_delay = "-1.0";
  string inter_benchmark_delay = "-1.0";
  int num_threads = -1;
  string benchmark_name = "";
  string output_prefix = "";
  bool show_sizes = false;
  bool show_run_order = true;
  int run_order_limit = 0;
  bool show_time = true;
  int time_limit = 10;
  bool show_memory = true;
  int memory_limit = 10;
  bool show_type = true;
  bool show_summary = true;
  bool show_flops = false;
  int warmup_runs = 1;

  std::vector<Flag> flag_list = {
      Flag("graph", &graph, "graph file name"),
      Flag("init_ops", &init_ops_string, "init ops"),
      Flag("input_layer", &input_layer_string, "input layer names"),
      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
      Flag("input_layer_values", &input_layer_values_string,
           "values to initialize the inputs with"),
      Flag("output_layer", &output_layer_string, "output layer name"),
      Flag("target_layer", &target_layer_string, "target layer name"),
      Flag("max_num_runs", &max_num_runs, "maximum number of runs"),
      Flag("max_time", &max_time, "maximum run time in seconds"),
      Flag("inference_delay", &inference_delay,
           "delay between runs in seconds"),
      Flag("inter_benchmark_delay", &inter_benchmark_delay,
           "delay between benchmarks in seconds"),
      Flag("num_threads", &num_threads, "number of threads"),
      Flag("benchmark_name", &benchmark_name, "benchmark name"),
      Flag("output_prefix", &output_prefix, "benchmark output prefix"),
      Flag("show_sizes", &show_sizes, "whether to show sizes"),
      Flag("show_run_order", &show_run_order,
           "whether to list stats by run order"),
      Flag("run_order_limit", &run_order_limit,
           "how many items to show by run order"),
      Flag("show_time", &show_time, "whether to list stats by time taken"),
      Flag("time_limit", &time_limit, "how many items to show by time taken"),
      Flag("show_memory", &show_memory, "whether to list stats by memory used"),
      Flag("memory_limit", &memory_limit,
           "how many items to show by memory used"),
      Flag("show_type", &show_type, "whether to list stats by op type"),
      Flag("show_summary", &show_summary,
           "whether to show a summary of the stats"),
      Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
      Flag("warmup_runs", &warmup_runs,
           "how many warmup runs to do before timing"),
  };
  string usage = Flags::Usage(argv[0], flag_list);
  const bool parse_result = Flags::Parse(&argc, argv, flag_list);

  if (!parse_result) {
    LOG(ERROR) << usage;
    return -1;
  }

  std::vector<string> init_ops = str_util::Split(init_ops_string, ',');
  std::vector<string> input_layers = str_util::Split(input_layer_string, ',');
  std::vector<string> input_layer_shapes =
      str_util::Split(input_layer_shape_string, ':');
  std::vector<string> input_layer_types =
      str_util::Split(input_layer_type_string, ',');
  std::vector<string> input_layer_values =
      str_util::Split(input_layer_values_string, ':');
  std::vector<string> output_layers = str_util::Split(output_layer_string, ',');
  std::vector<string> target_layers = str_util::Split(target_layer_string, ',');
  if ((input_layers.size() != input_layer_shapes.size()) ||
      (input_layers.size() != input_layer_types.size())) {
    LOG(ERROR) << "There must be the same number of items in --input_layer,"
               << " --input_layer_shape, and --input_layer_type, for example"
               << " --input_layer=input1,input2 --input_layer_type=float,float "
               << " --input_layer_shape=1,224,224,4:1,20";
    LOG(ERROR) << "--input_layer=" << input_layer_string << " ("
               << input_layers.size() << " items)";
    LOG(ERROR) << "--input_layer_type=" << input_layer_type_string << " ("
               << input_layer_types.size() << " items)";
    LOG(ERROR) << "--input_layer_shape=" << input_layer_shape_string << " ("
               << input_layer_shapes.size() << " items)";
    return -1;
  }
  const size_t inputs_count = input_layers.size();

  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
  if (argc > 1) {
    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
    return -1;
  }

  LOG(INFO) << "Graph: [" << graph << "]";
  LOG(INFO) << "Init ops: [" << init_ops_string << "]";
  LOG(INFO) << "Input layers: [" << input_layer_string << "]";
  LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
  LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
  LOG(INFO) << "Output layers: [" << output_layer_string << "]";
  LOG(INFO) << "Target layers: [" << target_layer_string << "]";
  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
            << "]";
  LOG(INFO) << "Num threads: [" << num_threads << "]";
  LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
  LOG(INFO) << "Output prefix: [" << output_prefix << "]";
  LOG(INFO) << "Show sizes: [" << show_sizes << "]";
  LOG(INFO) << "Warmup runs: [" << warmup_runs << "]";

  std::unique_ptr<Session> session;
  std::unique_ptr<StatSummarizer> stats;
  std::unique_ptr<GraphDef> graph_def;

  int64_t initialization_start_us = Env::Default()->NowMicros();
  Status initialize_status =
      InitializeSession(num_threads, graph, &session, &graph_def);
  int64_t initialization_end_us = Env::Default()->NowMicros();
  double initialization_time_s =
      (initialization_end_us - initialization_start_us) / 1000000.0;
  LOG(INFO) << "Initialized session in " << initialization_time_s << "s";
  if (!initialize_status.ok()) {
    return -1;
  }

  if (!init_ops.empty()) {
    Status initialize_variables_status =
        InitializeVariables(session.get(), init_ops);
    if (!initialize_variables_status.ok()) {
      LOG(ERROR) << "Graph variables initialization failed with "
                 << initialize_variables_status;
      return -1;
    }
  }

  StatSummarizerOptions stats_options;
  stats_options.show_run_order = show_run_order;
  stats_options.run_order_limit = run_order_limit;
  stats_options.show_time = show_time;
  stats_options.time_limit = time_limit;
  stats_options.show_memory = show_memory;
  stats_options.memory_limit = memory_limit;
  stats_options.show_type = show_type;
  stats_options.show_summary = show_summary;
  stats.reset(new tensorflow::StatSummarizer(stats_options));

  const double inter_inference_sleep_seconds =
      std::strtod(inference_delay.c_str(), nullptr);
  const double inter_benchmark_sleep_seconds =
      std::strtod(inter_benchmark_delay.c_str(), nullptr);
  const double max_benchmark_time_seconds =
      std::strtod(max_time.c_str(), nullptr);

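  // Parse each input layer's declared type, shape, and optional
  // initialization values into an InputLayerInfo for the benchmark runs.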
  std::vector<InputLayerInfo> inputs;
  for (int n = 0; n < inputs_count; ++n) {
    InputLayerInfo input;
    CHECK(DataTypeFromString(input_layer_types[n], &input.data_type))
        << input_layer_types[n] << " was an invalid type";

    std::vector<string> split_layer_shapes =
        str_util::Split(input_layer_shapes[n], ',');
    for (const string& layer_shape : split_layer_shapes) {
      int32_t tmp;
      CHECK(strings::safe_strto32(layer_shape, &tmp))
          << "Incorrect size string specified: " << input_layer_shapes[n];
      if (tmp == -1) {
        LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced"
                   << " with the size you want to benchmark with.";
        return -1;
      } else {
        input.shape.AddDim(tmp);
      }
    }
    input.name = input_layers[n];
    if (n < input_layer_values.size()) {
      std::vector<string> string_tokens =
          str_util::Split(input_layer_values[n], ',');
      input.initialization_values.clear();
      input.initialization_values.reserve(string_tokens.size());
      for (const string& str_val : string_tokens) {
        float val;
        CHECK(strings::safe_strtof(str_val, &val))
            << "Incorrect initialization values string specified: "
            << input_layer_values[n];
        input.initialization_values.push_back(val);
      }
    }
    inputs.push_back(input);
  }

  // If requested, run through the graph first to preinitialize everything
  // before the benchmarking runs.
  int64_t warmup_time_us = 0;
  int64_t num_warmup_runs = 0;
  if (warmup_runs > 0) {
    Status warmup_time_status =
        TimeMultipleRuns(inter_inference_sleep_seconds, warmup_runs, -1.0,
                         inputs, output_layers, target_layers, session.get(),
                         nullptr, &warmup_time_us, &num_warmup_runs);
    if (!warmup_time_status.ok()) {
      LOG(ERROR) << "Timing failed with " << warmup_time_status;
      return -1;
    }
  }

  // Capture overall inference time without stat logging overhead. This is the
  // timing data that can be compared to other libraries.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64_t no_stat_time_us = 0;
  int64_t no_stat_num_runs = 0;
  Status no_stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), nullptr,
      &no_stat_time_us, &no_stat_num_runs);
  const double no_stat_wall_time = no_stat_time_us / 1000000.0;
  if (!no_stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << no_stat_time_status;
    return -1;
  }

  // Run again to gather detailed log stats to get a better idea of where
  // relative time is going within the graph.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64_t stat_time_us = 0;
  int64_t stat_num_runs = 0;
  Status stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), stats.get(),
      &stat_time_us, &stat_num_runs);
  if (!stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << stat_time_status;
    return -1;
  }

  LOG(INFO) << "Average inference timings in us: "
            << "Warmup: "
            << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
            << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
            << "with stats: " << stat_time_us / stat_num_runs;

  stats->PrintStepStats();

  if (show_sizes) {
    stats->PrintOutputs();
  }

  if (show_flops) {
    int64_t total_flops;
    std::unordered_map<string, int64_t> flops_by_op;
    Status flop_status = CalculateFlops(*graph_def, inputs, session.get(),
                                        &total_flops, &flops_by_op);
    if (!flop_status.ok()) {
      LOG(ERROR) << "FLOPs calculation failed with " << flop_status;
      return -1;
    }
    string pretty_flops;
    if (total_flops < 1000) {
      pretty_flops = strings::StrCat(total_flops, " FLOPs");
    } else if (total_flops < (1000 * 1000)) {
      const float rounded_flops = (total_flops / 1000.0f);
      pretty_flops = strings::StrCat(rounded_flops, "k FLOPs");
    } else if (total_flops < (1000 * 1000 * 1000)) {
      const float rounded_flops = round(total_flops / 1000.0f) / 1000.0f;
      pretty_flops = strings::StrCat(rounded_flops, " million FLOPs");
    } else {
      const float rounded_flops =
          round(total_flops / (1000.0f * 1000.0f)) / 1000.0f;
      pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
    }
    LOG(INFO) << "FLOPs estimate: " << pretty_flops;
    const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
    LOG(INFO) << "FLOPs/second: "
              << strings::HumanReadableNum(
                     static_cast<int64_t>(total_flops / mean_run_time));
  }


  if (!benchmark_name.empty() && !output_prefix.empty()) {
    // Compute the total number of values per input.
    int64_t total_size = inputs[0].shape.num_elements();

    // Throughput in MB/s.
    const double throughput =
        DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
        static_cast<double>(no_stat_wall_time) / (1024 * 1024);

    // Report the stats.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "", no_stat_num_runs,
                         no_stat_wall_time, throughput);

    // Session initialization time.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-init", 1,
                         initialization_time_s);

    // First inference time. Note: if warmup_runs is > 1 this will actually be
    // an average of all the warmup runs.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-first-inference",
                         warmup_runs, warmup_time_us / 1000000.0);

    // Time from starting to initialize TF to getting the first result back.
    // This also assumes that only one warmup run is performed.
    RecordBenchmarkEntry(
        output_prefix, benchmark_name, "meta-init-plus-first-inference", 1,
        initialization_time_s + (warmup_time_us / 1000000.0) / warmup_runs);

    std::map<std::string, int64_t> node_type_map_count;
    std::map<std::string, int64_t> node_type_map_time;
    std::map<std::string, int64_t> node_type_map_memory;
    std::map<std::string, int64_t> node_type_map_times_called;

    int64_t accumulated_us;
    stats->ComputeStatsByType(&node_type_map_count, &node_type_map_time,
                              &node_type_map_memory,
                              &node_type_map_times_called, &accumulated_us);
    for (const auto& time : node_type_map_time) {
      LOG(INFO) << "Outputting: [" << time.first << "]";
      RecordBenchmarkEntry(output_prefix, benchmark_name, time.first,
                           stat_num_runs,
                           (time.second * stat_num_runs) / 1000000.0f);
    }
  }

  return 0;
}

}  // namespace benchmark_model
}  // namespace tensorflow