/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// A C++ binary to benchmark a compute graph and its individual operators,
// both on desktop machines and on Android.
//
// See README.md for usage instructions.
#include "tensorflow/tools/benchmark/benchmark_model.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "tensorflow/core/common_runtime/graph_constructor.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/platform.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow/core/util/reporter.h"
#include "tensorflow/core/util/stat_summarizer.h"

namespace tensorflow {
namespace benchmark_model {

namespace {

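// Runs each of the given init ops on the session, one at a time, typically
// to initialize graph variables before benchmarking.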
Status InitializeVariables(Session* session,
                           const std::vector<string>& init_ops) {
  LOG(INFO) << "Initializing graph variables";
  for (const string& init_op : init_ops) {
    TF_RETURN_IF_ERROR(session->Run({}, {}, {init_op}, nullptr));
  }
  return OkStatus();
}

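// Zero-fills |input_tensor|, then copies any user-supplied
// |initialization_values| (cast to T) into its leading elements.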
template <class T>
void InitializeTensor(const std::vector<float>& initialization_values,
                      Tensor* input_tensor) {
  auto type_tensor = input_tensor->flat<T>();
  type_tensor = type_tensor.constant(0);
  if (!initialization_values.empty()) {
    for (int i = 0; i < initialization_values.size(); ++i) {
      type_tensor(i) = static_cast<T>(initialization_values[i]);
    }
  }
}

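// Builds one (name, Tensor) pair per entry in |inputs|, allocating each
// tensor with the requested dtype and shape and filling it from its
// initialization values (zeros or empty strings when none are given).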
void CreateTensorsFromInputInfo(
    const std::vector<InputLayerInfo>& inputs,
    std::vector<std::pair<string, tensorflow::Tensor> >* input_tensors) {
  for (const InputLayerInfo& input : inputs) {
    Tensor input_tensor(input.data_type, input.shape);
    switch (input.data_type) {
      case DT_INT32: {
        InitializeTensor<int32>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_INT64: {
        InitializeTensor<int64>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_FLOAT: {
        InitializeTensor<float>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_QUINT8: {
        InitializeTensor<quint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_UINT8: {
        InitializeTensor<uint8>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_BOOL: {
        InitializeTensor<bool>(input.initialization_values, &input_tensor);
        break;
      }
      case DT_STRING: {
        if (!input.initialization_values.empty()) {
          LOG(FATAL) << "Initialization values are not supported for strings";
        }
        auto type_tensor = input_tensor.flat<tstring>();
        type_tensor = type_tensor.constant("");
        break;
      }
      default:
        LOG(FATAL) << "Unsupported input type: "
                   << DataTypeString(input.data_type);
    }
    input_tensors->push_back({input.name, input_tensor});
  }
}

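// Looks up the shape of every node named in |wanted_shapes|: input layers
// are answered directly from |inputs|, and everything else is fetched by
// running the graph once with synthetic input tensors.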
Status GetOutputShapes(const std::vector<InputLayerInfo>& inputs,
                       const std::set<string>& wanted_shapes, Session* session,
                       std::unordered_map<string, TensorShape>* node_shapes) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);
  std::vector<tensorflow::Tensor> output_tensors;
  std::vector<string> output_tensor_names;
  for (const string& wanted_shape : wanted_shapes) {
    bool is_input = false;
    for (const std::pair<string, tensorflow::Tensor>& input_tensor :
         input_tensors) {
      if (input_tensor.first == wanted_shape) {
        (*node_shapes)[wanted_shape] = input_tensor.second.shape();
        is_input = true;
        break;
      }
    }
    if (!is_input) {
      output_tensor_names.push_back(wanted_shape);
    }
  }
  TF_RETURN_IF_ERROR(
      session->Run(input_tensors, output_tensor_names, {}, &output_tensors));
  CHECK_EQ(output_tensors.size(), output_tensor_names.size());
  for (int i = 0; i < output_tensor_names.size(); ++i) {
    const string& wanted_shape_name = output_tensor_names[i];
    const TensorShape& found_shape = output_tensors[i].shape();
    (*node_shapes)[wanted_shape_name] = found_shape;
  }
  return OkStatus();
}

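// Estimates the FLOPs of the graph by applying a per-op-type formula to the
// convolution and matmul nodes, with their shapes resolved at runtime
// through GetOutputShapes().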
Status CalculateFlops(const GraphDef& graph,
                      const std::vector<InputLayerInfo>& inputs,
                      Session* session, int64_t* total_flops,
                      std::unordered_map<string, int64_t>* flops_by_op) {
  std::unordered_set<string> floppable_ops = {
      "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul",
      "DepthwiseConv2dNative"};

  std::set<string> wanted_shapes;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      for (const string& input : node.input()) {
        wanted_shapes.insert(input);
      }
      wanted_shapes.insert(node.name());
    }
  }
  std::unordered_map<string, TensorShape> found_shapes;
  TF_RETURN_IF_ERROR(
      GetOutputShapes(inputs, wanted_shapes, session, &found_shapes));

  *total_flops = 0;
  for (const NodeDef& node : graph.node()) {
    if (floppable_ops.count(node.op())) {
      int64_t current_flops = 0;
      // This is a very crude approximation to FLOPs that only looks at a few
      // op types that commonly form the bulk of the computation for many
      // models. It's included here because getting even an approximate value
      // for FLOPs is still very useful for estimating utilization, versus a
      // device's theoretical maximum FLOPs/second.
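      //
      // As a worked example of the Conv2D formula below, a 3x3 filter over
      // 64 input channels producing a 112x112x128 output costs
      // (112 * 112 * 128) * 64 * 3 * 3 * 2, or ~1.85 billion FLOPs, where
      // the final factor of 2 counts each multiply-add as two operations.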
      if ((node.op() == "Conv2D") || (node.op() == "QuantizedConv2D")) {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64_t filter_height = filter_shape.dim_size(0);
        int64_t filter_width = filter_shape.dim_size(1);
        int64_t filter_in_depth = filter_shape.dim_size(2);
        int64_t output_count = output_shape.num_elements();
        current_flops =
            output_count * filter_in_depth * filter_height * filter_width * 2;
      } else if ((node.op() == "MatMul") || (node.op() == "QuantizedMatMul")) {
        const bool transpose_a = node.attr().at("transpose_a").b();
        const TensorShape& a_shape = found_shapes[node.input(0)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64_t k;
        if (transpose_a) {
          k = a_shape.dim_size(0);
        } else {
          k = a_shape.dim_size(1);
        }
        int64_t output_count = output_shape.num_elements();
        current_flops = k * output_count * 2;
      } else if (node.op() == "DepthwiseConv2dNative") {
        const TensorShape& filter_shape = found_shapes[node.input(1)];
        const TensorShape& output_shape = found_shapes[node.name()];
        int64_t filter_height = filter_shape.dim_size(0);
        int64_t filter_width = filter_shape.dim_size(1);
        int64_t output_count = output_shape.num_elements();
        current_flops = output_count * filter_height * filter_width * 2;
      }
      (*flops_by_op)[node.op()] += current_flops;
      *total_flops += current_flops;
    }
  }
  return OkStatus();
}

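// Writes a single benchmark result (named |benchmark_name|, optionally
// suffixed with |postfix|) through TestReporter, so that it lands in the
// standard reporting location under |output_prefix|.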
void RecordBenchmarkEntry(const string& output_prefix,
                          const string& benchmark_name, const string& postfix,
                          int num_runs, double total_time_s,
                          double throughput = -1.0) {
  std::stringstream stream;
  stream << benchmark_name;
  if (!postfix.empty()) {
    stream << "_" << postfix;
  }

  TestReporter node_reporter(output_prefix, stream.str());
  TF_QCHECK_OK(node_reporter.Initialize());
  TF_QCHECK_OK(
      node_reporter.Benchmark(num_runs, -1.0, total_time_s, throughput));
  TF_QCHECK_OK(node_reporter.Close());
}

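// Sleeps for |sleep_seconds| of wall time; non-positive values are a no-op.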
void SleepSeconds(double sleep_seconds) {
  if (sleep_seconds <= 0.0) {
    return;
  }
#ifdef PLATFORM_WINDOWS
  Env::Default()->SleepForMicroseconds(sleep_seconds * 1000 * 1000);
#else
  // Convert the fractional seconds value into a timespec.
  timespec req;
  req.tv_sec = static_cast<time_t>(sleep_seconds);
  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
  nanosleep(&req, nullptr);
#endif
}

}  // namespace

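// Creates the session (optionally capping intra-op and inter-op parallelism
// at |num_threads|), reads |graph| as either a binary or a text GraphDef,
// and installs the graph in the session.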
Status InitializeSession(int num_threads, const string& graph,
                         std::unique_ptr<Session>* session,
                         std::unique_ptr<GraphDef>* graph_def) {
  LOG(INFO) << "Loading TensorFlow.";

  tensorflow::SessionOptions options;
  tensorflow::ConfigProto& config = options.config;
  if (num_threads > 0) {
    config.set_intra_op_parallelism_threads(num_threads);
    config.set_inter_op_parallelism_threads(num_threads);
  }
  LOG(INFO) << "Got config, " << config.device_count_size() << " devices";

  session->reset(tensorflow::NewSession(options));
  graph_def->reset(new GraphDef());
  Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
  if (!s.ok()) {
    // Fall back to the text format if the file isn't a binary proto.
    s = ReadTextProto(Env::Default(), graph, graph_def->get());
  }

  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
    return s;
  }

  s = (*session)->Create(*(graph_def->get()));
  if (!s.ok()) {
    LOG(ERROR) << "Could not create TensorFlow Session: " << s;
    return s;
  }

  return OkStatus();
}

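// Runs the graph once end-to-end, reporting the wall time through
// |inference_time_us|. When |stats| is non-null a full trace is requested
// and the resulting step stats are fed into the summarizer.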
Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
                    const std::vector<string>& outputs,
                    const std::vector<string>& targets, Session* session,
                    StatSummarizer* stats, int64_t* inference_time_us) {
  std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
  CreateTensorsFromInputInfo(inputs, &input_tensors);

  std::vector<tensorflow::Tensor> output_tensors;

  tensorflow::Status s;

  RunOptions run_options;
  if (stats != nullptr) {
    run_options.set_trace_level(RunOptions::FULL_TRACE);
  }

  RunMetadata run_metadata;
  const int64_t start_time = Env::Default()->NowMicros();
  s = session->Run(run_options, input_tensors, outputs, targets,
                   &output_tensors, &run_metadata);
  const int64_t end_time = Env::Default()->NowMicros();
  *inference_time_us = end_time - start_time;

  if (!s.ok()) {
    LOG(ERROR) << "Error during inference: " << s;
    return s;
  }

  if (stats != nullptr) {
    assert(run_metadata.has_step_stats());
    const StepStats& step_stats = run_metadata.step_stats();
    stats->ProcessStepStats(step_stats);
  }

  return s;
}

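// Calls RunBenchmark() repeatedly until |num_runs| iterations complete or
// the cumulative inference time exceeds |max_time_s| (num_runs <= 0 means
// run on time alone), accumulating total time and the actual run count.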
Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                        const std::vector<InputLayerInfo>& inputs,
                        const std::vector<string>& outputs,
                        const std::vector<string>& targets, Session* session,
                        StatSummarizer* stats, int64_t* total_time_us,
                        int64_t* actual_num_runs) {
  *total_time_us = 0;

  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
            << max_time_s << " seconds "
            << (stats != nullptr ? "with" : "without")
            << " detailed stat logging, with " << sleep_seconds
            << "s sleep between inferences";

  Stat<int64_t> stat;
  const bool until_max_time = num_runs <= 0;
  for (int i = 0; until_max_time || i < num_runs; ++i) {
    int64_t time;
    Status run_status =
        RunBenchmark(inputs, outputs, targets, session, stats, &time);
    // Check the run status before recording it, so a failed run is neither
    // counted in the stats nor silently swallowed by the max-time break.
    if (!run_status.ok()) {
      LOG(INFO) << "Failed on run " << i;
      return run_status;
    }
    stat.UpdateStat(time);
    (*total_time_us) += time;
    ++(*actual_num_runs);

    if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
      break;
    }

    // If requested, sleep between runs for an arbitrary amount of time.
    // This can be helpful to determine the effect of mobile processor
    // scaling and thermal throttling.
    if (sleep_seconds > 0.0) {
      SleepSeconds(sleep_seconds);
    }
  }
  std::stringstream stream;
  stat.OutputToStream(&stream);
  LOG(INFO) << stream.str() << std::endl;

  return OkStatus();
}

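// Parses the command-line flags, loads and initializes the graph, runs a
// warmup pass, an untraced timing pass, and a traced timing pass, then
// reports timings, an optional FLOPs estimate, and per-op-type stats.
//
// Illustrative invocation, using the flag defaults below (the graph path is
// only an example):
//   benchmark_model --graph=/data/local/tmp/tensorflow_inception_graph.pb \
//     --input_layer="input:0" --input_layer_shape="1,224,224,3" \
//     --input_layer_type="float" --output_layer="output:0"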
int Main(int argc, char** argv) {
  string graph = "/data/local/tmp/tensorflow_inception_graph.pb";
  string init_ops_string = "";
  string input_layer_string = "input:0";
  string input_layer_shape_string = "1,224,224,3";
  string input_layer_type_string = "float";
  string input_layer_values_string = "";
  string output_layer_string = "output:0";
  string target_layer_string = "";
  int max_num_runs = 1000;
  string max_time = "10.0";
  string inference_delay = "-1.0";
  string inter_benchmark_delay = "-1.0";
  int num_threads = -1;
  string benchmark_name = "";
  string output_prefix = "";
  bool show_sizes = false;
  bool show_run_order = true;
  int run_order_limit = 0;
  bool show_time = true;
  int time_limit = 10;
  bool show_memory = true;
  int memory_limit = 10;
  bool show_type = true;
  bool show_summary = true;
  bool show_flops = false;
  int warmup_runs = 1;

  std::vector<Flag> flag_list = {
      Flag("graph", &graph, "graph file name"),
      Flag("init_ops", &init_ops_string, "init ops"),
      Flag("input_layer", &input_layer_string, "input layer names"),
      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
      Flag("input_layer_values", &input_layer_values_string,
           "values to initialize the inputs with"),
      Flag("output_layer", &output_layer_string, "output layer name"),
      Flag("target_layer", &target_layer_string, "target layer name"),
      Flag("max_num_runs", &max_num_runs, "maximum number of runs"),
      Flag("max_time", &max_time, "maximum length of time to run, in seconds"),
      Flag("inference_delay", &inference_delay,
           "delay between runs in seconds"),
      Flag("inter_benchmark_delay", &inter_benchmark_delay,
           "delay between benchmarks in seconds"),
      Flag("num_threads", &num_threads, "number of threads"),
      Flag("benchmark_name", &benchmark_name, "benchmark name"),
      Flag("output_prefix", &output_prefix, "benchmark output prefix"),
      Flag("show_sizes", &show_sizes, "whether to show sizes"),
      Flag("show_run_order", &show_run_order,
           "whether to list stats by run order"),
      Flag("run_order_limit", &run_order_limit,
           "how many items to show by run order"),
      Flag("show_time", &show_time, "whether to list stats by time taken"),
      Flag("time_limit", &time_limit, "how many items to show by time taken"),
      Flag("show_memory", &show_memory, "whether to list stats by memory used"),
      Flag("memory_limit", &memory_limit,
           "how many items to show by memory used"),
      Flag("show_type", &show_type, "whether to list stats by op type"),
      Flag("show_summary", &show_summary,
           "whether to show a summary of the stats"),
      Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
      Flag("warmup_runs", &warmup_runs, "how many runs to initialize the model"),
  };
  string usage = Flags::Usage(argv[0], flag_list);
  const bool parse_result = Flags::Parse(&argc, argv, flag_list);

  if (!parse_result) {
    LOG(ERROR) << usage;
    return -1;
  }

  std::vector<string> init_ops = str_util::Split(init_ops_string, ',');
  std::vector<string> input_layers = str_util::Split(input_layer_string, ',');
  std::vector<string> input_layer_shapes =
      str_util::Split(input_layer_shape_string, ':');
  std::vector<string> input_layer_types =
      str_util::Split(input_layer_type_string, ',');
  std::vector<string> input_layer_values =
      str_util::Split(input_layer_values_string, ':');
  std::vector<string> output_layers = str_util::Split(output_layer_string, ',');
  std::vector<string> target_layers = str_util::Split(target_layer_string, ',');
  if ((input_layers.size() != input_layer_shapes.size()) ||
      (input_layers.size() != input_layer_types.size())) {
    LOG(ERROR) << "There must be the same number of items in --input_layer,"
               << " --input_layer_shape, and --input_layer_type, for example"
               << " --input_layer=input1,input2 --input_layer_type=float,float"
               << " --input_layer_shape=1,224,224,4:1,20";
    LOG(ERROR) << "--input_layer=" << input_layer_string << " ("
               << input_layers.size() << " items)";
    LOG(ERROR) << "--input_layer_type=" << input_layer_type_string << " ("
               << input_layer_types.size() << " items)";
    LOG(ERROR) << "--input_layer_shape=" << input_layer_shape_string << " ("
               << input_layer_shapes.size() << " items)";
    return -1;
  }
  const size_t inputs_count = input_layers.size();

  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
  if (argc > 1) {
    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
    return -1;
  }

  LOG(INFO) << "Graph: [" << graph << "]";
  LOG(INFO) << "Init ops: [" << init_ops_string << "]";
  LOG(INFO) << "Input layers: [" << input_layer_string << "]";
  LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
  LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
  LOG(INFO) << "Output layers: [" << output_layer_string << "]";
  LOG(INFO) << "Target layers: [" << target_layer_string << "]";
  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
            << "]";
  LOG(INFO) << "Num threads: [" << num_threads << "]";
  LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
  LOG(INFO) << "Output prefix: [" << output_prefix << "]";
  LOG(INFO) << "Show sizes: [" << show_sizes << "]";
  LOG(INFO) << "Warmup runs: [" << warmup_runs << "]";

  std::unique_ptr<Session> session;
  std::unique_ptr<StatSummarizer> stats;
  std::unique_ptr<GraphDef> graph_def;

  int64_t initialization_start_us = Env::Default()->NowMicros();
  Status initialize_status =
      InitializeSession(num_threads, graph, &session, &graph_def);
  int64_t initialization_end_us = Env::Default()->NowMicros();
  double initialization_time_s =
      (initialization_end_us - initialization_start_us) / 1000000.0;
  LOG(INFO) << "Initialized session in " << initialization_time_s << "s";
  if (!initialize_status.ok()) {
    return -1;
  }

  if (!init_ops.empty()) {
    Status initialize_variables_status =
        InitializeVariables(session.get(), init_ops);
    if (!initialize_variables_status.ok()) {
      LOG(ERROR) << "Graph variables initialization failed with "
                 << initialize_variables_status;
      return -1;
    }
  }

  StatSummarizerOptions stats_options;
  stats_options.show_run_order = show_run_order;
  stats_options.run_order_limit = run_order_limit;
  stats_options.show_time = show_time;
  stats_options.time_limit = time_limit;
  stats_options.show_memory = show_memory;
  stats_options.memory_limit = memory_limit;
  stats_options.show_type = show_type;
  stats_options.show_summary = show_summary;
  stats.reset(new tensorflow::StatSummarizer(stats_options));

  const double inter_inference_sleep_seconds =
      std::strtod(inference_delay.c_str(), nullptr);
  const double inter_benchmark_sleep_seconds =
      std::strtod(inter_benchmark_delay.c_str(), nullptr);
  const double max_benchmark_time_seconds =
      std::strtod(max_time.c_str(), nullptr);

  std::vector<InputLayerInfo> inputs;
  for (int n = 0; n < inputs_count; ++n) {
    InputLayerInfo input;
    CHECK(DataTypeFromString(input_layer_types[n], &input.data_type))
        << input_layer_types[n] << " was an invalid type";

    std::vector<string> split_layer_shapes =
        str_util::Split(input_layer_shapes[n], ',');
    for (const string& layer_shape : split_layer_shapes) {
      int32_t tmp;
      CHECK(strings::safe_strto32(layer_shape, &tmp))
          << "Incorrect size string specified: " << input_layer_shapes[n];
      if (tmp == -1) {
        LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced"
                   << " with the size you want to benchmark with.";
        return -1;
      } else {
        input.shape.AddDim(tmp);
      }
    }
    input.name = input_layers[n];
    if (n < input_layer_values.size()) {
      std::vector<string> string_tokens =
          str_util::Split(input_layer_values[n], ',');
      input.initialization_values.clear();
      input.initialization_values.reserve(string_tokens.size());
      for (const string& str_val : string_tokens) {
        float val;
        CHECK(strings::safe_strtof(str_val, &val))
            << "Incorrect initialization values string specified: "
            << input_layer_values[n];
        input.initialization_values.push_back(val);
      }
    }
    inputs.push_back(input);
  }

  // If requested, run through the graph first to preinitialize everything
  // before the benchmarking runs.
  int64_t warmup_time_us = 0;
  int64_t num_warmup_runs = 0;
  if (warmup_runs > 0) {
    Status warmup_time_status =
        TimeMultipleRuns(inter_inference_sleep_seconds, warmup_runs, -1.0,
                         inputs, output_layers, target_layers, session.get(),
                         nullptr, &warmup_time_us, &num_warmup_runs);
    if (!warmup_time_status.ok()) {
      LOG(ERROR) << "Timing failed with " << warmup_time_status;
      return -1;
    }
  }

  // Capture overall inference time without stat logging overhead. This is the
  // timing data that can be compared to other libraries.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64_t no_stat_time_us = 0;
  int64_t no_stat_num_runs = 0;
  Status no_stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), nullptr,
      &no_stat_time_us, &no_stat_num_runs);
  const double no_stat_wall_time = no_stat_time_us / 1000000.0;
  if (!no_stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << no_stat_time_status;
    return -1;
  }

  // Run again to gather detailed log stats to get a better idea of where
  // relative time is going within the graph.
  SleepSeconds(inter_benchmark_sleep_seconds);
  int64_t stat_time_us = 0;
  int64_t stat_num_runs = 0;
  Status stat_time_status = TimeMultipleRuns(
      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
      inputs, output_layers, target_layers, session.get(), stats.get(),
      &stat_time_us, &stat_num_runs);
  if (!stat_time_status.ok()) {
    LOG(ERROR) << "Timing failed with " << stat_time_status;
    return -1;
  }

  LOG(INFO) << "Average inference timings in us: "
            << "Warmup: "
            << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
            << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
            << "with stats: " << stat_time_us / stat_num_runs;

  stats->PrintStepStats();

  if (show_sizes) {
    stats->PrintOutputs();
  }

  if (show_flops) {
    int64_t total_flops;
    std::unordered_map<string, int64_t> flops_by_op;
    Status flop_status = CalculateFlops(*graph_def, inputs, session.get(),
                                        &total_flops, &flops_by_op);
    if (!flop_status.ok()) {
      LOG(ERROR) << "FLOPs calculation failed with " << flop_status;
      return -1;
    }
    string pretty_flops;
    if (total_flops < 1000) {
      pretty_flops = strings::StrCat(total_flops, " FLOPs");
    } else if (total_flops < (1000 * 1000)) {
      const float rounded_flops = (total_flops / 1000.0f);
      pretty_flops = strings::StrCat(rounded_flops, "k FLOPs");
    } else if (total_flops < (1000 * 1000 * 1000)) {
      const float rounded_flops = round(total_flops / 1000.0f) / 1000.0f;
      pretty_flops = strings::StrCat(rounded_flops, " million FLOPs");
    } else {
      const float rounded_flops =
          round(total_flops / (1000.0f * 1000.0f)) / 1000.0f;
      pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
    }
    LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
    const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
    LOG(INFO) << "FLOPs/second: "
              << strings::HumanReadableNum(
                     static_cast<int64_t>(total_flops / mean_run_time));
  }

  if (!benchmark_name.empty() && !output_prefix.empty()) {
    // Compute the total number of values per input.
    int64_t total_size = inputs[0].shape.num_elements();

    // Throughput in MB/s.
    const double throughput =
        DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
        static_cast<double>(no_stat_wall_time) / (1024 * 1024);

    // Report the stats.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "", no_stat_num_runs,
                         no_stat_wall_time, throughput);

    // Session initialization time.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-init", 1,
                         initialization_time_s);

    // First inference time. Note: if warmup_runs is > 1 this will actually be
    // an average of all the warmup runs.
    RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-first-inference",
                         warmup_runs, warmup_time_us / 1000000.0);

    // Time from starting to initialize TF to getting the first result back.
    // This also assumes that only one warmup run is performed.
    RecordBenchmarkEntry(
        output_prefix, benchmark_name, "meta-init-plus-first-inference", 1,
        initialization_time_s + (warmup_time_us / 1000000.0) / warmup_runs);

    std::map<std::string, int64_t> node_type_map_count;
    std::map<std::string, int64_t> node_type_map_time;
    std::map<std::string, int64_t> node_type_map_memory;
    std::map<std::string, int64_t> node_type_map_times_called;

    int64_t accumulated_us;
    stats->ComputeStatsByType(&node_type_map_count, &node_type_map_time,
                              &node_type_map_memory,
                              &node_type_map_times_called, &accumulated_us);
    for (const auto& time : node_type_map_time) {
      LOG(INFO) << "Outputting: [" << time.first << "]";
      RecordBenchmarkEntry(output_prefix, benchmark_name, time.first,
                           stat_num_runs,
                           (time.second * stat_num_runs) / 1000000.0f);
    }
  }

  return 0;
}

}  // namespace benchmark_model
}  // namespace tensorflow