/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/util/stat_summarizer.h"

#include <iomanip>
#include <map>
#include <queue>
#include <sstream>
#include <string>

#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

using Detail = StatsCalculator::Detail;

StatSummarizer::StatSummarizer(const StatSummarizerOptions& options)
    : stats_calculator_(new StatsCalculator(options)) {}

StatSummarizer::StatSummarizer(const tensorflow::GraphDef& tensorflow_graph)
    : stats_calculator_(new StatsCalculator(StatSummarizerOptions())) {}

StatSummarizer::~StatSummarizer() {}

void StatSummarizer::Validate(const std::vector<TensorDescription>* outputs,
                              const NodeExecStats& ns) const {
  if (outputs->size() != ns.output_size()) {
    LOG(WARNING) << "Number of outputs changed between runs for '"
                 << ns.node_name() << "' - was " << outputs->size() << ", now "
                 << ns.output_size();
  } else {
    for (const auto& output : ns.output()) {
      const int32_t slot = output.slot();
      if ((slot < 0) || (slot >= ns.output_size())) {
        // This is not a hard error for Switch ops, so just pass.
        continue;
      }
      const auto& stored = (*outputs)[slot];
      const auto& current = output.tensor_description();

      bool do_tensors_match =
          (stored.dtype() == current.dtype()) &&
          (stored.shape().dim_size() == current.shape().dim_size());

      if (do_tensors_match) {
        for (int i = 0; i < stored.shape().dim_size(); ++i) {
          if (stored.shape().dim(i).size() != current.shape().dim(i).size()) {
            do_tensors_match = false;
            break;
          }
        }
      }

      if (!do_tensors_match) {
        LOG(WARNING) << "Output tensor changed between runs for '"
                     << ns.node_name() << "'";
      }
    }
  }
}

void StatSummarizer::PrintStepStats() const {
  string output = GetOutputString();
  std::istringstream iss(output);
  for (std::string line; std::getline(iss, line);) {
    LOG(INFO) << line;
  }
}

namespace {
std::string OpType(const DeviceStepStats& ds, const NodeExecStats& ns) {
  // There is no published specification of how DeviceStats and NodeStats
  // are filled in. Thus, we live with the fragility of this implementation.
  //
  // Note that NodeStats.node_name may NOT refer to a node in the Graph.
  // This can happen if either:
  // (1) The DeviceStats corresponds to statistics from the GPUTracer
  //     logging (which adds devices whose name contains either "/stream"
  //     or "/memcpy" to the StepStats), OR
  // (2) The graph was partitioned, and thus the NodeStats refers to
  //     the SendTensor or RecvTensor operations added.
  // For these cases, return "<>" as the "type" of the operation.
  //
  // The StatSummarizer was initially aimed at CPU execution on mobile, where
  // there was no GPUTracing and no graph partitioning, so the conditions above
  // do not occur.
  //
  // It would be nice to have a clearer spec for StepStats so utilities such as
  // this class can handle nodes that do not appear in the original graph
  // gracefully. Until then, duplicate what is done by:
  // https://www.tensorflow.org/code/tensorflow/python/client/timeline.py
  // and rely on the unittest.
  if (ds.device().find("/stream") != std::string::npos ||
      ds.device().find("/memcpy") != std::string::npos) {
    // Stats from the GPUTracer do not correspond to TensorFlow ops.
    return "<>";
  }
  // timeline_label should be of the format: <node_name> = <op_type>(<args>).
  // Extract <op_type>.
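  // For example, a (hypothetical) label "add_1 = AddV2(x, y)" yields the
  // op type "AddV2".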
  const std::string sep(" = ");
  const std::string& label = ns.timeline_label();
  std::string::size_type start = label.find(sep);
  if (start == std::string::npos) return "<>";
  start += sep.size();
  std::string::size_type end = label.find('(', start);
  if (end == std::string::npos) return "<>";
  return label.substr(start, end - start);
}
}  // namespace

void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
  int64_t curr_total_us = 0;
  int64_t mem_total = 0;

  int node_num = 0;
  for (const auto& ds : step_stats.dev_stats()) {
    for (const auto& ns : ds.node_stats()) {
      // NOTE(blackhc): To better support GPUs:
      // GPU kernels are duplicated both in /stream:all and in their
      // /stream:$index. GPU memcpys are duplicated both in /memcpy and in
      // their /stream:$index. So keep only /stream:all and /memcpy, and
      // ignore all /stream:$index, to count each GPU execution only once.
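      // For example, a kernel recorded under both "/device:GPU:0/stream:all"
      // and "/device:GPU:0/stream:0" (illustrative device names) is counted
      // only from the /stream:all entry.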
      if (ds.device().find("/stream") != std::string::npos &&
          ds.device().find("/stream:all") == std::string::npos) {
        continue;
      }
      // NOTE(fishx): We record op execution time twice: once as a CPU
      // activity with device name "/host:CPU", and once as TF runtime
      // activity with a device name starting with "/job:*". It is safe to
      // ignore the CPU activities here.
      // TODO(b/138729463): Read op execution time from CPU activities instead
      // of runtime activities.
      if (ds.device().find("/host:CPU") != std::string::npos) {
        continue;
      }

      std::string name = ns.node_name();
      std::string op_type = "<>";
      // NOTE(blackhc): We have to ensure that all keys into the detail map
      // are unique, so we add [Kernel] or [MemCpy] as a suffix to the name.
      // To make the node type summary work better, we prefix "gpu:" to
      // the op type when the info is from a /gpu/stream or /memcpy channel.
      if (ds.device().find("/stream") != std::string::npos) {
        // node_name: name ":" opType
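        // e.g. a (hypothetical) node_name "conv1:Conv2D" becomes name
        // "conv1 [Kernel]" with op_type "gpu:Conv2D".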
        auto parts = str_util::Split(ns.node_name(), ':');
        if (parts.size() == 2) {
          name = parts[0] + " [Kernel]";
          op_type = "gpu:" + parts[1];
        }
      } else if (ds.device().find("/memcpy") != std::string::npos) {
        // node_name: name (":" opType)? ":" memCpyType
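        // e.g. a (hypothetical) node_name "edge_8_conv1:MEMCPYDtoH" becomes
        // name "edge_8_conv1 [MemCpy]" with op_type "gpu:MEMCPYDtoH".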
        auto parts = str_util::Split(ns.node_name(), ':');
        if (parts.size() == 2 || parts.size() == 3) {
          name = parts.front() + " [MemCpy]";
          // We don't care about the actual op type (it might not be available
          // for edge_ memcpys). We only care that it's a memcpy for now.
          op_type = "gpu:" + parts.back();
        }
      } else {
        op_type = OpType(ds, ns);
      }

      ++node_num;
      const int64_t curr_time = ns.all_end_rel_micros();
      curr_total_us += curr_time;
      auto output_result =
          outputs_.emplace(name, std::vector<TensorDescription>());
      std::vector<TensorDescription>* outputs = &(output_result.first->second);

      int64_t rel_end_us = curr_time;

      // If this is the first pass, initialize some values.
      if (output_result.second) {
        outputs->resize(ns.output_size());
        for (const auto& output : ns.output()) {
          const int32_t slot = output.slot();
          if ((slot < 0) || (slot >= ns.output_size())) {
            // This is not a hard error for Switch ops, so just pass.
            continue;
          }
          (*outputs)[slot] = output.tensor_description();
        }
      }

      int64_t curr_node_mem = 0;
      for (const auto& mem : ns.memory()) {
        const int64_t mem_usage = mem.total_bytes();
        curr_node_mem += mem_usage;
      }
      stats_calculator_->AddNodeStats(name, op_type, node_num, rel_end_us,
                                      curr_node_mem);

      mem_total += curr_node_mem;

      Validate(outputs, ns);
    }
  }

  stats_calculator_->UpdateRunTotalUs(curr_total_us);
  stats_calculator_->UpdateMemoryUsed(mem_total);
}
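
// Sketch of typical usage (hypothetical `session`, `inputs`, `output_names`,
// and `outputs` are assumed to be supplied by the caller):
//
//   RunOptions run_options;
//   run_options.set_trace_level(RunOptions::FULL_TRACE);
//   RunMetadata run_metadata;
//   TF_CHECK_OK(session->Run(run_options, inputs, output_names, {}, &outputs,
//                            &run_metadata));
//   StatSummarizer stats{StatSummarizerOptions()};
//   stats.ProcessStepStats(run_metadata.step_stats());
//   stats.PrintStepStats();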
void StatSummarizer::PrintOutputs() const {
  std::priority_queue<
      std::pair<int64_t, const std::pair<const std::string, Detail>*>>
      timings;
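  // std::priority_queue is a max-heap, so run_order is negated below to pop
  // entries in ascending run order.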
  for (const auto& entry : stats_calculator_->GetDetails()) {
    timings.emplace(-entry.second.run_order, &entry);
  }

  LOG(INFO) << "============ Node output tensor sizes in run order ========";
  while (!timings.empty()) {
    auto entry = timings.top();
    timings.pop();
    std::stringstream stream;
    const auto& detail_outputs = outputs_.at(entry.second->first);
    stream << entry.second->first << "\t" << detail_outputs.size();
    for (const auto& tensor : detail_outputs) {
      stream << "\t" << DataTypeString(tensor.dtype());
      stream << "\t" << tensor.shape().dim_size();
      for (const auto& d : tensor.shape().dim()) {
        stream << "\t" << d.size();
      }
    }
    LOG(INFO) << stream.str();
  }
}

}  // namespace tensorflow