/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>

#include <utility>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/device_name_utils.h"
#include "tensorflow/core/util/overflow.h"

namespace tensorflow {
namespace grappler {

static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

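// Returns the tensors stored in a "value"-style attribute: the single tensor
// for AttrValue::kTensor, every listed tensor for AttrValue::kList, and an
// empty vector for any other value case. For example (illustrative textproto,
// not taken from this file), both
//   tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } int_val: 7 }
// and
//   list { tensor { ... } tensor { ... } }
// yield non-empty results here.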
static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {
    }
  }
  return tensors;
}

// Annotate the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
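// Two kinds of hints are synthesized into op_info->attr(), keyed on the input
// index i (these attribute names are built by this function below):
//   "input_<i>_filesize": the size in bytes of a file named by a constant
//       "filename" input, when the file can be stat'ed.
//   "parent_<i>_op": the op type of the node producing a "handle" input,
//       since the handle alone says little about the op's memory use.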
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;

    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute of a Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For a filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string& filename = tensor.scalar<tstring>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = absl::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the information
    // in the op itself is not sufficient to predict the op's memory use.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != string::npos) {
      string new_key = absl::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only the parent node's op name is copied. Copy its
      // inputs and attributes when necessary.
    }
  }
}

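// Looks up each of `node`'s data inputs in the cost graph and returns its
// dtype and shape; control inputs are skipped, and inputs whose producer (or
// producer output info) is missing fall back to UnknownInput().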
std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}

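// Estimates a tensor's size in bytes as num_elements * sizeof(dtype),
// treating statically-unknown dimensions as 1. For example (made-up
// properties): a DT_FLOAT tensor of shape [-1, 128] is costed as
// 1 * 128 * 4 = 512 bytes. Returns -1 if the element count overflows int64.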
int64_t CalculateTensorSize(const OpInfo::TensorProperties& prop) {
  int64_t size = DataTypeSize(BaseType(prop.dtype()));
  TensorShapeProto shape = prop.shape();

  // Can't infer the size if the rank is unknown. It has to be at least a
  // scalar though.
  if (shape.unknown_rank()) {
    VLOG(2) << "CalculateTensorSize() -- unknown rank";
    return size;
  }

  // If one of the dimensions is unknown statically, assume it's at least one.
  for (int i = 0; i < shape.dim_size(); ++i) {
    if (shape.dim(i).size() < 0) {
      shape.mutable_dim(i)->set_size(1);
      VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
    }
  }

  int64_t num_elems = TensorShape(shape).num_elements();
  int64_t tensor_size = MultiplyWithoutOverflow(num_elems, size);
  if (tensor_size < 0) {
    VLOG(1) << "Overflow encountered when computing tensor size, multiplying "
            << num_elems << " with " << size;
    return -1;
  }
  return tensor_size;
}

int64_t CalculateOutputSize(
    const std::vector<OpInfo::TensorProperties>& output_properties,
    const int port_num) {
  if (port_num < 0) return 4;  // 4B for control dependency.

  if (port_num >= output_properties.size()) {
    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
               << " >= output_properties.size(): " << output_properties.size();
    return 0;
  }

  return CalculateTensorSize(output_properties[port_num]);
}

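// Parses a fully qualified device string into DeviceProperties. As an
// illustrative example (hypothetical device name, not from this file), a
// string such as "/job:localhost/replica:0/task:0/device:GPU:0" resolves to
// the local GPU's properties; unparsable names yield type "UNKNOWN".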
DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceProperties unknown;
  unknown.set_type("UNKNOWN");

  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      TfDeviceId tf_device_id(parsed.id);
      PlatformDeviceId platform_device_id;
      Status s =
          GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
      if (!s.ok()) {
        // We are probably running a simulation without linking the CUDA
        // libraries.
        platform_device_id = PlatformDeviceId(parsed.id);
      }
      return GetLocalGPUInfo(platform_device_id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  return unknown;
}

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}

OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

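// Renders a short human-readable summary of an op. Illustratively (made-up
// op and shapes), a MatMul with inputs of shape [?,128] and [128,256] is
// described as "[Op=MatMul, input_shapes=[[?,128][128,256]]]".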
string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  description += "]]";  // Close both input_shapes and the outer bracket.
  return description;
}

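// Folds a CostGraphDef and the GraphDef it was derived from into an
// OpPerformanceList, one entry per node present in both. Note the unit
// change below: cost-graph times are in microseconds, while OpPerformance
// stores nanoseconds, hence the multiplications by 1000.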
OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip nodes that are not in the cost graph: either they aren't run
    // (they fall outside the intersection of the transitive fan-in of a fetch
    // node and the transitive fan-out of an input), or they were optimized
    // away. Since they don't contribute to the execution time, we simply
    // discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is in microseconds, while
    // OpPerformance.compute_cost is in nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}

void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  // Element-wise sum of the two bucket arrays.
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

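// Formats the histogram, one line per non-empty bucket with its size range,
// count, percentage, cumulative percentage, and a proportional bar of '#'
// marks. A hypothetical line (values invented for illustration) looks
// roughly like:
//   [       1.0KiB,       2.0KiB)     123  45.600%  78.900% ##################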
string TensorSizeHistogram::ToString() const {
  string r = absl::StrFormat(
      "Count: %lld, Average: %s, Min: %s, Max: %s"
      "\n------------------------------------------------------\n",
      num_elem_, strings::HumanReadableNumBytes(Average()),
      strings::HumanReadableNumBytes(min_),
      strings::HumanReadableNumBytes(max_));
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    uint64 left = i == 0 ? 0ULL : 1ULL << (i - 1);
    uint64 right = 1ULL << i;
    absl::StrAppendFormat(&r, "[ %12s, %12s) %7d %7.3f%% %7.3f%% ",
                          strings::HumanReadableNumBytes(left),
                          strings::HumanReadableNumBytes(right),
                          buckets_[i],         // count
                          mult * buckets_[i],  // percentage
                          mult * cumul_sum);   // cumulative percentage

    // Add hash marks based on percentage; 40 marks for 100%. Adding
    // num_elem_ / 2 before the division rounds to the nearest mark.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
    absl::StrAppendFormat(&r, "%s\n", std::string(marks, '#'));
  }
  return r;
}

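// Maps a byte size to its histogram bucket: bucket 0 holds size 0, and bucket
// i (for i >= 1) holds sizes in [2^(i-1), 2^i), capped at kMaxBuckets - 1.
// For example: Index(0) == 0, Index(1) == 1, Index(3) == 2, Index(4) == 3.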
const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}

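// Reduces a device name to a "/<job>/<device type>" class string. If the name
// doesn't parse as-is, underscores are first rewritten to colons; e.g. a
// hypothetical "/job_worker/replica_0/task_0/device_GPU_0" is repaired to
// "/job:worker/replica:0/task:0/device:GPU:0", which classifies as
// "/worker/GPU". Names that still don't parse return "Unclassified".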
string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return absl::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}

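// Like GetDeviceClassForNonChannelDevice, but with special handling for
// channel devices named "Channel_from_<src>_to_<dst>". Illustratively (a
// made-up name), a channel from a CPU device to a GPU device in job "w"
// maps to "Channel: /w/CPU -> /w/GPU".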
string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): channel device names follow the convention we currently have
  // in VirtualScheduler. This should be revised with VirtualScheduler as well
  // as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return absl::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}

string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histograms: with verbosity set, output a per-device
  // histogram in addition to the aggregated per-class histogram.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate tensor size from tensor_description's dtype
        // and shape, instead of using optional allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

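// Sets costs->execution_time from the component times. With overlap, compute,
// memory, and intermediate-memory work are assumed to proceed concurrently,
// so the slowest component dominates; without overlap they are serialized and
// sum. E.g. (made-up numbers) compute=10, memory=4, and intermediate=2 give
// an execution time of 10 with overlap and 16 without.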
void CombineCostsAndUpdateExecutionTime(bool compute_memory_overlap,
                                        Costs* costs) {
  if (compute_memory_overlap) {
    costs->execution_time =
        std::max(costs->intermediate_memory_time,
                 std::max(costs->compute_time, costs->memory_time));
  } else {
    costs->execution_time = costs->compute_time + costs->memory_time +
                            costs->intermediate_memory_time;
  }
}
}  // end namespace grappler
}  // end namespace tensorflow