/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>

#include <algorithm>
#include <functional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/device_name_utils.h"
#include "tensorflow/core/util/overflow.h"

namespace tensorflow {
namespace grappler {

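// Returns TensorProperties with an invalid dtype and unknown rank, used as a
// placeholder when an input's actual properties cannot be determined.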
static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

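// Collects the TensorProtos stored in an attribute value: the single tensor
// of a "tensor" attribute, every tensor of a "list(tensor)" attribute, and
// nothing for any other attribute type.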
static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {
    }
  }
  return tensors;
}

// Annotate the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;

    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute in Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string& filename = tensor.scalar<tstring>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = absl::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the
    // information in the op itself is not sufficient to predict the op's
    // memory usage.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != string::npos) {
      string new_key = absl::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only parent node's op name is copied. Copy inputs
      // and attributes when necessary.
    }
  }
}

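// Looks up the dtype and shape of each non-control input of `node` in the
// cost graph, falling back to UnknownInput() when the producing node is
// missing from the cost graph or carries no output info.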
std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}

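// Returns a lower-bound estimate of the tensor size in bytes: an unknown rank
// is treated as a scalar and unknown dimensions are assumed to be 1. For
// example, a DT_FLOAT tensor of shape [32, -1, 8] is estimated at
// 32 * 1 * 8 * 4 = 1024 bytes. Returns -1 if the multiplication overflows.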
int64_t CalculateTensorSize(const OpInfo::TensorProperties& prop) {
  int64_t size = DataTypeSize(BaseType(prop.dtype()));
  TensorShapeProto shape = prop.shape();

  // Can't infer the size if the rank is unknown. It has to be at least a
  // scalar though.
  if (shape.unknown_rank()) {
    VLOG(2) << "CalculateTensorSize() -- unknown rank";
    return size;
  }

  // If one of the dimensions is unknown statically, assume it's at least one.
  for (int i = 0; i < shape.dim_size(); ++i) {
    if (shape.dim(i).size() < 0) {
      shape.mutable_dim(i)->set_size(1);
      VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
    }
  }

  int64_t num_elems = TensorShape(shape).num_elements();
  int64_t tensor_size = MultiplyWithoutOverflow(num_elems, size);
  if (tensor_size < 0) {
    VLOG(1) << "Overflow encountered when computing tensor size, multiplying "
            << num_elems << " with " << size;
    return -1;
  }
  return tensor_size;
}

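// Returns the estimated size in bytes of the tensor produced at output port
// `port_num` of a node; a negative port denotes a control dependency, which
// is costed at a nominal 4 bytes.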
int64_t CalculateOutputSize(
    const std::vector<OpInfo::TensorProperties>& output_properties,
    const int port_num) {
  if (port_num < 0) return 4;  // 4B for control dependency.

  if (port_num >= output_properties.size()) {
    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
               << " >= output_properties.size(): " << output_properties.size();
    return 0;
  }

  return CalculateTensorSize(output_properties[port_num]);
}

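// Returns the properties of the local CPU or GPU that a fully qualified
// device name (e.g. "/job:localhost/replica:0/task:0/device:GPU:0") refers
// to; any other or unparsable name yields a device of type "UNKNOWN".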
DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceProperties unknown;
  unknown.set_type("UNKNOWN");

  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      TfDeviceId tf_device_id(parsed.id);
      PlatformDeviceId platform_device_id;
      Status s =
          GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
      if (!s.ok()) {
        // We are probably running a simulation without linking the CUDA
        // libraries.
        platform_device_id = PlatformDeviceId(parsed.id);
      }
      return GetLocalGPUInfo(platform_device_id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  return unknown;
}

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}

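// Assembles an OpInfo from the node's op name, attributes, and the given
// input properties; the device field is left unset for the caller to fill in.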
OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

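// Builds a short, human-readable description of an op and its input shapes,
// intended for logging and debugging.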
string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  // Close both the input_shapes list and the outer bracket.
  description += "]]";
  return description;
}

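// Converts a cost graph obtained from a session run, together with the
// corresponding GraphDef, into per-op performance records. CostGraphDef
// costs are in microseconds and are converted to the nanoseconds used by
// OpPerformance.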
OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip the nodes that are not in the cost graph: these are nodes that
    // aren't run, because they aren't in the intersection of transitive
    // fan-in of a fetch node and the transitive fan-out of an input, or nodes
    // that were optimized away by the optimizer. Since they don't contribute
    // to the execution time we simply discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is microseconds, while
    // OpPerformance.compute_cost is nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}

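// Records one tensor size observation: updates the running count, sum,
// minimum, and maximum, and increments the matching power-of-two bucket.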
void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

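// Folds another histogram into this one by summing counts and per-bucket
// totals and widening the min/max range.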
void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

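// Renders the histogram as text: a summary line (count, average, min, max)
// followed by one row per non-empty power-of-two bucket showing its count,
// percentage, cumulative percentage, and a proportional bar of '#' marks
// (40 marks corresponds to 100%).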
string TensorSizeHistogram::ToString() const {
  string r = absl::StrFormat(
      "Count: %lld, Average: %s, Min: %s, Max: %s"
      "\n------------------------------------------------------\n",
      num_elem_, strings::HumanReadableNumBytes(Average()),
      strings::HumanReadableNumBytes(min_),
      strings::HumanReadableNumBytes(max_));
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    uint64 left = i == 0 ? 0ULL : 1ULL << (i - 1);
    uint64 right = 1ULL << i;
    absl::StrAppendFormat(&r, "[ %12s, %12s) %7d %7.3f%% %7.3f%% ",
                          strings::HumanReadableNumBytes(left),
                          strings::HumanReadableNumBytes(right),
                          buckets_[i],         // count
                          mult * buckets_[i],  // percentage
                          mult * cumul_sum);   // cumulative percentage

    // Add hash marks based on percentage; 40 marks for 100%.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
    absl::StrAppendFormat(&r, "%s\n", std::string(marks, '#'));
  }
  return r;
}

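// Maps a value to its power-of-two bucket: 0 -> bucket 0, 1 -> bucket 1,
// 2-3 -> bucket 2, 4-7 -> bucket 3, and so on, capped at kMaxBuckets - 1.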
const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}

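// Reduces a device name to a "/<job>/<device type>" class string, e.g.
// "/job:worker/replica:0/task:0/device:GPU:0" -> "/worker/GPU". Names where
// ':' was replaced by '_' are normalized first; anything that still fails to
// parse is classified as "Unclassified".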
string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return absl::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}

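// Like GetDeviceClassForNonChannelDevice(), but a channel device name of the
// form "Channel_from_<src>_to_<dst>" is mapped to the classes of both
// endpoints, e.g. "Channel: /worker/GPU -> /worker/CPU".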
string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): channel device name follows the convention we currently have
  // in VirtualScheduler. This should be revised with VirtualScheduler as well
  // as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return absl::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}

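// Summarizes the tensor sizes recorded in the RunMetadata's step stats as
// histograms: always one per device class, and additionally one per
// individual device when `verbosity` is set.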
string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histograms: the aggregated per-device-class histograms are
  // always emitted; if verbosity is set, a histogram is emitted for each
  // individual device as well.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate tensor size from tensor_description's dtype
        // and shape, instead of using optional allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

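// Folds compute, memory, and intermediate memory times into execution_time:
// when compute and memory access can overlap, the slowest component
// dominates (max); otherwise the three components are serialized (sum).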
void CombineCostsAndUpdateExecutionTime(bool compute_memory_overlap,
                                        Costs* costs) {
  if (compute_memory_overlap) {
    costs->execution_time =
        std::max(costs->intermediate_memory_time,
                 std::max(costs->compute_time, costs->memory_time));
  } else {
    costs->execution_time = costs->compute_time + costs->memory_time +
                            costs->intermediate_memory_time;
  }
}
}  // end namespace grappler
}  // end namespace tensorflow