/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_timeline.h"

#include <utility>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
namespace {
int kMaxDisplayedMemNode = 10;

std::string GetTimeDevName(const std::string& dev) {
  if (dev.find("stream") != dev.npos) {
    return absl::StrCat("Op execution threads: ", dev);
  } else {
    return absl::StrCat("Op scheduling threads: ", dev);
  }
}
std::string GetMemoryLaneName(const std::string& dev) {
  return absl::StrCat("mem usage on:", dev);
}
}  // namespace

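// Builds the fields common to every Chrome Trace Event Format record: phase
// ("ph"), category, display name, process id, thread id and timestamp
// (microseconds). A complete ("X") event serialized by Format() looks roughly
// like the following (illustrative values only, not taken from a real trace):
//   {"ph":"X","cat":"Op","name":"MatMul","pid":3,"tid":0,"ts":120,"dur":40}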
Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
                                              const string& category,
                                              const string& name, int64_t pid,
                                              int64_t tid, int64_t ts) {
  Json::Value event(Json::objectValue);
  event["ph"] = Json::Value(ph);
  event["cat"] = Json::Value(category);
  event["name"] = Json::Value(name);
  event["pid"] = Json::Int64(pid);
  event["tid"] = Json::Int64(tid);
  event["ts"] = Json::Int64(ts);
  return event;
}

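// Emits an "M" (metadata) event that assigns a human-readable name to the
// process lane `pid`, so chrome://tracing shows the device's display name
// instead of a bare process number.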
void ChromeTraceFormatter::EmitPID(const string& name, int64_t pid) {
  Json::Value event(Json::objectValue);
  event["name"] = Json::Value("process_name");
  event["ph"] = Json::Value("M");
  event["pid"] = Json::Int64(pid);
  Json::Value args(Json::objectValue);
  args["name"] = Json::Value(name);
  event["args"] = args;
  metadata_.push_back(event);
}

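// Emits an "X" (complete) event: one bar of length `duration` microseconds on
// lane (pid, tid). Note that regions are appended to metadata_ rather than
// events_; Format() concatenates both vectors into the same traceEvents
// array, so this only affects ordering within the output.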
void ChromeTraceFormatter::EmitRegion(int64_t ts, int64_t duration, int64_t pid,
                                      int64_t tid, const string& category,
                                      const string& name, Json::Value args) {
  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
  event["dur"] = Json::Int64(duration);
  event["args"] = std::move(args);
  metadata_.push_back(event);
}

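// Opens a flow arrow with an "s" (flow start) event; chrome://tracing draws
// an arrow from this event to the flow event carrying the same "id", which is
// emitted by EmitFlowEnd below.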
void ChromeTraceFormatter::EmitFlowStart(const string& name, int64_t ts,
                                         int64_t pid, int64_t tid,
                                         int64_t flow_id) {
  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

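// Terminates the flow arrow with a "t" (flow step) event at the consuming op;
// pairing is done through flow_id, which must match the EmitFlowStart call.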
void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64_t ts,
                                       int64_t pid, int64_t tid,
                                       int64_t flow_id) {
  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

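// Emits two "C" (counter) samples at timestamp `ts`: the total allocator
// bytes in use on lane `pid`, and on lane `pid + 1` a breakdown of up to
// kMaxDisplayedMemNode "Top Allocation" entries taken from `tensor_mem`
// (largest allocations first), with the remainder reported as
// "Not Displayed".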
void ChromeTraceFormatter::EmitCounter(
    const string& category, const string& name, int64_t pid, int64_t ts,
    const string& device, int64_t bytes,
    const std::map<int64_t, std::vector<string>>& tensor_mem) {
  Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
  Json::Value args(Json::objectValue);
  args["Allocator Bytes in Use"] = Json::Int64(bytes);
  event["args"] = args;
  events_.push_back(event);

  // TODO(xpan): chrome://tracing is not ideal visualization for memory.
  // It would be great to have a customized UI for it.
  Json::Value event2 =
      CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
  Json::Value args2(Json::objectValue);
  // Need to reserve the same args for all locations.
  for (int i = 1; i < kMaxDisplayedMemNode; ++i) {
    args2[absl::StrFormat("Top Allocation %02d", i)] = Json::Value("N/A");
  }
  int count = 0;
  for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
    for (const string& t : it->second) {
      if (bytes < it->first || count >= kMaxDisplayedMemNode) {
        break;
      }
      args2[absl::StrFormat("Top Allocation %02d", count)] =
          Json::Value(absl::StrCat(it->first / 1000000.0, " MB from ", t));
      ++count;
      bytes -= it->first;
    }
  }
  args2[std::string("Not Displayed")] =
      Json::Value(absl::StrFormat("%.2f MB", bytes / 1000000.0));
  event2["args"] = args2;
  events_.push_back(event2);
}

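// Serializes all buffered metadata and events into a single JSON document of
// the form {"traceEvents": [...]}, which is what chrome://tracing expects,
// and warns on stderr when the result is likely too large to load.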
string ChromeTraceFormatter::Format() {
  Json::Value trace;
  trace["traceEvents"] = Json::Value(Json::arrayValue);
  for (const Json::Value& v : metadata_) {
    trace["traceEvents"].append(v);
  }
  for (const Json::Value& v : events_) {
    trace["traceEvents"].append(v);
  }
  Json::FastWriter writer;
  string trace_str = writer.write(trace);
  if (trace_str.length() > 200 * 1024 * 1024) {
    absl::FPrintF(stderr,
                  "Trace file is over 200MB. Chrome might not be able to "
                  "display it. Consider using filters (e.g. -min_micros "
                  "> 1000 or -op_type .*gpu:0.*) to reduce the size.\n");
  }
  return trace_str;
}

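// Accumulates memory statistics for one trackable node: per-timestamp
// allocation deltas (plus persistent accelerator bytes at time 0) are folded
// into a running total per tensor in dev.tensor_allocs, and the allocator's
// bytes-in-use samples are recorded per device for the memory counter lanes.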
void MemoryTracker::TrackNode(int64_t step, const GraphNode* node) {
  if (!node->Trackable(step)) {
    return;
  }

  Device& dev = devices_[node->node->canonical_device()];

  std::map<int64_t, int64_t> allocs;
  for (const auto& alloc : node->node->allocations(step)) {
    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
  }
  dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
  allocs[0] += node->node->accelerator_persistent_bytes();

  int64_t last = 0;
  std::map<int64_t, int64_t>& aggregate_allocs =
      dev.tensor_allocs[node->name()];
  for (auto it = allocs.begin(); it != allocs.end(); ++it) {
    last += it->second;
    aggregate_allocs[it->first] = last;
  }
  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
    if (bytes_in_use.first <= 0) continue;
    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
  }
}

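// Recursively walks the displayed graph. For every kernel execution of a
// trackable node, lazily creates a Process lane for its device (emitting the
// "M" metadata event that names it) and a TimeNode keyed by start time.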
void Timeline::AllocateTimeNodes(GraphNode* gnode) {
  if (gnode->Trackable(step_)) {
    TrackNode(gnode);
    const TFGraphNode* node = gnode->node;
    for (const auto& kernel_execs : node->op_execs(step_)) {
      const string& device = kernel_execs.first;

      if (process_.find(device) == process_.end()) {
        int64_t pid = AllocatePID();
        process_[device].reset(new Process(device, pid));
        chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
      }
      Process* p = process_[device].get();

      for (const auto& exec : kernel_execs.second) {
        int64_t start_micros = exec.first;
        int64_t exec_micros = exec.second;
        // TODO(xpan): There might be start time duplication here.
        if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
          // TODO(xpan): Give each kernel call a unique_name.
          tnodes_[device][start_micros].reset(
              new TimeNode(p, gnode, start_micros, exec_micros));
        }
      }
    }
  }
  for (GraphNode* n : gnode->show_children) {
    AllocateTimeNodes(n);
  }
}

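// Top-level driver for the graph view: allocate time nodes for all displayed
// nodes, link cross-device producer/consumer pairs as flows, pack ops into
// lanes, emit op regions, flow arrows and per-device memory counters, and
// finally write the trace file.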
void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
  for (GraphNode* gnode : gnodes) {
    AllocateTimeNodes(gnode);
  }
  // To save memory, we only track cross-device (canonical device) flows.
  for (auto& process : tnodes_) {
    if (!IsCanonicalDevice(process.first)) continue;
    for (auto& tn : process.second) {
      TimeNode* tnode = tn.second.get();
      for (GraphNode* inp : tnode->node->children) {
        if (!inp->account || !inp->Trackable(step_)) {
          continue;
        }
        for (const auto& execs : inp->node->cpu_execs(step_)) {
          if (!IsCanonicalDevice(execs.first)) continue;
          if (process.first == execs.first) {
            // Not interested in flow within the same device.
            continue;
          }
          for (const auto& exec : execs.second) {
            int64_t start_micros = exec.first;
            auto cprocess = tnodes_.find(execs.first);
            if (cprocess == tnodes_.end()) continue;
            auto ctn = cprocess->second.find(start_micros);
            if (ctn == cprocess->second.end()) continue;
            ctn->second->next_tnodes.push_back(tnode);
          }
        }
      }
    }
  }

  AllocateLanes();
  absl::FPrintF(stdout, "generating trace file.\n");
  int64_t flow_id = 1;
  for (const auto& process : alloc_nodes_) {
    for (const auto& lane : process.second) {
      for (const auto& node : lane.second) {
        TimeNode* tnode = node.second;

        Json::Value args(Json::objectValue);
        args["name"] = Json::Value(tnode->name());
        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
                                     process.first, lane.first, "Op",
                                     tnode->name(), args);
        // Flow is a directed arrow pointing from src to dst.
        // TODO(xpan): Disable flow to reduce json file size for now. Need
        // to think of a better way to make flow interpretable.
        for (TimeNode* next_tnode : node.second->next_tnodes) {
          chrome_formatter_.EmitFlowStart(
              tnode->name() + "_flow", tnode->start_micros + tnode->exec_micros,
              process.first, lane.first, flow_id);
          chrome_formatter_.EmitFlowEnd(
              tnode->name() + "_flow", next_tnode->start_micros,
              next_tnode->process->pid, next_tnode->tid, flow_id);
          flow_id += 1;
        }
      }
    }
  }
  for (const auto& dev : mem_tracker_.devices()) {
    if (IsPlacedOnCPU(dev.first)) {
      // TODO(xpan): Maybe also support CPU allocator memory tracking.
      continue;
    }
    int64_t pid = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
    int64_t pid2 = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
                              pid2);

    const MemoryTracker::Device& device = dev.second;

    int64_t max_bytes_in_use = 0;
    int64_t cur_bytes_in_use = 0;
    int64_t last_point = 0;
    for (const auto& alloc : device.allocations) {
      cur_bytes_in_use = alloc.second;
      max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
      // Do not plot points too densely, to reduce file size.
      int64_t ts = alloc.first;
      if (ts - last_point < 100) continue;
      last_point = ts;

      std::map<int64_t, std::vector<string>> tensor_mem;
      for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
        const auto& tensor_alloc = tensor_alloc_it.second;
        auto it = tensor_alloc.lower_bound(ts);
        if (it != tensor_alloc.begin()) {
          --it;
        }
        if (it->second > 0) {
          tensor_mem[it->second].push_back(tensor_alloc_it.first);
        }
      }
      chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
                                    dev.first, cur_bytes_in_use, tensor_mem);
    }
    if (IsPlacedOnAccelerator(dev.first)) {
      absl::FPrintF(stdout, "%s peak memory: %.2f MB\n", dev.first,
                    max_bytes_in_use / 1000000.0);
    }
  }
  OutputTimeline();
}

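// The scope and code views draw the aggregated profile tree rather than
// per-kernel events, via EmitTreeNode (defined in tfprof_timeline.h), which
// recursively emits one region per node of the tree.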
void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
  std::set<int64_t> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

void Timeline::GenerateCodeTimeline(const CodeNode* node) {
  std::set<int64_t> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

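// Writes the serialized trace to "<outfile_>_<step>" and tells the user how
// to load it in chrome://tracing; on failure the error is reported to stderr.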
void Timeline::OutputTimeline() {
  std::string outfile = absl::StrFormat("%s_%d", outfile_, step());
  Status s =
      WriteStringToFile(Env::Default(), outfile, chrome_formatter_.Format());
  if (!s.ok()) {
    absl::FPrintF(stderr, "Failed to write timeline file: %s\nError: %s\n",
                  outfile, s.ToString());
    return;
  }
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  absl::FPrintF(stdout,
                "Timeline file is written to %s.\n"
                "Open a Chrome browser, enter URL chrome://tracing and "
                "load the timeline file.",
                outfile);
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  fflush(stdout);
}

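// Greedy lane assignment: each op is placed on the first existing lane of its
// process whose recorded intervals do not conflict with the op's start time;
// if none fits, a new lane is appended. The chosen lane index becomes the
// op's tid in the trace.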
void Timeline::AllocateLanes() {
  for (auto& process : tnodes_) {
    Process* p = process_[process.first].get();
    for (auto& tnode : process.second) {
      int64_t start_time = tnode.second->start_micros;
      int64_t end_time =
          tnode.second->start_micros + tnode.second->exec_micros;
      int64_t l = -1;
      for (int64_t i = 0, end = p->lanes.size(); i < end; ++i) {
        const auto& lane = p->lanes[i];
        l = i;
        for (auto cur_it = lane.rbegin(); cur_it != lane.rend(); ++cur_it) {
          if (cur_it->second > start_time) {
            l = -1;
            break;
          }
          if (start_time > cur_it->second) {
            break;
          }
        }
        if (l >= 0) {
          break;
        }
      }
      if (l < 0) {
        l = p->lanes.size();
        std::map<int64_t, int64_t> nlane;
        nlane[start_time] = end_time;
        p->lanes.push_back(nlane);
      } else {
        p->lanes[l][start_time] = end_time;
      }
      tnode.second->tid = l;
      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
    }
  }
}

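// Hands out monotonically increasing process ids for new trace lanes.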
int64_t Timeline::AllocatePID() {
  int64_t cur_pid = next_pid_;
  next_pid_ += 1;
  return cur_pid;
}

}  // namespace tfprof
}  // namespace tensorflow