/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_timeline.h"

#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
namespace {
// Upper bound on the number of "Top Allocation" rows emitted per memory
// counter event; see ChromeTraceFormatter::EmitCounter below.
constexpr int kMaxDisplayedMemNode = 10;

std::string GetTimeDevName(const std::string& dev) {
  if (dev.find("stream") != dev.npos) {
    return absl::StrCat("Op execution threads: ", dev);
  } else {
    return absl::StrCat("Op scheduling threads: ", dev);
  }
}
std::string GetMemoryLaneName(const std::string& dev) {
  return absl::StrCat("mem usage on: ", dev);
}
}  // namespace

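// Builds the fields shared by every event in the Chrome trace-event JSON
// format consumed by chrome://tracing: the phase ("ph"), category, name,
// process id, thread id, and timestamp (in microseconds). As an
// illustration, a complete duration event produced via EmitRegion
// serializes roughly as
//   {"ph":"X","cat":"Op","name":"MatMul","pid":0,"tid":1,"ts":100,"dur":25,
//    "args":{"name":"MatMul"}}
// (values are made up for the example).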
Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
                                              const string& category,
                                              const string& name, int64_t pid,
                                              int64_t tid, int64_t ts) {
  Json::Value event(Json::objectValue);
  event["ph"] = Json::Value(ph);
  event["cat"] = Json::Value(category);
  event["name"] = Json::Value(name);
  event["pid"] = Json::Int64(pid);
  event["tid"] = Json::Int64(tid);
  event["ts"] = Json::Int64(ts);
  return event;
}

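// Emits a metadata ("M") event that assigns a human-readable name to a
// process id; chrome://tracing shows this name as the header of the row
// group containing all events with the same pid.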
void ChromeTraceFormatter::EmitPID(const string& name, int64_t pid) {
  Json::Value event(Json::objectValue);
  event["name"] = Json::Value("process_name");
  event["ph"] = Json::Value("M");
  event["pid"] = Json::Int64(pid);
  Json::Value args(Json::objectValue);
  args["name"] = Json::Value(name);
  event["args"] = args;
  metadata_.push_back(event);
}

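// Emits a complete duration ("X") event: a box on the timeline starting at
// `ts` and lasting `duration` (both in microseconds), drawn in lane `tid` of
// process `pid`.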
void ChromeTraceFormatter::EmitRegion(int64_t ts, int64_t duration, int64_t pid,
                                      int64_t tid, const string& category,
                                      const string& name, Json::Value args) {
  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
  event["dur"] = Json::Int64(duration);
  event["args"] = std::move(args);
  metadata_.push_back(event);
}

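// Flow events draw an arrow from a producer to a consumer. EmitFlowStart
// emits the source ("s") end and EmitFlowEnd the target ("t") end; the
// trace viewer connects the pair through their shared `flow_id`.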
void ChromeTraceFormatter::EmitFlowStart(const string& name, int64_t ts,
                                         int64_t pid, int64_t tid,
                                         int64_t flow_id) {
  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64_t ts,
                                       int64_t pid, int64_t tid,
                                       int64_t flow_id) {
  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

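// Emits two counter ("C") series for one device at timestamp `ts`: the total
// allocator bytes in use, and a breakdown of the largest individual tensor
// allocations (capped at kMaxDisplayedMemNode entries), with the remainder
// summarized as "Not Displayed".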
void ChromeTraceFormatter::EmitCounter(
    const string& category, const string& name, int64_t pid, int64_t ts,
    const string& device, int64_t bytes,
    const std::map<int64_t, std::vector<string>>& tensor_mem) {
  Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
  Json::Value args(Json::objectValue);
  args["Allocator Bytes in Use"] = Json::Int64(bytes);
  event["args"] = args;
  events_.push_back(event);

  // TODO(xpan): chrome://tracing is not an ideal visualization for memory.
  // It would be great to have a customized UI for it.
  Json::Value event2 =
      CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
  Json::Value args2(Json::objectValue);
  // Need to reserve the same args for all locations.
  for (int i = 1; i < kMaxDisplayedMemNode; ++i) {
    args2[absl::StrFormat("Top Allocation %02d", i)] = Json::Value("N/A");
  }
  int count = 0;
  // Walk tensor_mem from the largest allocation down, deducting each
  // displayed allocation from the remaining byte budget.
  for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
    for (const string& t : it->second) {
      if (bytes < it->first || count >= kMaxDisplayedMemNode) {
        break;
      }
      args2[absl::StrFormat("Top Allocation %02d", count)] =
          Json::Value(absl::StrCat(it->first / 1000000.0, " MB from ", t));
      ++count;
      bytes -= it->first;
    }
  }
  args2["Not Displayed"] =
      Json::Value(absl::StrFormat("%.2f MB", bytes / 1000000.0));
  event2["args"] = args2;
  events_.push_back(event2);
}

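// Serializes all buffered metadata and events into the top-level JSON object
// expected by chrome://tracing, roughly:
//   {"traceEvents": [ <metadata events...>, <timeline events...> ]}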
string ChromeTraceFormatter::Format() {
  Json::Value trace;
  trace["traceEvents"] = Json::Value(Json::arrayValue);
  for (const Json::Value& v : metadata_) {
    trace["traceEvents"].append(v);
  }
  for (const Json::Value& v : events_) {
    trace["traceEvents"].append(v);
  }
  Json::FastWriter writer;
  string trace_str = writer.write(trace);
  if (trace_str.length() > 200 * 1024 * 1024) {
    absl::FPrintF(stderr,
                  "Trace file is over 200MB. Chrome might not be able to "
                  "display it. Consider using filters (e.g. -min_micros "
                  "> 1000 or -op_type .*gpu:0.*) to reduce the size.\n");
  }
  return trace_str;
}

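// Records the memory activity of `node` at `step` on its canonical device:
// per-timestamp allocation deltas are folded into a running per-tensor
// total, and the allocator's bytes-in-use snapshots are kept per timestamp.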
void MemoryTracker::TrackNode(int64_t step, const GraphNode* node) {
  if (!node->Trackable(step)) {
    return;
  }

  Device& dev = devices_[node->node->canonical_device()];

  std::map<int64_t, int64_t> allocs;
  for (const auto& alloc : node->node->allocations(step)) {
    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
  }
  // Attribute persistent accelerator memory to timestamp 0 so it is counted
  // from the start of the trace.
  dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
  allocs[0] += node->node->accelerator_persistent_bytes();

  int64_t last = 0;
  std::map<int64_t, int64_t>& aggregate_allocs =
      dev.tensor_allocs[node->name()];
  for (auto it = allocs.begin(); it != allocs.end(); ++it) {
    last += it->second;
    aggregate_allocs[it->first] = last;
  }
  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
    if (bytes_in_use.first <= 0) continue;
    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
  }
}

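// Recursively creates a TimeNode for every kernel execution of every shown
// node, allocating a trace process (pid) per device the first time the
// device is seen.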
void Timeline::AllocateTimeNodes(GraphNode* gnode) {
  if (gnode->Trackable(step_)) {
    TrackNode(gnode);
    const TFGraphNode* node = gnode->node;
    for (const auto& kernel_execs : node->op_execs(step_)) {
      const string& device = kernel_execs.first;

      if (process_.find(device) == process_.end()) {
        int64_t pid = AllocatePID();
        process_[device].reset(new Process(device, pid));
        chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
      }
      Process* p = process_[device].get();

      for (const auto& exec : kernel_execs.second) {
        int64_t start_micros = exec.first;
        int64_t exec_micros = exec.second;
        // TODO(xpan): There might be start time duplication here.
        if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
          // TODO(xpan): Give each kernel call a unique_name.
          tnodes_[device][start_micros].reset(
              new TimeNode(p, gnode, start_micros, exec_micros));
        }
      }
    }
  }
  for (GraphNode* n : gnode->show_children) {
    AllocateTimeNodes(n);
  }
}

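// Builds the full graph timeline: time nodes for every device, cross-device
// data-flow arrows between producer and consumer kernels, and per-device
// memory counter series, then writes the trace to disk.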
void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
  for (GraphNode* gnode : gnodes) {
    AllocateTimeNodes(gnode);
  }
  // To save memory, we only track cross-device (canonical device) flows.
  for (auto& process : tnodes_) {
    if (!IsCanonicalDevice(process.first)) continue;
    for (auto& tn : process.second) {
      TimeNode* tnode = tn.second.get();
      for (GraphNode* inp : tnode->node->children) {
        if (!inp->account || !inp->Trackable(step_)) {
          continue;
        }
        for (const auto& execs : inp->node->cpu_execs(step_)) {
          if (!IsCanonicalDevice(execs.first)) continue;
          if (process.first == execs.first) {
            // Not interested in flow within the same device.
            continue;
          }
          for (const auto& exec : execs.second) {
            int64_t start_micros = exec.first;
            auto cprocess = tnodes_.find(execs.first);
            if (cprocess == tnodes_.end()) continue;
            auto ctn = cprocess->second.find(start_micros);
            if (ctn == cprocess->second.end()) continue;
            ctn->second->next_tnodes.push_back(tnode);
          }
        }
      }
    }
  }

  AllocateLanes();
  absl::FPrintF(stdout, "Generating trace file.\n");
  int64_t flow_id = 1;
  for (const auto& process : alloc_nodes_) {
    for (const auto& lane : process.second) {
      for (const auto& node : lane.second) {
        TimeNode* tnode = node.second;

        Json::Value args(Json::objectValue);
        args["name"] = Json::Value(tnode->name());
        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
                                     process.first, lane.first, "Op",
                                     tnode->name(), args);
        // Flow is a directed arrow pointing from src to dst.
        // TODO(xpan): Disable flow to reduce json file size for now. Need
        // to think of a better way to make flow interpretable.
        for (TimeNode* next_tnode : node.second->next_tnodes) {
          chrome_formatter_.EmitFlowStart(
              tnode->name() + "_flow", tnode->start_micros + tnode->exec_micros,
              process.first, lane.first, flow_id);
          chrome_formatter_.EmitFlowEnd(
              tnode->name() + "_flow", next_tnode->start_micros,
              next_tnode->process->pid, next_tnode->tid, flow_id);
          flow_id += 1;
        }
      }
    }
  }
  for (const auto& dev : mem_tracker_.devices()) {
    if (IsPlacedOnCPU(dev.first)) {
      // TODO(xpan): Maybe also support CPU allocator memory tracking.
      continue;
    }
    int64_t pid = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
    int64_t pid2 = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
                              pid2);

    const MemoryTracker::Device& device = dev.second;

    int64_t max_bytes_in_use = 0;
    int64_t cur_bytes_in_use = 0;
    int64_t last_point = 0;
    for (const auto& alloc : device.allocations) {
      cur_bytes_in_use = alloc.second;
      max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
      // Skip points less than 100us apart, to keep the file size down.
      int64_t ts = alloc.first;
      if (ts - last_point < 100) continue;
      last_point = ts;

      // For each tracked tensor, find its aggregate allocation at or before
      // `ts`, keyed by size so the largest allocations sort last.
      std::map<int64_t, std::vector<string>> tensor_mem;
      for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
        const auto& tensor_alloc = tensor_alloc_it.second;
        auto it = tensor_alloc.lower_bound(ts);
        if (it != tensor_alloc.begin()) {
          --it;
        }
        if (it->second > 0) {
          tensor_mem[it->second].push_back(tensor_alloc_it.first);
        }
      }
      chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
                                    dev.first, cur_bytes_in_use, tensor_mem);
    }
    if (IsPlacedOnAccelerator(dev.first)) {
      absl::FPrintF(stdout, "%s peak memory: %.2f MB\n", dev.first,
                    max_bytes_in_use / 1000000.0);
    }
  }
  OutputTimeline();
}

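// Scope and code timelines share the same shape: a tree of nodes rendered as
// nested regions spanning each node's total execution time.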
void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
  std::set<int64_t> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

void Timeline::GenerateCodeTimeline(const CodeNode* node) {
  std::set<int64_t> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

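// Writes the formatted trace to "<outfile>_<step>" and tells the user how to
// load it in chrome://tracing.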
void Timeline::OutputTimeline() {
  std::string outfile = absl::StrFormat("%s_%d", outfile_, step());
  Status s =
      WriteStringToFile(Env::Default(), outfile, chrome_formatter_.Format());
  if (!s.ok()) {
    absl::FPrintF(stderr, "Failed to write timeline file: %s\nError: %s\n",
                  outfile, s.ToString());
    return;
  }
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  absl::FPrintF(stdout,
                "Timeline file is written to %s.\n"
                "Open a Chrome browser, enter URL chrome://tracing, and "
                "load the timeline file.",
                outfile);
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  fflush(stdout);
}

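// Packs each device's time nodes into lanes (thread ids) using greedy
// first-fit: a node goes into the first lane whose latest interval ends at
// or before the node's start time, and a new lane is opened only when every
// existing lane overlaps it. For example, regions [0,10) and [5,15) land in
// lanes 0 and 1, while a later [12,20) reuses lane 0.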
void Timeline::AllocateLanes() {
  for (auto& process : tnodes_) {
    Process* p = process_[process.first].get();
    for (auto& tnode : process.second) {
      int64_t start_time = tnode.second->start_micros;
      int64_t end_time = tnode.second->start_micros + tnode.second->exec_micros;
      int64_t l = -1;
      for (int64_t i = 0, end = p->lanes.size(); i < end; ++i) {
        const auto& lane = p->lanes[i];
        l = i;
        // Scan the lane's intervals from latest to earliest; the lane is
        // free if its most recent interval ends at or before start_time.
        for (auto cur_it = lane.rbegin(); cur_it != lane.rend(); ++cur_it) {
          if (cur_it->second > start_time) {
            l = -1;
            break;
          }
          if (start_time > cur_it->second) {
            break;
          }
        }
        if (l >= 0) {
          break;
        }
      }
      if (l < 0) {
        l = p->lanes.size();
        std::map<int64_t, int64_t> nlane;
        nlane[start_time] = end_time;
        p->lanes.push_back(nlane);
      } else {
        p->lanes[l][start_time] = end_time;
      }
      tnode.second->tid = l;
      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
    }
  }
}

int64_t Timeline::AllocatePID() {
  int64_t cur_pid = next_pid_;
  next_pid_ += 1;
  return cur_pid;
}

}  // namespace tfprof
}  // namespace tensorflow