1 #include <unordered_map>
2 #include <unordered_set>
3
4 #include <torch/csrc/profiler/perf-inl.h>
5 #include <torch/csrc/profiler/perf.h>
6
7 namespace torch::profiler::impl::linux_perf {
8
9 #if defined(__ANDROID__) || defined(__linux__)
10
11 /*
12 * PerfEvent
13 * ---------
14 */
15
16 /*
17 * Syscall wrapper for perf_event_open(2)
18 */
// Thin wrapper: glibc does not export a perf_event_open() symbol, so the raw
// syscall is invoked directly. Returns the new perf event fd on success, or
// -1 with errno set on failure (see perf_event_open(2)).
inline long perf_event_open(
    struct perf_event_attr* hw_event,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags) {
  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
27
// TODO sync with Kineto level abstract events in profiler/events.h
// Maps a user-facing event name to the (perf type, config) pair that is
// copied into perf_event_attr.type / .config by PerfEvent::Init().
static const std::unordered_map<
    std::string,
    std::pair<perf_type_id, /* perf event type */ uint32_t>>
    EventTable{
        {"cycles",
         std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)},
        {"instructions",
         std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)},

        // Non Standard events for testing
        {"pagefaults",
         std::make_pair(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)},
        {"backend-stall-cycles",
         std::make_pair(
             PERF_TYPE_HARDWARE,
             PERF_COUNT_HW_STALLED_CYCLES_BACKEND)},
        {"frontend-stall-cycles",
         std::make_pair(
             PERF_TYPE_HARDWARE,
             PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}};
49
~PerfEvent()50 PerfEvent::~PerfEvent() {
51 if (fd_ > -1) {
52 close(fd_);
53 }
54 fd_ = -1; // poison
55 }
56
/*
 * Look up this event's name in EventTable and open a perf counter for it on
 * all CPUs of the current process. The counter starts disabled (it is
 * enabled/disabled explicitly elsewhere, presumably by Enable/Disable —
 * see the attr.disabled flag below). Throws via TORCH_CHECK on an unknown
 * name or if perf_event_open(2) fails.
 */
void PerfEvent::Init() {
  TORCH_CHECK(!name_.empty(), "Invalid profiler event name");

  auto const it = EventTable.find(name_);
  TORCH_CHECK(
      it != EventTable.end(), "Unsupported profiler event name: ", name_);

  // Value-initialization already zeroes every field; perf_event_open(2)
  // requires unused fields to be zero, so no separate memset is needed.
  struct perf_event_attr attr {};

  attr.size = sizeof(perf_event_attr);
  attr.type = it->second.first; // e.g. PERF_TYPE_HARDWARE
  attr.config = it->second.second; // e.g. PERF_COUNT_HW_CPU_CYCLES
  attr.disabled = 1; // created stopped; counting is toggled explicitly
  attr.inherit = 1; // child threads inherit the counter
  attr.exclude_kernel = 1; // TBD
  attr.exclude_hv = 1;
  /*
   * These can be used to calculate estimated totals if the PMU is overcommitted
   * and multiplexing is happening
   */
  attr.read_format =
      PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

  pid_t pid = getpid(); // this pid
  int cpu = -1; // all cpus
  int group_fd = -1;
  unsigned long flags = 0;

  fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags));
  TORCH_CHECK(
      fd_ != -1, "perf_event_open() failed, error: ", std::strerror(errno));
  Reset();
}
94
// Read the current counter value for this event, along with the
// enabled/running times requested via read_format in Init().
uint64_t PerfEvent::ReadCounter() const {
  PerfCounter counter{};
  const auto num_read = read(fd_, &counter, sizeof(PerfCounter));
  TORCH_CHECK(
      num_read == sizeof(counter),
      "Read failed for Perf event fd, event : ",
      name_,
      ", error: ",
      std::strerror(errno));
  // When the PMU multiplexes counters, time_running falls behind
  // time_enabled and the raw value would need scaling; that case is
  // rejected rather than silently mishandled.
  TORCH_CHECK(
      counter.time_enabled == counter.time_running,
      "Hardware performance counter time multiplexing is not handled yet",
      ", name: ",
      name_,
      ", enabled: ",
      counter.time_enabled,
      ", running: ",
      counter.time_running);
  return counter.value;
}
115
116 #else /* __ANDROID__ || __linux__ */
117 /*
118 * Shim class for unsupported platforms - this will always return 0 counter
119 * value
120 */
121
122 PerfEvent::~PerfEvent(){};
123
124 void PerfEvent::Init(){};
125
126 uint64_t PerfEvent::ReadCounter() const {
127 return 0;
128 };
129
130 #endif /* __ANDROID__ || __linux__ */
131
132 /*
133 * PerfProfiler
134 * ------------
135 */
136
/*
 * Validate the requested event names and open a perf counter for each one.
 * Throws via TORCH_CHECK if more than MAX_EVENTS are requested, if a name
 * appears twice, or if an individual event fails to initialize.
 */
void PerfProfiler::Configure(std::vector<std::string>& event_names) {
  TORCH_CHECK(
      event_names.size() <= MAX_EVENTS,
      "Too many events to configure, configured: ",
      event_names.size(),
      ", max allowed:",
      MAX_EVENTS);

  // A set collapses repeated names, so a size mismatch means duplicates.
  std::unordered_set<std::string> s(event_names.begin(), event_names.end());
  TORCH_CHECK(
      s.size() == event_names.size(), "Duplicate event names are not allowed!");

  // const& avoids copying each std::string just to open its event.
  for (const auto& name : event_names) {
    events_.emplace_back(name);
    events_.back().Init();
  }

  // TODO
  // Reset pthreadpool here to make sure we can attach to new children
  // threads
}
156
Enable()157 void PerfProfiler::Enable() {
158 if (!start_values_.empty()) {
159 StopCounting();
160 }
161
162 start_values_.emplace(events_.size(), 0);
163
164 auto& sv = start_values_.top();
165 for (unsigned i = 0; i < events_.size(); ++i) {
166 sv[i] = events_[i].ReadCounter();
167 }
168 StartCounting();
169 }
170
Disable(perf_counters_t & vals)171 void PerfProfiler::Disable(perf_counters_t& vals) {
172 StopCounting();
173 TORCH_CHECK(
174 vals.size() == events_.size(),
175 "Can not fit all perf counters in the supplied container");
176 TORCH_CHECK(
177 !start_values_.empty(), "PerfProfiler must be enabled before disabling");
178
179 /* Always connecting this disable event to the last enable event i.e. using
180 * whatever is on the top of the start counter value stack. */
181 perf_counters_t& sv = start_values_.top();
182 for (unsigned i = 0; i < events_.size(); ++i) {
183 vals[i] = CalcDelta(sv[i], events_[i].ReadCounter());
184 }
185 start_values_.pop();
186
187 // Restore it for a parent
188 if (!start_values_.empty()) {
189 StartCounting();
190 }
191 }
192 } // namespace torch::profiler::impl::linux_perf
193