#include <unordered_map>
#include <unordered_set>

#include <torch/csrc/profiler/perf-inl.h>
#include <torch/csrc/profiler/perf.h>

namespace torch::profiler::impl::linux_perf {

#if defined(__ANDROID__) || defined(__linux__)

/*
 * PerfEvent
 * ---------
 */

/*
 * Syscall wrapper for perf_event_open(2)
 */
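// Note: glibc provides no wrapper for perf_event_open(2), so it has to be
// invoked directly via syscall(2).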
inline long perf_event_open(
    struct perf_event_attr* hw_event,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags) {
  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}

// TODO sync with Kineto level abstract events in profiler/events.h
static const std::unordered_map<
    std::string,
    std::pair<perf_type_id, /* perf event type */ uint32_t>>
    EventTable{
        {"cycles",
         std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)},
        {"instructions",
         std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)},

        // Non-standard events for testing
        {"pagefaults",
         std::make_pair(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)},
        {"backend-stall-cycles",
         std::make_pair(
             PERF_TYPE_HARDWARE,
             PERF_COUNT_HW_STALLED_CYCLES_BACKEND)},
        {"frontend-stall-cycles",
         std::make_pair(
             PERF_TYPE_HARDWARE,
             PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}};
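// Additional events can be exposed by adding a {name, {perf_type_id, config}}
// pair to the table above, using constants from <linux/perf_event.h>.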

PerfEvent::~PerfEvent() {
  if (fd_ > -1) {
    close(fd_);
  }
  fd_ = -1; // poison
}

void PerfEvent::Init() {
  TORCH_CHECK(!name_.empty(), "Invalid profiler event name");

  auto const it = EventTable.find(name_);
  if (it == EventTable.end()) {
    TORCH_CHECK(false, "Unsupported profiler event name: ", name_);
  }

  struct perf_event_attr attr {};
  memset(&attr, 0, sizeof(attr));

  attr.size = sizeof(perf_event_attr);
  attr.type = it->second.first;
  attr.config = it->second.second;
  attr.disabled = 1; // start disabled; counting is toggled explicitly
  attr.inherit = 1; // also count events in child threads/processes
  attr.exclude_kernel = 1; // TBD
  attr.exclude_hv = 1;
  /*
   * These can be used to calculate estimated totals if the PMU is overcommitted
   * and multiplexing is happening
   */
  attr.read_format =
      PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
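  // If the counter was multiplexed (time_running < time_enabled), an estimated
  // total can be derived as value * time_enabled / time_running; note that
  // ReadCounter() below rejects such reads rather than scaling them.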

  pid_t pid = getpid(); // this pid
  int cpu = -1; // all cpus
  int group_fd = -1;
  unsigned long flags = 0;

  fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags));
  if (fd_ == -1) {
    TORCH_CHECK(
        false, "perf_event_open() failed, error: ", std::strerror(errno));
  }
  Reset();
}

uint64_t PerfEvent::ReadCounter() const {
  // With the read_format set in Init(), a read() on a single (non-grouped)
  // event returns {value, time_enabled, time_running}, which PerfCounter
  // mirrors.
  PerfCounter counter{};
  long n = read(fd_, &counter, sizeof(PerfCounter));
  TORCH_CHECK(
      n == sizeof(counter),
      "Read failed for Perf event fd, event: ",
      name_,
      ", error: ",
      std::strerror(errno));
  TORCH_CHECK(
      counter.time_enabled == counter.time_running,
      "Hardware performance counter time multiplexing is not handled yet",
      ", name: ",
      name_,
      ", enabled: ",
      counter.time_enabled,
      ", running: ",
      counter.time_running);
  return counter.value;
}

#else /* __ANDROID__ || __linux__ */
/*
 * Shim class for unsupported platforms - this will always return a 0 counter
 * value
 */

PerfEvent::~PerfEvent() {}

void PerfEvent::Init() {}

uint64_t PerfEvent::ReadCounter() const {
  return 0;
}

#endif /* __ANDROID__ || __linux__ */

/*
 * PerfProfiler
 * ------------
 */

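/*
 * Typical usage (sketch): call Configure() once with the event names, then
 * wrap each measured region in an Enable()/Disable() pair. Disable() writes
 * per-event deltas into the supplied perf_counters_t, which must be sized to
 * the number of configured events; nested Enable()/Disable() pairs are
 * supported via the start-value stack used below.
 */
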
void PerfProfiler::Configure(std::vector<std::string>& event_names) {
  TORCH_CHECK(
      event_names.size() <= MAX_EVENTS,
      "Too many events to configure, configured: ",
      event_names.size(),
      ", max allowed: ",
      MAX_EVENTS);
  std::unordered_set<std::string> s(event_names.begin(), event_names.end());
  TORCH_CHECK(
      s.size() == event_names.size(), "Duplicate event names are not allowed!");
  for (const auto& name : event_names) {
    events_.emplace_back(name);
    events_.back().Init();
  }

  // TODO
  // Reset pthreadpool here to make sure we can attach to new children
  // threads
}

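/*
 * Enable() stops any in-progress counting, pushes a snapshot of the current
 * counter values onto the stack, and restarts counting; the matching
 * Disable() computes deltas against that snapshot and resumes counting only
 * if an enclosing Enable() is still active.
 */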
void PerfProfiler::Enable() {
  if (!start_values_.empty()) {
    StopCounting();
  }

  start_values_.emplace(events_.size(), 0);

  auto& sv = start_values_.top();
  for (unsigned i = 0; i < events_.size(); ++i) {
    sv[i] = events_[i].ReadCounter();
  }
  StartCounting();
}

void PerfProfiler::Disable(perf_counters_t& vals) {
  StopCounting();
  TORCH_CHECK(
      vals.size() == events_.size(),
      "Can not fit all perf counters in the supplied container");
  TORCH_CHECK(
      !start_values_.empty(), "PerfProfiler must be enabled before disabling");

  /* Always connect this disable event to the last enable event, i.e. use
   * whatever is on top of the start counter value stack. */
  perf_counters_t& sv = start_values_.top();
  for (unsigned i = 0; i < events_.size(); ++i) {
    vals[i] = CalcDelta(sv[i], events_[i].ReadCounter());
  }
  start_values_.pop();

  // Resume counting for the parent scope, if any
  if (!start_values_.empty()) {
    StartCounting();
  }
}
} // namespace torch::profiler::impl::linux_perf