1 #include <asm/unistd.h>
2 #include <linux/perf_event.h>
3 #include <sys/ioctl.h>
4 #include <unistd.h>
5 #include <algorithm>
6 #include <cstdint>
7 #include <cstdio>
8 #include <cstdlib>
9 #include <cstring>
10 #include <random>
11
12 #ifndef __aarch64__
13 #error This program is for 64-bit ARM only.
14 #endif
15
16 struct PerfEvent {
17 perf_event_attr pe;
18 int fd = -1;
19
PerfEventPerfEvent20 PerfEvent(std::uint32_t type, std::uint64_t config) {
21 memset(&pe, 0, sizeof(pe));
22 pe.size = sizeof(pe);
23 pe.type = type;
24 pe.config = config;
25 pe.disabled = 1;
26 pe.exclude_kernel = 1;
27 pe.exclude_hv = 1;
28 fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
29 if (fd == -1) {
30 fprintf(stderr, "perf_event_open failed for config 0x%lx\n", config);
31 abort();
32 }
33 }
34
StartPerfEvent35 void Start() {
36 ioctl(fd, PERF_EVENT_IOC_RESET, 0);
37 ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
38 }
39
StopPerfEvent40 std::int64_t Stop() {
41 ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
42 std::int64_t count = 0;
43 read(fd, &count, sizeof(count));
44 return count;
45 }
46
~PerfEventPerfEvent47 ~PerfEvent() { close(fd); }
48 };
49
50 struct ArmPmuEvent : PerfEvent {
51 static constexpr std::uint16_t L1I_CACHE_REFILL = 0x01;
52 static constexpr std::uint16_t L1I_TLB_REFILL = 0x02;
53 static constexpr std::uint16_t L1D_CACHE_REFILL = 0x03;
54 static constexpr std::uint16_t L1D_CACHE = 0x04;
55 static constexpr std::uint16_t L1D_TLB_REFILL = 0x05;
56 static constexpr std::uint16_t LD_RETIRED = 0x06;
57 static constexpr std::uint16_t ST_RETIRED = 0x07;
58 static constexpr std::uint16_t INST_RETIRED = 0x08;
59 static constexpr std::uint16_t EXC_TAKEN = 0x09;
60 static constexpr std::uint16_t EXC_RETURN = 0x0A;
61 static constexpr std::uint16_t CID_WRITE_RETIRED = 0x0B;
62 static constexpr std::uint16_t PC_WRITE_RETIRED = 0x0C;
63 static constexpr std::uint16_t BR_IMMED_RETIRED = 0x0D;
64 static constexpr std::uint16_t BR_RETURN_RETIRED = 0x0E;
65 static constexpr std::uint16_t UNALIGNED_LDST_RETIRED = 0x0F;
66 static constexpr std::uint16_t BR_MIS_PRED = 0x10;
67 static constexpr std::uint16_t CPU_CYCLES = 0x11;
68 static constexpr std::uint16_t BR_PRED = 0x12;
69 static constexpr std::uint16_t MEM_ACCESS = 0x13;
70 static constexpr std::uint16_t L1I_CACHE = 0x14;
71 static constexpr std::uint16_t L1D_CACHE_WB = 0x15;
72 static constexpr std::uint16_t L2D_CACHE = 0x16;
73 static constexpr std::uint16_t L2D_CACHE_REFILL = 0x17;
74 static constexpr std::uint16_t L2D_CACHE_WB = 0x18;
75 static constexpr std::uint16_t BUS_ACCESS = 0x19;
76 static constexpr std::uint16_t MEMORY_ERROR = 0x1A;
77 static constexpr std::uint16_t INST_SPEC = 0x1B;
78 static constexpr std::uint16_t TTBR_WRITE_RETIRED = 0x1C;
79 static constexpr std::uint16_t BUS_CYCLES = 0x1D;
80 static constexpr std::uint16_t CHAIN = 0x1E;
81 static constexpr std::uint16_t L1D_CACHE_ALLOCATE = 0x1F;
82 static constexpr std::uint16_t L2D_CACHE_ALLOCATE = 0x20;
83 static constexpr std::uint16_t BR_RETIRED = 0x21;
84 static constexpr std::uint16_t BR_MIS_PRED_RETIRED = 0x22;
85 static constexpr std::uint16_t STALL_FRONTEND = 0x23;
86 static constexpr std::uint16_t STALL_BACKEND = 0x24;
87 static constexpr std::uint16_t L1D_TLB = 0x25;
88 static constexpr std::uint16_t L1I_TLB = 0x26;
89 static constexpr std::uint16_t L2I_CACHE = 0x27;
90 static constexpr std::uint16_t L2I_CACHE_REFILL = 0x28;
91 static constexpr std::uint16_t L3D_CACHE_ALLOCATE = 0x29;
92 static constexpr std::uint16_t L3D_CACHE_REFILL = 0x2A;
93 static constexpr std::uint16_t L3D_CACHE = 0x2B;
94 static constexpr std::uint16_t L3D_CACHE_WB = 0x2C;
95 static constexpr std::uint16_t L2D_TLB_REFILL = 0x2D;
96 static constexpr std::uint16_t L2I_TLB_REFILL = 0x2E;
97 static constexpr std::uint16_t L2D_TLB = 0x2F;
98 static constexpr std::uint16_t L2I_TLB = 0x30;
99 static constexpr std::uint16_t LL_CACHE = 0x32;
100 static constexpr std::uint16_t LL_CACHE_MISS = 0x33;
101 static constexpr std::uint16_t DTLB_WALK = 0x34;
102 static constexpr std::uint16_t LL_CACHE_RD = 0x36;
103 static constexpr std::uint16_t LL_CACHE_MISS_RD = 0x37;
104 static constexpr std::uint16_t L1D_CACHE_RD = 0x40;
105 static constexpr std::uint16_t L1D_CACHE_REFILL_RD = 0x42;
106 static constexpr std::uint16_t L1D_TLB_REFILL_RD = 0x4C;
107 static constexpr std::uint16_t L1D_TLB_RD = 0x4E;
108 static constexpr std::uint16_t L2D_CACHE_RD = 0x50;
109 static constexpr std::uint16_t L2D_CACHE_REFILL_RD = 0x52;
110 static constexpr std::uint16_t BUS_ACCESS_RD = 0x60;
111 static constexpr std::uint16_t MEM_ACCESS_RD = 0x66;
112 static constexpr std::uint16_t L3D_CACHE_RD = 0xA0;
113 static constexpr std::uint16_t L3D_CACHE_REFILL_RD = 0xA2;
ArmPmuEventArmPmuEvent114 ArmPmuEvent(std::uint16_t number) : PerfEvent(PERF_TYPE_RAW, number) {}
115 };
116
117 struct CacheCounts {
118 int ld_retired = 0;
119 int mem_access = 0;
120 int ll_cache = 0;
121 int ll_cache_miss = 0;
122 int l1d_cache = 0;
123 int l1d_cache_refill = 0;
124 int l2d_cache = 0;
125 int l2d_cache_refill = 0;
126 int l3d_cache = 0;
127 int l3d_cache_refill = 0;
128 };
129
PrintCacheCounts(const CacheCounts & cache_counts)130 void PrintCacheCounts(const CacheCounts& cache_counts) {
131 printf("ld_retired = %d\n", cache_counts.ld_retired);
132 printf("mem_access = %d\n", cache_counts.mem_access);
133 printf("ll_cache = %d\n", cache_counts.ll_cache);
134 printf("ll_cache_miss = %d\n", cache_counts.ll_cache_miss);
135 printf("l1d_cache = %d\n", cache_counts.l1d_cache);
136 printf("l1d_cache_refill = %d\n", cache_counts.l1d_cache_refill);
137 printf("l2d_cache = %d\n", cache_counts.l2d_cache);
138 printf("l2d_cache_refill = %d\n", cache_counts.l2d_cache_refill);
139 printf("l3d_cache = %d\n", cache_counts.l3d_cache);
140 printf("l3d_cache_refill = %d\n", cache_counts.l3d_cache_refill);
141 }
142
Workload(int accesses,int size,std::uint8_t * buf)143 void Workload(int accesses, int size, std::uint8_t* buf) {
144 // The main reason to do this in assembly is an attempt to make sense
145 // of instruction count counters, such as LD_RETIRED.
146 // Also, if we did this in C++, we would need to be watchful of the compiler
147 // optimizing away operations whose result isn't consumed.
148 //
149 // Note that TWO separate tricks are needed here to prevent Cortex-A76
150 // speculative execution om prefetching data from future loop iterations:
151 // 1. A data-dependency whereby the pointers being dereferenced at the
152 // next loop iteration depend on values loaded at the current iteration.
153 // That is the role of 'dummy'.
154 // 2. A pseudo-random sequence. This is the role of register w0,
155 // where we implement a simple xorshift pseudorandom generator.
156 // BOTH of these tricks are needed: if we disable just one of them,
157 // Cortex-A76 successfully speculates some addresses, resulting in different
158 // L3 / DRAM hit percentages on large sizes.
159 std::uint64_t dummy = 123456789;
160 asm volatile(
161 // w0 := xorshift RNG state. Must be nonzero.
162 "mov w0, #1\n"
163 "1:\n"
164 // xorshift RNG iteration: update w0 with the next pseudorandom value
165 // in [1 .. 2^32-1].
166 // This pseudorandomness is crucial to preventing speculative execution
167 // on Cortex-A76 from prefetching data from future loop iterations.
168 "eor w0, w0, w0, lsl #13\n"
169 "eor w0, w0, w0, lsr #17\n"
170 "eor w0, w0, w0, lsl #5\n"
171 // w1 := size - 1 = size mask (size is required to be power-of-two).
172 "sub w1, %w[size], #1\n"
173 // w2 := (pseudorandom value w0) xor (data-dependent sum).
174 "eor w2, w0, %w[dummy]\n"
175 // w1 := w2 modulo size
176 "and w1, w2, w1\n"
177 // align w1
178 "and w1, w1, #-64\n"
179 // load at offset w1, again using x1 as destination.
180 "ldr x1, [%[buf], w1, uxtw]\n"
181 // Update our dummy so it depends on the value we have just loaded.
182 // This data-dependency is key to preventing speculative execution on
183 // Cortex-A76 from prefetching data from future loop iterations.
184 "add %[dummy], %[dummy], w1, uxtw\n"
185 // loop back.
186 "subs %w[accesses], %w[accesses], #1\n"
187 "bne 1b\n"
188 : [ accesses ] "+r"(accesses), [ dummy ] "+r"(dummy)
189 : [ size ] "r"(size), [ buf ] "r"(buf)
190 : "memory", "cc", "x0", "x1", "x2");
191 }
192
MeasureCacheCounts(int accesses,int size,std::uint8_t * buf,CacheCounts * cache_counts)193 void MeasureCacheCounts(int accesses, int size, std::uint8_t* buf,
194 CacheCounts* cache_counts) {
195 const bool only_reads = getenv("ONLY_READS");
196 ArmPmuEvent ld_retired(ArmPmuEvent::LD_RETIRED);
197 ArmPmuEvent mem_access(only_reads ? ArmPmuEvent::MEM_ACCESS_RD
198 : ArmPmuEvent::MEM_ACCESS);
199 ArmPmuEvent ll_cache(only_reads ? ArmPmuEvent::LL_CACHE_RD
200 : ArmPmuEvent::LL_CACHE);
201 ArmPmuEvent ll_cache_miss(only_reads ? ArmPmuEvent::LL_CACHE_MISS_RD
202 : ArmPmuEvent::LL_CACHE_MISS);
203 ArmPmuEvent l1d_cache(only_reads ? ArmPmuEvent::L1D_CACHE_RD
204 : ArmPmuEvent::L1D_CACHE);
205 ArmPmuEvent l1d_cache_refill(only_reads ? ArmPmuEvent::L1D_CACHE_REFILL_RD
206 : ArmPmuEvent::L1D_CACHE_REFILL);
207 ArmPmuEvent l2d_cache(only_reads ? ArmPmuEvent::L2D_CACHE_RD
208 : ArmPmuEvent::L2D_CACHE);
209 ArmPmuEvent l2d_cache_refill(only_reads ? ArmPmuEvent::L2D_CACHE_REFILL_RD
210 : ArmPmuEvent::L2D_CACHE_REFILL);
211 ArmPmuEvent l3d_cache(only_reads ? ArmPmuEvent::L3D_CACHE_RD
212 : ArmPmuEvent::L3D_CACHE);
213 ArmPmuEvent l3d_cache_refill(only_reads ? ArmPmuEvent::L3D_CACHE_REFILL_RD
214 : ArmPmuEvent::L3D_CACHE_REFILL);
215
216 ld_retired.Start();
217 mem_access.Start();
218 ll_cache.Start();
219 ll_cache_miss.Start();
220 l1d_cache.Start();
221 l1d_cache_refill.Start();
222 l2d_cache.Start();
223 l2d_cache_refill.Start();
224 l3d_cache.Start();
225 l3d_cache_refill.Start();
226
227 Workload(accesses, size, buf);
228
229 cache_counts->ld_retired = ld_retired.Stop();
230 cache_counts->mem_access = mem_access.Stop();
231 cache_counts->ll_cache = ll_cache.Stop();
232 cache_counts->ll_cache_miss = ll_cache_miss.Stop();
233 cache_counts->l1d_cache = l1d_cache.Stop();
234 cache_counts->l1d_cache_refill = l1d_cache_refill.Stop();
235 cache_counts->l2d_cache = l2d_cache.Stop();
236 cache_counts->l2d_cache_refill = l2d_cache_refill.Stop();
237 cache_counts->l3d_cache = l3d_cache.Stop();
238 cache_counts->l3d_cache_refill = l3d_cache_refill.Stop();
239 }
240
241 struct PieChart {
242 // How many accesses were recorded, total? The other fields must sum to that.
243 int total;
244 // How many accesses were serviced with the typical cost of a L1 cache hit?
245 int l1_hits;
246 // How many accesses were serviced with the typical cost of a L2 cache hit?
247 int l2_hits;
248 // How many accesses were serviced with the typical cost of a L3 cache hit?
249 int l3_hits;
250 // How many accesses were serviced with the typical cost of a DRAM access?
251 int dram_hits;
252
~PieChartPieChart253 ~PieChart() {
254 // Consistency check
255 if (total != l1_hits + l2_hits + l3_hits + dram_hits) {
256 fprintf(stderr, "inconsistent pie-chart\n");
257 abort();
258 }
259 }
260 };
261
262 struct Hypothesis {
~HypothesisHypothesis263 virtual ~Hypothesis() {}
264 virtual const char* Name() const = 0;
265 virtual void Analyze(const CacheCounts& cache_counts,
266 PieChart* pie) const = 0;
267 };
268
269 struct Hypothesis1 : Hypothesis {
NameHypothesis1270 const char* Name() const override { return "Hypothesis1"; }
AnalyzeHypothesis1271 void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
272 pie->total = cache_counts.l1d_cache + cache_counts.l1d_cache_refill;
273 pie->l1_hits = cache_counts.l1d_cache - cache_counts.l2d_cache_refill -
274 cache_counts.l3d_cache_refill;
275 pie->l2_hits = cache_counts.l1d_cache_refill;
276 pie->l3_hits = cache_counts.l2d_cache_refill;
277 pie->dram_hits = cache_counts.l3d_cache_refill;
278 }
279 };
280
281 struct Hypothesis2 : Hypothesis {
NameHypothesis2282 const char* Name() const override { return "Hypothesis2"; }
AnalyzeHypothesis2283 void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
284 pie->total = cache_counts.l1d_cache;
285 pie->l1_hits = cache_counts.l1d_cache - cache_counts.l2d_cache;
286 pie->l2_hits = cache_counts.l2d_cache - cache_counts.l3d_cache;
287 pie->l3_hits = cache_counts.l3d_cache - cache_counts.l3d_cache_refill;
288 pie->dram_hits = cache_counts.l3d_cache_refill;
289 }
290 };
291
292 struct Hypothesis3 : Hypothesis {
NameHypothesis3293 const char* Name() const override { return "Hypothesis3"; }
AnalyzeHypothesis3294 void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
295 pie->total = cache_counts.l1d_cache;
296 int corrected_l2 = std::min(cache_counts.l2d_cache, cache_counts.l1d_cache);
297 int corrected_l3 = std::min(cache_counts.l3d_cache, corrected_l2);
298 pie->l1_hits = cache_counts.l1d_cache - corrected_l2;
299 pie->l2_hits = corrected_l2 - corrected_l3;
300 pie->l3_hits = corrected_l3 - cache_counts.l3d_cache_refill;
301 pie->dram_hits = cache_counts.l3d_cache_refill;
302 }
303 };
304
305 struct Hypothesis4 : Hypothesis {
NameHypothesis4306 const char* Name() const override { return "Hypothesis4"; }
AnalyzeHypothesis4307 void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
308 pie->total = cache_counts.l1d_cache;
309 pie->l1_hits = cache_counts.l1d_cache - cache_counts.l1d_cache_refill;
310 pie->l2_hits =
311 cache_counts.l1d_cache_refill - cache_counts.l2d_cache_refill;
312 pie->l3_hits =
313 cache_counts.l2d_cache_refill - cache_counts.l3d_cache_refill;
314 pie->dram_hits = cache_counts.l3d_cache_refill;
315 }
316 };
317
318 struct Hypothesis5 : Hypothesis {
NameHypothesis5319 const char* Name() const override { return "Hypothesis5"; }
AnalyzeHypothesis5320 void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
321 pie->l1_hits =
322 std::max(0, cache_counts.l1d_cache - cache_counts.l1d_cache_refill);
323 pie->l2_hits = std::max(
324 0, cache_counts.l1d_cache_refill - cache_counts.l2d_cache_refill);
325 const int l3_misses =
326 std::max(cache_counts.ll_cache_miss, cache_counts.l3d_cache_refill);
327 pie->l3_hits = std::max(0, cache_counts.l2d_cache_refill - l3_misses);
328 pie->dram_hits = l3_misses;
329 pie->total = pie->l1_hits + pie->l2_hits + pie->l3_hits + pie->dram_hits;
330 }
331 };
332
PrintPieChart(const PieChart & pie)333 void PrintPieChart(const PieChart& pie) {
334 printf("total accesses: %d\n", pie.total);
335 double l1_hits_pct = 100. * pie.l1_hits / pie.total;
336 double l2_hits_pct = 100. * pie.l2_hits / pie.total;
337 double l3_hits_pct = 100. * pie.l3_hits / pie.total;
338 double dram_hits_pct = 100. * pie.dram_hits / pie.total;
339 printf("L1 hits: %.2f%%\n", l1_hits_pct);
340 printf("L2 hits: %.2f%%\n", l2_hits_pct);
341 printf("L1/2 hits: %.2f%%\n", l1_hits_pct + l2_hits_pct);
342 printf("L3 hits: %.2f%%\n", l3_hits_pct);
343 printf("L1/2/3 hits: %.2f%%\n", l1_hits_pct + l2_hits_pct + l3_hits_pct);
344 printf("DRAM hits: %.2f%%\n", dram_hits_pct);
345 }
346
PrintPieChartCsvNoNewline(const PieChart & pie)347 void PrintPieChartCsvNoNewline(const PieChart& pie) {
348 double l1_hits_pct = 100. * pie.l1_hits / pie.total;
349 double l2_hits_pct = 100. * pie.l2_hits / pie.total;
350 double l3_hits_pct = 100. * pie.l3_hits / pie.total;
351 double dram_hits_pct = 100. * pie.dram_hits / pie.total;
352 printf("%.2f,%.2f,%.2f,%.2f", l1_hits_pct, l2_hits_pct, l3_hits_pct,
353 dram_hits_pct);
354 }
355
Study(int accesses,int size,std::uint8_t * buf)356 void Study(int accesses, int size, std::uint8_t* buf) {
357 CacheCounts cache_counts;
358 MeasureCacheCounts(accesses, size, buf, &cache_counts);
359 const Hypothesis* hypotheses[] = {
360 new Hypothesis5, new Hypothesis4, new Hypothesis3,
361 new Hypothesis2, new Hypothesis1,
362 };
363 if (getenv("DUMP_CSV")) {
364 printf("%d", size);
365 for (const Hypothesis* hypothesis : hypotheses) {
366 printf(",");
367 PieChart pie;
368 hypothesis->Analyze(cache_counts, &pie);
369 PrintPieChartCsvNoNewline(pie);
370 }
371 printf("\n");
372 } else {
373 printf("\n\n\naccesses=%d, size=%d:\n", accesses, size);
374 printf("\nCache counts:\n");
375 PrintCacheCounts(cache_counts);
376 for (const Hypothesis* hypothesis : hypotheses) {
377 printf("\n%s:\n", hypothesis->Name());
378 PieChart pie;
379 hypothesis->Analyze(cache_counts, &pie);
380 PrintPieChart(pie);
381 }
382 }
383 fflush(stdout);
384 for (const Hypothesis* hypothesis : hypotheses) {
385 delete hypothesis;
386 }
387 }
388
main()389 int main() {
390 const int kMinSize = 1 << 12;
391 const int kMaxSize = 1 << 24;
392 const int kAccesses = 1e8;
393 void* buf_void = nullptr;
394 posix_memalign(&buf_void, 64, kMaxSize);
395 std::uint8_t* buf = static_cast<std::uint8_t*>(buf_void);
396 std::default_random_engine random_engine;
397 for (int i = 0; i < kMaxSize; i++) {
398 buf[i] = random_engine();
399 }
400 for (int size = kMinSize; size <= kMaxSize; size *= 2) {
401 Study(kAccesses, size, buf);
402 }
403 delete[] buf;
404 }
405