1 /*
2 * Copyright 2023 Alyssa Rosenzweig
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "agx_compile.h"
7 #include "agx_compiler.h"
8 #include "agx_opcodes.h"
9
/* Table describing the relationship between register pressure and thread
 * count. Each entry describes a maximum number of registers and the associated
 * best-case thread count.
 *
 * Sorted in ascending order of maximum registers for easy lookup.
 */
16 static const struct agx_occupancy occupancies[] = {
17 {104, 1024}, {112, 896}, {128, 832}, {136, 768}, {144, 704},
18 {160, 640}, {184, 576}, {208, 512}, {232, 448}, {256, 384},
19 };
20
21 struct agx_occupancy
agx_occupancy_for_register_count(unsigned halfregs)22 agx_occupancy_for_register_count(unsigned halfregs)
23 {
24 for (unsigned i = 0; i < ARRAY_SIZE(occupancies); ++i) {
25 unsigned max = occupancies[i].max_registers;
26 assert((i == 0 || max > occupancies[i - 1].max_registers) && "ascending");
27
28 if (halfregs <= max)
29 return occupancies[i];
30 }
31
32 unreachable("Register count must be less than the maximum");
33 }
34
35 unsigned
agx_max_registers_for_occupancy(unsigned occupancy)36 agx_max_registers_for_occupancy(unsigned occupancy)
37 {
38 unsigned max_regs = 0;
39
40 for (unsigned i = 0; i < ARRAY_SIZE(occupancies); ++i) {
41 if (occupancy <= occupancies[i].max_threads)
42 max_regs = occupancies[i].max_registers;
43 else
44 break;
45 }
46
47 assert(max_regs > 0 && "Thread count must be less than the maximum");
48 return max_regs;
49 }
50
51 /* Crude cycle model for G13G */
/*
 * Execution unit an opcode is attributed to by the cycle model.
 *
 * NONE must remain zero: entries of a timing table that are not explicitly
 * initialized are zero-filled and therefore read back as NONE, which the
 * estimator treats as "not modelled". IC is accounted separately from the
 * remaining units (SCIB, F32, F16), which share one accumulator.
 * NOTE(review): the expansions of the unit abbreviations are not recorded
 * here -- confirm against hardware documentation before spelling them out.
 */
enum alu_unit {
   NONE = 0,
   SCIB = 1,
   IC = 2,
   F32 = 3,
   F16 = 4,
};

/*
 * Timing data for a single opcode. Table entries are written positionally as
 * { unit, latency, tp }, so the field order must not change.
 */
struct alu_timing {
   /* Unit the instruction is charged to */
   enum alu_unit unit;

   /* Latency figure; presumably in cycles -- TODO confirm units */
   unsigned latency;

   /* Per-instruction cost summed into the cycle estimate (presumably
    * throughput in cycles -- TODO confirm)
    */
   unsigned tp;
};
65
66 /* clang-format off */
67 struct alu_timing op_timings[] = {
68 [AGX_OPCODE_FMA] = { F32, 2, 1 },
69 [AGX_OPCODE_FADD] = { F32, 2, 1 },
70 [AGX_OPCODE_FMUL] = { F32, 2, 1 },
71
72 [AGX_OPCODE_MOV_IMM] = { SCIB, 1, 1 },
73 [AGX_OPCODE_BITOP] = { SCIB, 2, 1 }, /* tp might be 2 for 32-bit / no $? */
74 [AGX_OPCODE_ICMPSEL] = { SCIB, 2, 1 },
75 [AGX_OPCODE_FCMPSEL] = { SCIB, 2, 1 },
76 [AGX_OPCODE_IADD] = { SCIB, 2, 1 },
77
78 [AGX_OPCODE_GET_SR] = { SCIB, 2, 2 },
79 [AGX_OPCODE_GET_SR_BARRIER] = { SCIB, 2, 2 },
80 [AGX_OPCODE_GET_SR_COVERAGE] = { SCIB, 2, 2 },
81
82 [AGX_OPCODE_IMAD] = { IC, 3, 2 },
83 [AGX_OPCODE_BFI] = { IC, 3, 2 },
84 [AGX_OPCODE_EXTR] = { IC, 3, 2 },
85 [AGX_OPCODE_ASR] = { IC, 3, 2 },
86 [AGX_OPCODE_FLOOR] = { IC, 3, 2 },
87 [AGX_OPCODE_SIN_PT_1] = { IC, 3, 2 },
88 [AGX_OPCODE_SIN_PT_2] = { IC, 5, 2 },
89 [AGX_OPCODE_LOG2] = { IC, 5, 2 },
90 [AGX_OPCODE_EXP2] = { IC, 5, 2 },
91 [AGX_OPCODE_RCP] = { IC, 5, 3 },
92 [AGX_OPCODE_RSQRT] = { IC, 6, 4 },
93 [AGX_OPCODE_SRSQRT] = { IC, 6, 4 },
94
95 // XXX: check this
96 [AGX_OPCODE_SIMD_PREFIX] = { SCIB, 18, 18 },
97 [AGX_OPCODE_SIMD_REDUCE] = { SCIB, 24, 24 },
98 [AGX_OPCODE_QUAD_PREFIX] = { SCIB, 18, 18 },
99 [AGX_OPCODE_QUAD_REDUCE] = { SCIB, 24, 24 },
100 [AGX_OPCODE_SHUFFLE] = { SCIB, 5, 2 },
101 [AGX_OPCODE_QUAD_SHUFFLE] = { SCIB, 5, 2 },
102
103 [AGX_OPCODE_ICMP_BALLOT] = { SCIB, 5, 2 },
104 [AGX_OPCODE_FCMP_BALLOT] = { SCIB, 5, 2 },
105 [AGX_OPCODE_ICMP_QUAD_BALLOT] = { SCIB, 4, 2 },
106 [AGX_OPCODE_FCMP_QUAD_BALLOT] = { SCIB, 4, 2 },
107 };
108 /* clang-format on */
109
110 /*
111 * TODO: Model non-ALU instructions, latency, register cache, 64-bit, etc.
112 */
113 struct agx_cycle_estimate
agx_estimate_cycles(agx_context * ctx)114 agx_estimate_cycles(agx_context *ctx)
115 {
116 struct agx_cycle_estimate est = {0};
117
118 agx_foreach_instr_global(ctx, I) {
119 struct alu_timing alu = I->op < ARRAY_SIZE(op_timings)
120 ? op_timings[I->op]
121 : (struct alu_timing){0};
122
123 if (alu.unit == IC) {
124 est.ic += alu.tp * 2;
125 } else if (alu.unit) {
126 est.f_scib += alu.tp;
127 } else {
128 /* TODO */
129 }
130 }
131
132 /* IC and F/SCIB run in parallel across warps */
133 est.alu = MAX2(est.ic, est.f_scib);
134 return est;
135 }
136