/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "compiler/nir/nir_builder.h"
#include "util/macros.h"
#include "agx_compiler.h"
#include "nir.h"
#include "nir_opcodes.h"

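/* Size and alignment of a def for nir_opt_preamble, in units of 16-bit
 * uniform registers. Sub-16-bit components are padded to a full register.
 */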
static void
def_size(nir_def *def, unsigned *size, unsigned *align)
{
   unsigned bit_size = MAX2(def->bit_size, 16);

   *size = (bit_size * def->num_components) / 16;
   *align = bit_size / 16;
}

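/* Check whether every use of a def is as a floating-point ALU source, i.e.
 * whether float source modifiers could be folded into all users. fmin/fmax
 * are excluded since they lack float modifiers on G13.
 */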
static bool
all_uses_float(nir_def *def)
{
   nir_foreach_use_including_if(use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;

      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
      unsigned src_index = ~0;
      for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
         if (&use_alu->src[i].src == use) {
            src_index = i;
            break;
         }
      }

      assert(src_index != ~0);
      nir_alu_type src_type = nir_alu_type_get_base_type(
         nir_op_infos[use_alu->op].input_types[src_index]);

      if (src_type != nir_type_float)
         return false;

      /* No float modifiers on G13 */
      if (use_alu->op == nir_op_fmax || use_alu->op == nir_op_fmin)
         return false;
   }

   return true;
}

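/* Rough relative cost of an ALU instruction, reflecting which hardware unit
 * it is expected to execute on. Instructions that are likely folded or
 * coalesced away are treated as free.
 */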
static float
alu_cost(nir_alu_instr *alu)
{
   /* TODO: Model 64-bit better */
   if (alu->def.bit_size == 64)
      return 10.0f;

   switch (alu->op) {
   case nir_op_fsat:
   case nir_op_f2fmp:
   case nir_op_f2f16:
   case nir_op_f2f16_rtne:
   case nir_op_fadd:
   case nir_op_fmul:
   case nir_op_ffma:
   case nir_op_iadd:
   case nir_op_inot:
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_feq:
   case nir_op_flt:
   case nir_op_fge:
   case nir_op_fneu:
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ilt:
   case nir_op_ige:
   case nir_op_ult:
   case nir_op_uge:
   case nir_op_fmin:
   case nir_op_fmax:
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
   case nir_op_isub:
   case nir_op_ineg:
   case nir_op_bcsel:
   case nir_op_b2b1:
   case nir_op_b2b8:
   case nir_op_b2b16:
   case nir_op_b2b32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_i2i32:
   case nir_op_i2i16:
   case nir_op_u2u32:
   case nir_op_u2u16:
   case nir_op_u2u8:
   case nir_op_i2i8:
   case nir_op_iadd_sat:
   case nir_op_isub_sat:
   case nir_op_uadd_sat:
   case nir_op_usub_sat:
   case nir_op_iabs:
      /* SCIB */
      return 1.0;

   case nir_op_ffloor:
   case nir_op_fceil:
   case nir_op_ftrunc:
   case nir_op_fround_even:
   case nir_op_bit_count:
   case nir_op_bitfield_reverse:
   case nir_op_ufind_msb:
   case nir_op_imul:
   case nir_op_imadshl_agx:
   case nir_op_imsubshl_agx:
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_extr_agx:
   case nir_op_ubitfield_extract:
   case nir_op_f2i8:
   case nir_op_f2i16:
   case nir_op_f2i32:
   case nir_op_f2u8:
   case nir_op_f2u16:
   case nir_op_f2u32:
   case nir_op_i2fmp:
   case nir_op_i2f16:
   case nir_op_i2f32:
   case nir_op_u2fmp:
   case nir_op_u2f16:
   case nir_op_u2f32:
   case nir_op_interleave_agx:
      /* IC */
      return 4.0;

   case nir_op_frcp:
      /* IC */
      return 6.0;

   case nir_op_frsq:
      /* IC */
      return 8.0;

   case nir_op_fsqrt:
      /* IC + F32 */
      return 8.5;

   case nir_op_imul_high:
   case nir_op_umul_high:
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      /* IC */
      return 8.0;

   case nir_op_fsin_agx:
      /* 2 IC + 1 F32 in parallel */
      return 8.5;

   case nir_op_fneg:
   case nir_op_fabs:
   case nir_op_f2f32:
   case nir_op_unpack_half_2x16_split_x:
   case nir_op_unpack_half_2x16_split_y:
      /* Float source modifiers will be propagated */
      return all_uses_float(&alu->def) ? 0.0 : 1.0;

   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_pack_32_2x16_split:
   case nir_op_pack_64_2x32_split:
   case nir_op_unpack_64_2x32_split_x:
   case nir_op_unpack_64_2x32_split_y:
   case nir_op_unpack_32_2x16_split_x:
   case nir_op_unpack_32_2x16_split_y:
   case nir_op_extract_i8:
   case nir_op_extract_u8:
   case nir_op_extract_i16:
   case nir_op_extract_u16:
      /* We optimistically assume that moves get coalesced */
      return 0.0;

   default:
      /* Shrug */
      return 2.0;
   }
}

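/* Cost estimate for a single instruction, used by nir_opt_preamble to decide
 * what is worth hoisting. Memory and texture accesses are weighted heavily;
 * sysval-like intrinsics are treated as free.
 */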
static float
instr_cost(nir_instr *instr, const void *data)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic:
      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_agx:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_constant_agx:
      case nir_intrinsic_load_ubo:
         return 10.0;
      case nir_intrinsic_ddx:
      case nir_intrinsic_ddx_fine:
      case nir_intrinsic_ddx_coarse:
      case nir_intrinsic_ddy:
      case nir_intrinsic_ddy_fine:
      case nir_intrinsic_ddy_coarse:
         return 1.0;
      default:
         /* Assume it's a sysval or something */
         return 0.0;
      }

   case nir_instr_type_tex:
      /* Texturing involves lots of memory bandwidth */
      return 20.0;

   case nir_instr_type_alu:
      return alu_cost(nir_instr_as_alu(instr));

   default:
      return 1.0;
   }
}

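/* Cost of rewriting a def's uses to read from preamble storage. If a use
 * cannot fold the uniform source directly, we pay for moves: one per 16 bits
 * of data, halved when the moves can be vectorized.
 */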
static float
rewrite_cost(nir_def *def, const void *data)
{
   bool mov_needed = false, vectorizable = true;
   nir_foreach_use(use, def) {
      nir_instr *parent_instr = nir_src_parent_instr(use);
      if (parent_instr->type == nir_instr_type_tex) {
         /* TODO: Maybe check the source index, but biases can be uniform */
         break;
      } else if (parent_instr->type == nir_instr_type_phi) {
         /* Assume we'd eat a move anyway */
      } else if (parent_instr->type != nir_instr_type_alu) {
         mov_needed = true;
         vectorizable = false;
         break;
      } else {
         nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
         if (alu->op == nir_op_vec2 || alu->op == nir_op_vec3 ||
             alu->op == nir_op_vec4) {
            mov_needed = true;
            break;
         } else if (alu->op == nir_op_mov) {
            mov_needed = true;
            vectorizable = false;
         } else {
            /* Assume for non-moves that the const is folded into the src */
         }
      }
   }

   return mov_needed ? ((float)(def->num_components * def->bit_size) /
                        (vectorizable ? 32.0 : 16.0))
                     : 0;
}

static bool
avoid_instr(const nir_instr *instr, const void *data)
{
   const nir_def *def = nir_instr_def((nir_instr *)instr);

   /* Do not move bindless handles, since we need those to retain their
    * constant base index.
    */
   if (def) {
      nir_foreach_use(use, def) {
         if (nir_src_parent_instr(use)->type == nir_instr_type_tex) {
            /* Check if used as a bindless texture handle */
            nir_tex_instr *tex = nir_instr_as_tex(nir_src_parent_instr(use));
            int handle_idx =
               nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);

            if (handle_idx >= 0 && tex->src[handle_idx].src.ssa == def)
               return true;
         } else if (nir_src_parent_instr(use)->type ==
                    nir_instr_type_intrinsic) {
            /* Check if used as a bindless image handle */
            nir_intrinsic_instr *intr =
               nir_instr_as_intrinsic(nir_src_parent_instr(use));

            switch (intr->intrinsic) {
            case nir_intrinsic_bindless_image_load:
            case nir_intrinsic_bindless_image_store:
            case nir_intrinsic_bindless_image_store_block_agx:
               if (intr->src[0].ssa == def)
                  return true;
               break;
            default:
               break;
            }
         }
      }
   }

   return false;
}

static const nir_opt_preamble_options preamble_options = {
   .drawid_uniform = true,
   .subgroup_size_uniform = true,
   /* not supported in hardware */
   .load_workgroup_size_allowed = false,
   .def_size = def_size,
   .instr_cost_cb = instr_cost,
   .rewrite_cost_cb = rewrite_cost,
   .avoid_instr_cb = avoid_instr,

   /* Hardware size is 512, but it's polite to leave some wiggle room to push
    * hot constants so we don't end up rematerializing all over the place.
    * 480 seems to be a sweet spot, based on a few minutes of shader-db.
    */
   .preamble_storage_size = 480,
};

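/* Run nir_opt_preamble with the AGX cost model above. Returns whether any
 * defs were hoisted; *preamble_size receives the amount of preamble storage
 * used, in the 16-bit units defined by def_size.
 */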
bool
agx_nir_opt_preamble(nir_shader *nir, unsigned *preamble_size)
{
   return nir_opt_preamble(nir, &preamble_options, preamble_size);
}