xref: /aosp_15_r20/external/mesa3d/src/asahi/compiler/agx_nir_opt_preamble.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2022 Alyssa Rosenzweig
3  * Copyright 2021 Valve Corporation
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "compiler/nir/nir_builder.h"
8 #include "util/macros.h"
9 #include "agx_compiler.h"
10 #include "nir.h"
11 #include "nir_opcodes.h"
12 
13 static void
def_size(nir_def * def,unsigned * size,unsigned * align)14 def_size(nir_def *def, unsigned *size, unsigned *align)
15 {
16    unsigned bit_size = MAX2(def->bit_size, 16);
17 
18    *size = (bit_size * def->num_components) / 16;
19    *align = bit_size / 16;
20 }
21 
22 static bool
all_uses_float(nir_def * def)23 all_uses_float(nir_def *def)
24 {
25    nir_foreach_use_including_if(use, def) {
26       if (nir_src_is_if(use))
27          return false;
28 
29       nir_instr *use_instr = nir_src_parent_instr(use);
30       if (use_instr->type != nir_instr_type_alu)
31          return false;
32 
33       nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
34       unsigned src_index = ~0;
35       for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
36          if (&use_alu->src[i].src == use) {
37             src_index = i;
38             break;
39          }
40       }
41 
42       assert(src_index != ~0);
43       nir_alu_type src_type = nir_alu_type_get_base_type(
44          nir_op_infos[use_alu->op].input_types[src_index]);
45 
46       if (src_type != nir_type_float)
47          return false;
48 
49       /* No float modifiers on G13 */
50       if (use_alu->op == nir_op_fmax || use_alu->op == nir_op_fmin)
51          return false;
52    }
53 
54    return true;
55 }
56 
/*
 * Estimated cost of an ALU instruction, used by nir_opt_preamble to decide
 * which computations are worth hoisting into the preamble. The tier comments
 * ("SCIB", "IC") name hardware pipelines — presumably short/complex ALU
 * units on AGX; TODO(review): confirm exact pipeline semantics.
 */
static float
alu_cost(nir_alu_instr *alu)
{
   /* TODO: Model 64-bit better */
   if (alu->def.bit_size == 64)
      return 10.0f;

   switch (alu->op) {
   case nir_op_fsat:
   case nir_op_f2fmp:
   case nir_op_f2f16:
   case nir_op_f2f16_rtne:
   case nir_op_fadd:
   case nir_op_fmul:
   case nir_op_ffma:
   case nir_op_iadd:
   case nir_op_inot:
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_feq:
   case nir_op_flt:
   case nir_op_fge:
   case nir_op_fneu:
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ilt:
   case nir_op_ige:
   case nir_op_ult:
   case nir_op_uge:
   case nir_op_fmin:
   case nir_op_fmax:
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
   case nir_op_isub:
   case nir_op_ineg:
   case nir_op_bcsel:
   case nir_op_b2b1:
   case nir_op_b2b8:
   case nir_op_b2b16:
   case nir_op_b2b32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_i2i32:
   case nir_op_i2i16:
   case nir_op_u2u32:
   case nir_op_u2u16:
   case nir_op_u2u8:
   case nir_op_i2i8:
   case nir_op_iadd_sat:
   case nir_op_isub_sat:
   case nir_op_uadd_sat:
   case nir_op_usub_sat:
   case nir_op_iabs:
      /* SCIB: cheapest tier — basic arithmetic, logic, compares, selects,
       * and trivial conversions.
       */
      return 1.0;

   case nir_op_ffloor:
   case nir_op_fceil:
   case nir_op_ftrunc:
   case nir_op_fround_even:
   case nir_op_bit_count:
   case nir_op_bitfield_reverse:
   case nir_op_ufind_msb:
   case nir_op_imul:
   case nir_op_imadshl_agx:
   case nir_op_imsubshl_agx:
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_extr_agx:
   case nir_op_ubitfield_extract:
   case nir_op_f2i8:
   case nir_op_f2i16:
   case nir_op_f2i32:
   case nir_op_f2u8:
   case nir_op_f2u16:
   case nir_op_f2u32:
   case nir_op_i2fmp:
   case nir_op_i2f16:
   case nir_op_i2f32:
   case nir_op_u2fmp:
   case nir_op_u2f16:
   case nir_op_u2f32:
   case nir_op_interleave_agx:
      /* IC: rounding, multiplies, shifts, bitfield ops, log/exp, and
       * int<->float conversions.
       */
      return 4.0;

   case nir_op_frcp:
      /* IC */
      return 6.0;

   case nir_op_frsq:
      /* IC */
      return 8.0;

   case nir_op_fsqrt:
      /* IC + F32 */
      return 8.5;

   case nir_op_imul_high:
   case nir_op_umul_high:
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      /* IC: wide multiplies */
      return 8.0;

   case nir_op_fsin_agx:
      /* 2 IC + 1 F32 in parallel */
      return 8.5;

   case nir_op_fneg:
   case nir_op_fabs:
   case nir_op_f2f32:
   case nir_op_unpack_half_2x16_split_x:
   case nir_op_unpack_half_2x16_split_y:
      /* Float source modifiers will be propagated, so these are free when
       * every use is a float ALU source (see all_uses_float); otherwise they
       * cost a move.
       */
      return all_uses_float(&alu->def) ? 0.0 : 1.0;

   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_pack_32_2x16_split:
   case nir_op_pack_64_2x32_split:
   case nir_op_unpack_64_2x32_split_x:
   case nir_op_unpack_64_2x32_split_y:
   case nir_op_unpack_32_2x16_split_x:
   case nir_op_unpack_32_2x16_split_y:
   case nir_op_extract_i8:
   case nir_op_extract_u8:
   case nir_op_extract_i16:
   case nir_op_extract_u16:
      /* We optimistically assume that moves get coalesced */
      return 0.0;

   default:
      /* Shrug */
      return 2.0;
   }
}
205 
206 static float
instr_cost(nir_instr * instr,const void * data)207 instr_cost(nir_instr *instr, const void *data)
208 {
209    switch (instr->type) {
210    case nir_instr_type_intrinsic:
211       switch (nir_instr_as_intrinsic(instr)->intrinsic) {
212       case nir_intrinsic_load_global:
213       case nir_intrinsic_load_agx:
214       case nir_intrinsic_load_global_constant:
215       case nir_intrinsic_load_constant_agx:
216       case nir_intrinsic_load_ubo:
217          return 10.0;
218       case nir_intrinsic_ddx:
219       case nir_intrinsic_ddx_fine:
220       case nir_intrinsic_ddx_coarse:
221       case nir_intrinsic_ddy:
222       case nir_intrinsic_ddy_fine:
223       case nir_intrinsic_ddy_coarse:
224          return 1.0;
225       default:
226          /* Assume it's a sysval or something */
227          return 0.0;
228       }
229 
230    case nir_instr_type_tex:
231       /* Texturing involes lots of memory bandwidth */
232       return 20.0;
233 
234    case nir_instr_type_alu:
235       return alu_cost(nir_instr_as_alu(instr));
236 
237    default:
238       return 1.0;
239    }
240 }
241 
242 static float
rewrite_cost(nir_def * def,const void * data)243 rewrite_cost(nir_def *def, const void *data)
244 {
245    bool mov_needed = false, vectorizable = true;
246    nir_foreach_use(use, def) {
247       nir_instr *parent_instr = nir_src_parent_instr(use);
248       if (parent_instr->type == nir_instr_type_tex) {
249          /* TODO: Maybe check the source index, but biases can be uniform */
250          break;
251       } else if (parent_instr->type == nir_instr_type_phi) {
252          /* Assume we'd eat a move anyway */
253       } else if (parent_instr->type != nir_instr_type_alu) {
254          mov_needed = true;
255          vectorizable = false;
256          break;
257       } else {
258          nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
259          if (alu->op == nir_op_vec2 || alu->op == nir_op_vec3 ||
260              alu->op == nir_op_vec4) {
261             mov_needed = true;
262             break;
263          } else if (alu->op == nir_op_mov) {
264             mov_needed = true;
265             vectorizable = false;
266          } else {
267             /* Assume for non-moves that the const is folded into the src */
268          }
269       }
270    }
271 
272    return mov_needed ? ((float)(def->num_components * def->bit_size) /
273                         (vectorizable ? 32.0 : 16.0))
274                      : 0;
275 }
276 
277 static bool
avoid_instr(const nir_instr * instr,const void * data)278 avoid_instr(const nir_instr *instr, const void *data)
279 {
280    const nir_def *def = nir_instr_def((nir_instr *)instr);
281 
282    /* Do not move bindless handles, since we need those to retain their
283     * constant base index.
284     */
285    if (def) {
286       nir_foreach_use(use, def) {
287          if (nir_src_parent_instr(use)->type == nir_instr_type_tex) {
288             /* Check if used as a bindless texture handle */
289             nir_tex_instr *tex = nir_instr_as_tex(nir_src_parent_instr(use));
290             int handle_idx =
291                nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
292 
293             if (handle_idx >= 0 && tex->src[handle_idx].src.ssa == def)
294                return true;
295          } else if (nir_src_parent_instr(use)->type ==
296                     nir_instr_type_intrinsic) {
297             /* Check if used as a bindless image handle */
298             nir_intrinsic_instr *intr =
299                nir_instr_as_intrinsic(nir_src_parent_instr(use));
300 
301             switch (intr->intrinsic) {
302             case nir_intrinsic_bindless_image_load:
303             case nir_intrinsic_bindless_image_store:
304             case nir_intrinsic_bindless_image_store_block_agx:
305                if (intr->src[0].ssa == def)
306                   return true;
307                break;
308             default:
309                break;
310             }
311          }
312       }
313    }
314 
315    return false;
316 }
317 
318 static const nir_opt_preamble_options preamble_options = {
319    .drawid_uniform = true,
320    .subgroup_size_uniform = true,
321    /* not supported in hardware */
322    .load_workgroup_size_allowed = false,
323    .def_size = def_size,
324    .instr_cost_cb = instr_cost,
325    .rewrite_cost_cb = rewrite_cost,
326    .avoid_instr_cb = avoid_instr,
327 
328    /* hardware size is 512, but it's polite to leave some wiggle room to push
329     * hot constants so we don't end up rematerializing all over the place.
330     * 480 seems to be a sweetspot, based on a few minutes of shader-db.
331     */
332    .preamble_storage_size = 480,
333 };
334 
335 bool
agx_nir_opt_preamble(nir_shader * nir,unsigned * preamble_size)336 agx_nir_opt_preamble(nir_shader *nir, unsigned *preamble_size)
337 {
338    return nir_opt_preamble(nir, &preamble_options, preamble_size);
339 }
340