1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "ac_shader_util.h"
8 #include "ac_gpu_info.h"
9
10 #include "sid.h"
11 #include "util/u_math.h"
12
13 #include <assert.h>
14 #include <stdlib.h>
15 #include <string.h>
16
17 /* Set NIR options shared by ACO, LLVM, RADV, and radeonsi. */
18 void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
19 nir_shader_compiler_options *options)
20 {
21 /* |---------------------------------- Performance & Availability --------------------------------|
22 * |MAD/MAC/MADAK/MADMK|MAD_LEGACY|MAC_LEGACY| FMA |FMAC/FMAAK/FMAMK|FMA_LEGACY|PK_FMA_F16,|Best choice
23 * Arch | F32,F16,F64 | F32,F16 | F32,F16 |F32,F16,F64 | F32,F16 | F32 |PK_FMAC_F16|F16,F32,F64
24 * ------------------------------------------------------------------------------------------------------------------
25 * gfx6,7 | 1 , - , - | 1 , - | 1 , - |1/4, - ,1/16| - , - | - | - , - | - ,MAD,FMA
26 * gfx8 | 1 , 1 , - | 1 , - | - , - |1/4, 1 ,1/16| - , - | - | - , - |MAD,MAD,FMA
27 * gfx9 | 1 ,1|0, - | 1 , - | - , - | 1 , 1 ,1/16| 0|1, - | - | 2 , - |FMA,MAD,FMA
28 * gfx10 | 1 , - , - | 1 , - | 1 , - | 1 , 1 ,1/16| 1 , 1 | - | 2 , 2 |FMA,MAD,FMA
29 * gfx10.3| - , - , - | - , - | - , - | 1 , 1 ,1/16| 1 , 1 | 1 | 2 , 2 | all FMA
30 * gfx11 | - , - , - | - , - | - , - | 2 , 2 ,1/16| 2 , 2 | 2 | 2 , 2 | all FMA
31 *
32 * Tahiti, Hawaii, Carrizo, Vega20: FMA_F32 is full rate, FMA_F64 is 1/4
33 * gfx9 supports MAD_F16 only on Vega10, Raven, Raven2, Renoir.
34 * gfx9 supports FMAC_F32 only on Vega20, but doesn't support FMAAK and FMAMK.
35 *
36 * gfx8 prefers MAD for F16 because of MAC/MADAK/MADMK.
37 * gfx9 and newer prefer FMA for F16 because of the packed instruction.
38 * gfx10 and older prefer MAD for F32 because of the legacy instruction.
39 */
40
41 memset(options, 0, sizeof(*options));
42 options->vertex_id_zero_based = true;
43 options->lower_scmp = true;
44 options->lower_flrp16 = true;
45 options->lower_flrp32 = true;
46 options->lower_flrp64 = true;
47 options->lower_device_index_to_zero = true;
48 options->lower_fdiv = true;
49 options->lower_fmod = true;
50 options->lower_ineg = true;
51 options->lower_bitfield_insert = true;
52 options->lower_bitfield_extract = true;
53 options->lower_pack_snorm_4x8 = true;
54 options->lower_pack_unorm_4x8 = true;
55 options->lower_pack_half_2x16 = true;
56 options->lower_pack_64_2x32 = true;
57 options->lower_pack_64_4x16 = true;
58 options->lower_pack_32_2x16 = true;
59 options->lower_unpack_snorm_2x16 = true;
60 options->lower_unpack_snorm_4x8 = true;
61 options->lower_unpack_unorm_2x16 = true;
62 options->lower_unpack_unorm_4x8 = true;
63 options->lower_unpack_half_2x16 = true;
64 options->lower_fpow = true;
65 options->lower_mul_2x32_64 = true;
66 options->lower_iadd_sat = info->gfx_level <= GFX8;
67 options->lower_hadd = true;
68 options->lower_mul_32x16 = true;
69 options->has_bfe = true;
70 options->has_bfm = true;
71 options->has_bitfield_select = true;
72 options->has_fneo_fcmpu = true;
73 options->has_ford_funord = true;
74 options->has_fsub = true;
75 options->has_isub = true;
76 options->has_sdot_4x8 = info->has_accelerated_dot_product;
77 options->has_sudot_4x8 = info->has_accelerated_dot_product && info->gfx_level >= GFX11;
78 options->has_udot_4x8 = info->has_accelerated_dot_product;
79 options->has_sdot_4x8_sat = info->has_accelerated_dot_product;
80 options->has_sudot_4x8_sat = info->has_accelerated_dot_product && info->gfx_level >= GFX11;
81 options->has_udot_4x8_sat = info->has_accelerated_dot_product;
82 options->has_dot_2x16 = info->has_accelerated_dot_product && info->gfx_level < GFX11;
83 options->has_find_msb_rev = true;
84 options->has_pack_32_4x8 = true;
85 options->has_pack_half_2x16_rtz = true;
86 options->has_bit_test = !use_llvm;
87 options->has_fmulz = true;
88 options->has_msad = true;
89 options->has_shfr32 = true;
90 options->use_interpolated_input_intrinsics = true;
91 options->lower_int64_options = nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 | nir_lower_divmod64 |
92 nir_lower_minmax64 | nir_lower_iabs64 | nir_lower_iadd_sat64 | nir_lower_conv64;
93 options->divergence_analysis_options = nir_divergence_view_index_uniform;
94 options->optimize_quad_vote_to_reduce = !use_llvm;
95 options->lower_fisnormal = true;
96 options->support_16bit_alu = info->gfx_level >= GFX8;
97 options->vectorize_vec2_16bit = info->has_packed_math_16bit;
98 options->discard_is_demote = true;
99 options->io_options = nir_io_has_flexible_input_interpolation_except_flat |
100 (info->gfx_level >= GFX8 ? nir_io_16bit_input_output_support : 0) |
101 nir_io_prefer_scalar_fs_inputs |
102 nir_io_mix_convergent_flat_with_interpolated |
103 nir_io_vectorizer_ignores_types;
104 options->has_ddx_intrinsics = true;
105 options->scalarize_ddx = true;
106 options->skip_lower_packing_ops =
107 BITFIELD_BIT(nir_lower_packing_op_unpack_64_2x32) |
108 BITFIELD_BIT(nir_lower_packing_op_unpack_64_4x16) |
109 BITFIELD_BIT(nir_lower_packing_op_unpack_32_2x16) |
110 BITFIELD_BIT(nir_lower_packing_op_pack_32_4x8) |
111 BITFIELD_BIT(nir_lower_packing_op_unpack_32_4x8);
112 }
113
114 bool
115 ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
116 unsigned num_components, nir_intrinsic_instr *low,
117 nir_intrinsic_instr *high, void *data)
118 {
119 if (num_components > 4)
120 return false;
121
122 bool is_scratch = false;
123 switch (low->intrinsic) {
124 case nir_intrinsic_load_stack:
125 case nir_intrinsic_load_scratch:
126 case nir_intrinsic_store_stack:
127 case nir_intrinsic_store_scratch:
128 is_scratch = true;
129 break;
130 default:
131 break;
132 }
133
134 /* >128 bit loads are split except with SMEM. On GFX6-8, >32 bit scratch loads are split. */
135 enum amd_gfx_level gfx_level = *(enum amd_gfx_level *)data;
136 if (bit_size * num_components > (is_scratch && gfx_level <= GFX8 ? 32 : 128))
137 return false;
138
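/* Worst-case alignment of the access: a non-zero align_offset limits it to its lowest set bit; otherwise the full align_mul applies. */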
139 uint32_t align;
140 if (align_offset)
141 align = 1 << (ffs(align_offset) - 1);
142 else
143 align = align_mul;
144
145 switch (low->intrinsic) {
146 case nir_intrinsic_load_global:
147 case nir_intrinsic_load_global_constant:
148 case nir_intrinsic_store_global:
149 case nir_intrinsic_store_ssbo:
150 case nir_intrinsic_load_ssbo:
151 case nir_intrinsic_load_ubo:
152 case nir_intrinsic_load_push_constant:
153 case nir_intrinsic_load_stack:
154 case nir_intrinsic_load_scratch:
155 case nir_intrinsic_store_stack:
156 case nir_intrinsic_store_scratch: {
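/* Dword-aligned accesses may use the full vector width; otherwise the total access size is limited to 16 bits (2-byte aligned) or 8 bits (byte aligned). */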
157 unsigned max_components;
158 if (align % 4 == 0)
159 max_components = NIR_MAX_VEC_COMPONENTS;
160 else if (align % 2 == 0)
161 max_components = 16u / bit_size;
162 else
163 max_components = 8u / bit_size;
164 return (align % (bit_size / 8u)) == 0 && num_components <= max_components;
165 }
166 case nir_intrinsic_load_deref:
167 case nir_intrinsic_store_deref:
168 assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared));
169 FALLTHROUGH;
170 case nir_intrinsic_load_shared:
171 case nir_intrinsic_store_shared:
172 if (bit_size * num_components == 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */
173 return align % 16 == 0;
174 } else if (bit_size == 16 && (align % 4)) {
175 /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU
176 * vectorization, because our vectorizer requires the scalar IR to already contain vectors.
177 */
178 return (align % 2 == 0) && num_components <= 2;
179 } else {
180 if (num_components == 3) {
181 /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */
182 return false;
183 }
184 unsigned req = bit_size * num_components;
185 if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */
186 req /= 2u;
187 return align % (req / 8u) == 0;
188 }
189 default:
190 return false;
191 }
192 return false;
193 }
194
195 unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
196 bool writes_mrt0_alpha)
197 {
198 /* If writes_mrt0_alpha is true, one other flag must be true too. */
199 assert(!writes_mrt0_alpha || writes_z || writes_stencil || writes_samplemask);
200
201 if (writes_z || writes_mrt0_alpha) {
202 /* Z needs 32 bits. */
203 if (writes_samplemask || writes_mrt0_alpha)
204 return V_028710_SPI_SHADER_32_ABGR;
205 else if (writes_stencil)
206 return V_028710_SPI_SHADER_32_GR;
207 else
208 return V_028710_SPI_SHADER_32_R;
209 } else if (writes_stencil || writes_samplemask) {
210 /* Both stencil and sample mask need only 16 bits. */
211 return V_028710_SPI_SHADER_UINT16_ABGR;
212 } else {
213 return V_028710_SPI_SHADER_ZERO;
214 }
215 }
216
217 unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format)
218 {
219 unsigned i, cb_shader_mask = 0;
220
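/* Each MRT occupies a 4-bit export-format field; build the matching per-channel (RGBA) write mask. */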
221 for (i = 0; i < 8; i++) {
222 switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
223 case V_028714_SPI_SHADER_ZERO:
224 break;
225 case V_028714_SPI_SHADER_32_R:
226 cb_shader_mask |= 0x1 << (i * 4);
227 break;
228 case V_028714_SPI_SHADER_32_GR:
229 cb_shader_mask |= 0x3 << (i * 4);
230 break;
231 case V_028714_SPI_SHADER_32_AR:
232 cb_shader_mask |= 0x9u << (i * 4);
233 break;
234 case V_028714_SPI_SHADER_FP16_ABGR:
235 case V_028714_SPI_SHADER_UNORM16_ABGR:
236 case V_028714_SPI_SHADER_SNORM16_ABGR:
237 case V_028714_SPI_SHADER_UINT16_ABGR:
238 case V_028714_SPI_SHADER_SINT16_ABGR:
239 case V_028714_SPI_SHADER_32_ABGR:
240 cb_shader_mask |= 0xfu << (i * 4);
241 break;
242 default:
243 assert(0);
244 }
245 }
246 return cb_shader_mask;
247 }
248
249 /**
250 * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a
251 * geometry shader.
252 */
253 uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum amd_gfx_level gfx_level)
254 {
255 unsigned cut_mode;
256
257 assert(gfx_level < GFX11);
258
259 if (gs_max_vert_out <= 128) {
260 cut_mode = V_028A40_GS_CUT_128;
261 } else if (gs_max_vert_out <= 256) {
262 cut_mode = V_028A40_GS_CUT_256;
263 } else if (gs_max_vert_out <= 512) {
264 cut_mode = V_028A40_GS_CUT_512;
265 } else {
266 assert(gs_max_vert_out <= 1024);
267 cut_mode = V_028A40_GS_CUT_1024;
268 }
269
270 return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) |
271 S_028A40_ES_WRITE_OPTIMIZE(gfx_level <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) |
272 S_028A40_ONCHIP(gfx_level >= GFX9 ? 1 : 0);
273 }
274
275 /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
276 /// value for LLVM8+ tbuffer intrinsics.
277 unsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsigned nfmt)
278 {
279 // Some games try to access vertex buffers without a valid format.
280 // This is a game bug, but we should still handle it gracefully.
281 if (dfmt == V_008F0C_GFX10_FORMAT_INVALID)
282 return V_008F0C_GFX10_FORMAT_INVALID;
283
284 if (gfx_level >= GFX11) {
285 switch (dfmt) {
286 default:
287 unreachable("bad dfmt");
288 case V_008F0C_BUF_DATA_FORMAT_INVALID:
289 return V_008F0C_GFX11_FORMAT_INVALID;
290
291 case V_008F0C_BUF_DATA_FORMAT_8:
292 switch (nfmt) {
293 case V_008F0C_BUF_NUM_FORMAT_UNORM:
294 return V_008F0C_GFX11_FORMAT_8_UNORM;
295 case V_008F0C_BUF_NUM_FORMAT_SNORM:
296 return V_008F0C_GFX11_FORMAT_8_SNORM;
297 case V_008F0C_BUF_NUM_FORMAT_USCALED:
298 return V_008F0C_GFX11_FORMAT_8_USCALED;
299 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
300 return V_008F0C_GFX11_FORMAT_8_SSCALED;
301 default:
302 unreachable("bad nfmt");
303 case V_008F0C_BUF_NUM_FORMAT_UINT:
304 return V_008F0C_GFX11_FORMAT_8_UINT;
305 case V_008F0C_BUF_NUM_FORMAT_SINT:
306 return V_008F0C_GFX11_FORMAT_8_SINT;
307 }
308
309 case V_008F0C_BUF_DATA_FORMAT_8_8:
310 switch (nfmt) {
311 case V_008F0C_BUF_NUM_FORMAT_UNORM:
312 return V_008F0C_GFX11_FORMAT_8_8_UNORM;
313 case V_008F0C_BUF_NUM_FORMAT_SNORM:
314 return V_008F0C_GFX11_FORMAT_8_8_SNORM;
315 case V_008F0C_BUF_NUM_FORMAT_USCALED:
316 return V_008F0C_GFX11_FORMAT_8_8_USCALED;
317 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
318 return V_008F0C_GFX11_FORMAT_8_8_SSCALED;
319 default:
320 unreachable("bad nfmt");
321 case V_008F0C_BUF_NUM_FORMAT_UINT:
322 return V_008F0C_GFX11_FORMAT_8_8_UINT;
323 case V_008F0C_BUF_NUM_FORMAT_SINT:
324 return V_008F0C_GFX11_FORMAT_8_8_SINT;
325 }
326
327 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
328 switch (nfmt) {
329 case V_008F0C_BUF_NUM_FORMAT_UNORM:
330 return V_008F0C_GFX11_FORMAT_8_8_8_8_UNORM;
331 case V_008F0C_BUF_NUM_FORMAT_SNORM:
332 return V_008F0C_GFX11_FORMAT_8_8_8_8_SNORM;
333 case V_008F0C_BUF_NUM_FORMAT_USCALED:
334 return V_008F0C_GFX11_FORMAT_8_8_8_8_USCALED;
335 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
336 return V_008F0C_GFX11_FORMAT_8_8_8_8_SSCALED;
337 default:
338 unreachable("bad nfmt");
339 case V_008F0C_BUF_NUM_FORMAT_UINT:
340 return V_008F0C_GFX11_FORMAT_8_8_8_8_UINT;
341 case V_008F0C_BUF_NUM_FORMAT_SINT:
342 return V_008F0C_GFX11_FORMAT_8_8_8_8_SINT;
343 }
344
345 case V_008F0C_BUF_DATA_FORMAT_16:
346 switch (nfmt) {
347 case V_008F0C_BUF_NUM_FORMAT_UNORM:
348 return V_008F0C_GFX11_FORMAT_16_UNORM;
349 case V_008F0C_BUF_NUM_FORMAT_SNORM:
350 return V_008F0C_GFX11_FORMAT_16_SNORM;
351 case V_008F0C_BUF_NUM_FORMAT_USCALED:
352 return V_008F0C_GFX11_FORMAT_16_USCALED;
353 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
354 return V_008F0C_GFX11_FORMAT_16_SSCALED;
355 default:
356 unreachable("bad nfmt");
357 case V_008F0C_BUF_NUM_FORMAT_UINT:
358 return V_008F0C_GFX11_FORMAT_16_UINT;
359 case V_008F0C_BUF_NUM_FORMAT_SINT:
360 return V_008F0C_GFX11_FORMAT_16_SINT;
361 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
362 return V_008F0C_GFX11_FORMAT_16_FLOAT;
363 }
364
365 case V_008F0C_BUF_DATA_FORMAT_16_16:
366 switch (nfmt) {
367 case V_008F0C_BUF_NUM_FORMAT_UNORM:
368 return V_008F0C_GFX11_FORMAT_16_16_UNORM;
369 case V_008F0C_BUF_NUM_FORMAT_SNORM:
370 return V_008F0C_GFX11_FORMAT_16_16_SNORM;
371 case V_008F0C_BUF_NUM_FORMAT_USCALED:
372 return V_008F0C_GFX11_FORMAT_16_16_USCALED;
373 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
374 return V_008F0C_GFX11_FORMAT_16_16_SSCALED;
375 default:
376 unreachable("bad nfmt");
377 case V_008F0C_BUF_NUM_FORMAT_UINT:
378 return V_008F0C_GFX11_FORMAT_16_16_UINT;
379 case V_008F0C_BUF_NUM_FORMAT_SINT:
380 return V_008F0C_GFX11_FORMAT_16_16_SINT;
381 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
382 return V_008F0C_GFX11_FORMAT_16_16_FLOAT;
383 }
384
385 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
386 switch (nfmt) {
387 case V_008F0C_BUF_NUM_FORMAT_UNORM:
388 return V_008F0C_GFX11_FORMAT_16_16_16_16_UNORM;
389 case V_008F0C_BUF_NUM_FORMAT_SNORM:
390 return V_008F0C_GFX11_FORMAT_16_16_16_16_SNORM;
391 case V_008F0C_BUF_NUM_FORMAT_USCALED:
392 return V_008F0C_GFX11_FORMAT_16_16_16_16_USCALED;
393 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
394 return V_008F0C_GFX11_FORMAT_16_16_16_16_SSCALED;
395 default:
396 unreachable("bad nfmt");
397 case V_008F0C_BUF_NUM_FORMAT_UINT:
398 return V_008F0C_GFX11_FORMAT_16_16_16_16_UINT;
399 case V_008F0C_BUF_NUM_FORMAT_SINT:
400 return V_008F0C_GFX11_FORMAT_16_16_16_16_SINT;
401 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
402 return V_008F0C_GFX11_FORMAT_16_16_16_16_FLOAT;
403 }
404
405 case V_008F0C_BUF_DATA_FORMAT_32:
406 switch (nfmt) {
407 default:
408 unreachable("bad nfmt");
409 case V_008F0C_BUF_NUM_FORMAT_UINT:
410 return V_008F0C_GFX11_FORMAT_32_UINT;
411 case V_008F0C_BUF_NUM_FORMAT_SINT:
412 return V_008F0C_GFX11_FORMAT_32_SINT;
413 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
414 return V_008F0C_GFX11_FORMAT_32_FLOAT;
415 }
416
417 case V_008F0C_BUF_DATA_FORMAT_32_32:
418 switch (nfmt) {
419 default:
420 unreachable("bad nfmt");
421 case V_008F0C_BUF_NUM_FORMAT_UINT:
422 return V_008F0C_GFX11_FORMAT_32_32_UINT;
423 case V_008F0C_BUF_NUM_FORMAT_SINT:
424 return V_008F0C_GFX11_FORMAT_32_32_SINT;
425 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
426 return V_008F0C_GFX11_FORMAT_32_32_FLOAT;
427 }
428
429 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
430 switch (nfmt) {
431 default:
432 unreachable("bad nfmt");
433 case V_008F0C_BUF_NUM_FORMAT_UINT:
434 return V_008F0C_GFX11_FORMAT_32_32_32_UINT;
435 case V_008F0C_BUF_NUM_FORMAT_SINT:
436 return V_008F0C_GFX11_FORMAT_32_32_32_SINT;
437 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
438 return V_008F0C_GFX11_FORMAT_32_32_32_FLOAT;
439 }
440
441 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
442 switch (nfmt) {
443 default:
444 unreachable("bad nfmt");
445 case V_008F0C_BUF_NUM_FORMAT_UINT:
446 return V_008F0C_GFX11_FORMAT_32_32_32_32_UINT;
447 case V_008F0C_BUF_NUM_FORMAT_SINT:
448 return V_008F0C_GFX11_FORMAT_32_32_32_32_SINT;
449 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
450 return V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT;
451 }
452
453 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
454 switch (nfmt) {
455 case V_008F0C_BUF_NUM_FORMAT_UNORM:
456 return V_008F0C_GFX11_FORMAT_2_10_10_10_UNORM;
457 case V_008F0C_BUF_NUM_FORMAT_SNORM:
458 return V_008F0C_GFX11_FORMAT_2_10_10_10_SNORM;
459 case V_008F0C_BUF_NUM_FORMAT_USCALED:
460 return V_008F0C_GFX11_FORMAT_2_10_10_10_USCALED;
461 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
462 return V_008F0C_GFX11_FORMAT_2_10_10_10_SSCALED;
463 default:
464 unreachable("bad nfmt");
465 case V_008F0C_BUF_NUM_FORMAT_UINT:
466 return V_008F0C_GFX11_FORMAT_2_10_10_10_UINT;
467 case V_008F0C_BUF_NUM_FORMAT_SINT:
468 return V_008F0C_GFX11_FORMAT_2_10_10_10_SINT;
469 }
470
471 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
472 switch (nfmt) {
473 default:
474 unreachable("bad nfmt");
475 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
476 return V_008F0C_GFX11_FORMAT_10_11_11_FLOAT;
477 }
478 }
479 } else if (gfx_level >= GFX10) {
480 unsigned format;
481 switch (dfmt) {
482 default:
483 unreachable("bad dfmt");
484 case V_008F0C_BUF_DATA_FORMAT_INVALID:
485 format = V_008F0C_GFX10_FORMAT_INVALID;
486 break;
487 case V_008F0C_BUF_DATA_FORMAT_8:
488 format = V_008F0C_GFX10_FORMAT_8_UINT;
489 break;
490 case V_008F0C_BUF_DATA_FORMAT_8_8:
491 format = V_008F0C_GFX10_FORMAT_8_8_UINT;
492 break;
493 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
494 format = V_008F0C_GFX10_FORMAT_8_8_8_8_UINT;
495 break;
496 case V_008F0C_BUF_DATA_FORMAT_16:
497 format = V_008F0C_GFX10_FORMAT_16_UINT;
498 break;
499 case V_008F0C_BUF_DATA_FORMAT_16_16:
500 format = V_008F0C_GFX10_FORMAT_16_16_UINT;
501 break;
502 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
503 format = V_008F0C_GFX10_FORMAT_16_16_16_16_UINT;
504 break;
505 case V_008F0C_BUF_DATA_FORMAT_32:
506 format = V_008F0C_GFX10_FORMAT_32_UINT;
507 break;
508 case V_008F0C_BUF_DATA_FORMAT_32_32:
509 format = V_008F0C_GFX10_FORMAT_32_32_UINT;
510 break;
511 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
512 format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
513 break;
514 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
515 format = V_008F0C_GFX10_FORMAT_32_32_32_32_UINT;
516 break;
517 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
518 format = V_008F0C_GFX10_FORMAT_2_10_10_10_UINT;
519 break;
520 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
521 format = V_008F0C_GFX10_FORMAT_10_11_11_UINT;
522 break;
523 }
524
525 // Use the regularity properties of the combined format enum.
526 //
527 // Note: float is incompatible with 8-bit data formats,
528 // [us]{norm,scaled} are incompatible with 32-bit data formats.
529 // [us]scaled are not writable.
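// For example, relative to the *_UINT entries selected above: 8_UINT - 4 = 8_UNORM, 16_UINT + 1 = 16_SINT, 32_UINT + 2 = 32_FLOAT.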
530 switch (nfmt) {
531 case V_008F0C_BUF_NUM_FORMAT_UNORM:
532 format -= 4;
533 break;
534 case V_008F0C_BUF_NUM_FORMAT_SNORM:
535 format -= 3;
536 break;
537 case V_008F0C_BUF_NUM_FORMAT_USCALED:
538 format -= 2;
539 break;
540 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
541 format -= 1;
542 break;
543 default:
544 unreachable("bad nfmt");
545 case V_008F0C_BUF_NUM_FORMAT_UINT:
546 break;
547 case V_008F0C_BUF_NUM_FORMAT_SINT:
548 format += 1;
549 break;
550 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
551 format += 2;
552 break;
553 }
554
555 return format;
556 } else {
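/* GFX6-9: dfmt and nfmt are separate descriptor fields; pack them as dfmt | (nfmt << 4), matching the HW_FMT() encoding below. */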
557 return dfmt | (nfmt << 4);
558 }
559 }
560
561 static const struct ac_data_format_info data_format_table[] = {
562 [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID},
563 [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8},
564 [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16},
565 [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8},
566 [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32},
567 [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16},
568 [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11},
569 [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10},
570 [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2},
571 [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10},
572 [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8},
573 [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32},
574 [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16},
575 [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32},
576 [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32},
577 };
578
579 const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt)
580 {
581 assert(dfmt < ARRAY_SIZE(data_format_table));
582 return &data_format_table[dfmt];
583 }
584
585 #define DUP2(v) v, v
586 #define DUP3(v) v, v, v
587 #define DUP4(v) v, v, v, v
588
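/* The FMT* macros expand to the has_hw_format bitmask (bit N set means a native HW format exists for N+1 channels) followed by the per-channel-count HW format table. */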
589 #define FMT(dfmt, nfmt) 0xb, {HW_FMT(dfmt, nfmt), HW_FMT(dfmt##_##dfmt, nfmt), HW_FMT_INVALID, HW_FMT(dfmt##_##dfmt##_##dfmt##_##dfmt, nfmt)}
590 #define FMT_32(nfmt) 0xf, {HW_FMT(32, nfmt), HW_FMT(32_32, nfmt), HW_FMT(32_32_32, nfmt), HW_FMT(32_32_32_32, nfmt)}
591 #define FMT_64(nfmt) 0x3, {HW_FMT(32_32, nfmt), HW_FMT(32_32_32_32, nfmt), DUP2(HW_FMT_INVALID)}
592 #define FMTP(dfmt, nfmt) 0xf, {DUP4(HW_FMT(dfmt, nfmt))}
593
594 #define DST_SEL(x, y, z, w) \
595 (S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_##x) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_##y) | \
596 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_##z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_##w))
597
598 #define LIST_NFMT_8_16(nfmt) \
599 [(int)PIPE_FORMAT_R8_##nfmt] = {DST_SEL(X,0,0,1), 1, 1, 1, FMT(8, nfmt)}, \
600 [(int)PIPE_FORMAT_R8G8_##nfmt] = {DST_SEL(X,Y,0,1), 2, 2, 1, FMT(8, nfmt)}, \
601 [(int)PIPE_FORMAT_R8G8B8_##nfmt] = {DST_SEL(X,Y,Z,1), 3, 3, 1, FMT(8, nfmt)}, \
602 [(int)PIPE_FORMAT_B8G8R8_##nfmt] = {DST_SEL(Z,Y,X,1), 3, 3, 1, FMT(8, nfmt)}, \
603 [(int)PIPE_FORMAT_R8G8B8A8_##nfmt] = {DST_SEL(X,Y,Z,W), 4, 4, 1, FMT(8, nfmt)}, \
604 [(int)PIPE_FORMAT_B8G8R8A8_##nfmt] = {DST_SEL(Z,Y,X,W), 4, 4, 1, FMT(8, nfmt)}, \
605 [(int)PIPE_FORMAT_R16_##nfmt] = {DST_SEL(X,0,0,1), 2, 1, 2, FMT(16, nfmt)}, \
606 [(int)PIPE_FORMAT_R16G16_##nfmt] = {DST_SEL(X,Y,0,1), 4, 2, 2, FMT(16, nfmt)}, \
607 [(int)PIPE_FORMAT_R16G16B16_##nfmt] = {DST_SEL(X,Y,Z,1), 6, 3, 2, FMT(16, nfmt)}, \
608 [(int)PIPE_FORMAT_R16G16B16A16_##nfmt] = {DST_SEL(X,Y,Z,W), 8, 4, 2, FMT(16, nfmt)},
609
610 #define LIST_NFMT_32_64(nfmt) \
611 [(int)PIPE_FORMAT_R32_##nfmt] = {DST_SEL(X,0,0,1), 4, 1, 4, FMT_32(nfmt)}, \
612 [(int)PIPE_FORMAT_R32G32_##nfmt] = {DST_SEL(X,Y,0,1), 8, 2, 4, FMT_32(nfmt)}, \
613 [(int)PIPE_FORMAT_R32G32B32_##nfmt] = {DST_SEL(X,Y,Z,1), 12, 3, 4, FMT_32(nfmt)}, \
614 [(int)PIPE_FORMAT_R32G32B32A32_##nfmt] = {DST_SEL(X,Y,Z,W), 16, 4, 4, FMT_32(nfmt)}, \
615 [(int)PIPE_FORMAT_R64_##nfmt] = {DST_SEL(X,Y,0,0), 8, 1, 8, FMT_64(nfmt)}, \
616 [(int)PIPE_FORMAT_R64G64_##nfmt] = {DST_SEL(X,Y,Z,W), 16, 2, 8, FMT_64(nfmt)}, \
617 [(int)PIPE_FORMAT_R64G64B64_##nfmt] = {DST_SEL(X,Y,Z,W), 24, 3, 8, FMT_64(nfmt)}, \
618 [(int)PIPE_FORMAT_R64G64B64A64_##nfmt] = {DST_SEL(X,Y,Z,W), 32, 4, 8, FMT_64(nfmt)}, \
619
620 #define VB_FORMATS \
621 [(int)PIPE_FORMAT_NONE] = {DST_SEL(0,0,0,1), 0, 4, 0, 0xf, {DUP4(HW_FMT_INVALID)}}, \
622 LIST_NFMT_8_16(UNORM) \
623 LIST_NFMT_8_16(SNORM) \
624 LIST_NFMT_8_16(USCALED) \
625 LIST_NFMT_8_16(SSCALED) \
626 LIST_NFMT_8_16(UINT) \
627 LIST_NFMT_8_16(SINT) \
628 LIST_NFMT_32_64(UINT) \
629 LIST_NFMT_32_64(SINT) \
630 LIST_NFMT_32_64(FLOAT) \
631 [(int)PIPE_FORMAT_R16_FLOAT] = {DST_SEL(X,0,0,1), 2, 1, 2, FMT(16, FLOAT)}, \
632 [(int)PIPE_FORMAT_R16G16_FLOAT] = {DST_SEL(X,Y,0,1), 4, 2, 2, FMT(16, FLOAT)}, \
633 [(int)PIPE_FORMAT_R16G16B16_FLOAT] = {DST_SEL(X,Y,Z,1), 6, 3, 2, FMT(16, FLOAT)}, \
634 [(int)PIPE_FORMAT_R16G16B16A16_FLOAT] = {DST_SEL(X,Y,Z,W), 8, 4, 2, FMT(16, FLOAT)}, \
635 [(int)PIPE_FORMAT_B10G10R10A2_UNORM] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, UNORM)}, \
636 [(int)PIPE_FORMAT_B10G10R10A2_SNORM] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, SNORM), \
637 AA(AC_ALPHA_ADJUST_SNORM)}, \
638 [(int)PIPE_FORMAT_B10G10R10A2_USCALED] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, USCALED)}, \
639 [(int)PIPE_FORMAT_B10G10R10A2_SSCALED] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, SSCALED), \
640 AA(AC_ALPHA_ADJUST_SSCALED)}, \
641 [(int)PIPE_FORMAT_B10G10R10A2_UINT] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, UINT)}, \
642 [(int)PIPE_FORMAT_B10G10R10A2_SINT] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, SINT), \
643 AA(AC_ALPHA_ADJUST_SINT)}, \
644 [(int)PIPE_FORMAT_R10G10B10A2_UNORM] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, UNORM)}, \
645 [(int)PIPE_FORMAT_R10G10B10A2_SNORM] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, SNORM), \
646 AA(AC_ALPHA_ADJUST_SNORM)}, \
647 [(int)PIPE_FORMAT_R10G10B10A2_USCALED] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, USCALED)}, \
648 [(int)PIPE_FORMAT_R10G10B10A2_SSCALED] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, SSCALED), \
649 AA(AC_ALPHA_ADJUST_SSCALED)}, \
650 [(int)PIPE_FORMAT_R10G10B10A2_UINT] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, UINT)}, \
651 [(int)PIPE_FORMAT_R10G10B10A2_SINT] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, SINT), \
652 AA(AC_ALPHA_ADJUST_SINT)}, \
653 [(int)PIPE_FORMAT_R11G11B10_FLOAT] = {DST_SEL(X,Y,Z,1), 4, 3, 0, FMTP(10_11_11, FLOAT)}, \
654
655 #define HW_FMT(dfmt, nfmt) (V_008F0C_BUF_DATA_FORMAT_##dfmt | (V_008F0C_BUF_NUM_FORMAT_##nfmt << 4))
656 #define HW_FMT_INVALID (V_008F0C_BUF_DATA_FORMAT_INVALID | (V_008F0C_BUF_NUM_FORMAT_UNORM << 4))
657 #define AA(v) v
658 static const struct ac_vtx_format_info vb_formats_gfx6_alpha_adjust[] = {VB_FORMATS};
659 #undef AA
660
661 #define AA(v) AC_ALPHA_ADJUST_NONE
662 static const struct ac_vtx_format_info vb_formats_gfx6[] = {VB_FORMATS};
663 #undef HW_FMT_INVALID
664 #undef HW_FMT
665
666 #define HW_FMT(dfmt, nfmt) V_008F0C_GFX10_FORMAT_##dfmt##_##nfmt
667 #define HW_FMT_INVALID V_008F0C_GFX10_FORMAT_INVALID
668 static const struct ac_vtx_format_info vb_formats_gfx10[] = {VB_FORMATS};
669 #undef HW_FMT_INVALID
670 #undef HW_FMT
671
672 #define HW_FMT(dfmt, nfmt) V_008F0C_GFX11_FORMAT_##dfmt##_##nfmt
673 #define HW_FMT_INVALID V_008F0C_GFX11_FORMAT_INVALID
674 static const struct ac_vtx_format_info vb_formats_gfx11[] = {VB_FORMATS};
675
676 const struct ac_vtx_format_info *
677 ac_get_vtx_format_info_table(enum amd_gfx_level level, enum radeon_family family)
678 {
679 if (level >= GFX11)
680 return vb_formats_gfx11;
681 else if (level >= GFX10)
682 return vb_formats_gfx10;
683 bool alpha_adjust = level <= GFX8 && family != CHIP_STONEY;
684 return alpha_adjust ? vb_formats_gfx6_alpha_adjust : vb_formats_gfx6;
685 }
686
687 const struct ac_vtx_format_info *
688 ac_get_vtx_format_info(enum amd_gfx_level level, enum radeon_family family, enum pipe_format fmt)
689 {
690 return &ac_get_vtx_format_info_table(level, family)[fmt];
691 }
692
693 /**
694 * Check whether the specified fetch size is safe to use with MTBUF.
695 *
696 * Split typed vertex buffer loads when necessary to avoid any
697 * alignment issues that trigger memory violations and eventually a GPU
698 * hang. This can happen if the stride (static or dynamic) is unaligned and
699 * also if the VBO offset is aligned to a scalar (e.g. stride is 8 and VBO
700 * offset is 2 for R16G16B16A16_SNORM).
701 */
702 static bool
703 is_fetch_size_safe(const enum amd_gfx_level gfx_level, const struct ac_vtx_format_info* vtx_info,
704 const unsigned offset, const unsigned alignment, const unsigned channels)
705 {
706 if (!(vtx_info->has_hw_format & BITFIELD_BIT(channels - 1)))
707 return false;
708
709 unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
710 return (gfx_level >= GFX7 && gfx_level <= GFX9) ||
711 (offset % vertex_byte_size == 0 && MAX2(alignment, 1) % vertex_byte_size == 0);
712 }
713
714 /**
715 * Gets the number of channels that can be safely fetched by MTBUF (typed buffer load)
716 * instructions without triggering alignment-related issues.
717 */
718 unsigned
719 ac_get_safe_fetch_size(const enum amd_gfx_level gfx_level, const struct ac_vtx_format_info* vtx_info,
720 const unsigned offset, const unsigned max_channels, const unsigned alignment,
721 const unsigned num_channels)
722 {
723 /* Packed formats can't be split. */
724 if (!vtx_info->chan_byte_size)
725 return vtx_info->num_channels;
726
727 /* Early exit if the specified number of channels is fine. */
728 if (is_fetch_size_safe(gfx_level, vtx_info, offset, alignment, num_channels))
729 return num_channels;
730
731 /* First, assume that more load instructions are worse and try using a larger data format. */
732 unsigned new_channels = num_channels + 1;
733 while (new_channels <= max_channels &&
734 !is_fetch_size_safe(gfx_level, vtx_info, offset, alignment, new_channels)) {
735 new_channels++;
736 }
737
738 /* Found a feasible load size. */
739 if (new_channels <= max_channels)
740 return new_channels;
741
742 /* Try decreasing load size (at the cost of more load instructions). */
743 new_channels = num_channels;
744 while (new_channels > 1 &&
745 !is_fetch_size_safe(gfx_level, vtx_info, offset, alignment, new_channels)) {
746 new_channels--;
747 }
748
749 return new_channels;
750 }
751
752 enum ac_image_dim ac_get_sampler_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim dim,
753 bool is_array)
754 {
755 switch (dim) {
756 case GLSL_SAMPLER_DIM_1D:
757 if (gfx_level == GFX9)
758 return is_array ? ac_image_2darray : ac_image_2d;
759 return is_array ? ac_image_1darray : ac_image_1d;
760 case GLSL_SAMPLER_DIM_2D:
761 case GLSL_SAMPLER_DIM_RECT:
762 case GLSL_SAMPLER_DIM_EXTERNAL:
763 return is_array ? ac_image_2darray : ac_image_2d;
764 case GLSL_SAMPLER_DIM_3D:
765 return ac_image_3d;
766 case GLSL_SAMPLER_DIM_CUBE:
767 return ac_image_cube;
768 case GLSL_SAMPLER_DIM_MS:
769 return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
770 case GLSL_SAMPLER_DIM_SUBPASS:
771 return ac_image_2darray;
772 case GLSL_SAMPLER_DIM_SUBPASS_MS:
773 return ac_image_2darraymsaa;
774 default:
775 unreachable("bad sampler dim");
776 }
777 }
778
779 enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim sdim,
780 bool is_array)
781 {
782 enum ac_image_dim dim = ac_get_sampler_dim(gfx_level, sdim, is_array);
783
784 /* Match the resource type set in the descriptor. */
785 if (dim == ac_image_cube || (gfx_level <= GFX8 && dim == ac_image_3d))
786 dim = ac_image_2darray;
787 else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && gfx_level == GFX9) {
788 /* When a single layer of a 3D texture is bound, the shader
789 * will refer to a 2D target, but the descriptor has a 3D type.
790 * Since the HW ignores BASE_ARRAY in this case, we need to
791 * send 3 coordinates. This doesn't hurt when the underlying
792 * texture is non-3D.
793 */
794 dim = ac_image_3d;
795 }
796
797 return dim;
798 }
799
800 unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
801 uint8_t *num_fragcoord_components)
802 {
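/* Count input VGPRs in the order the SPI loads them, one group per enabled SPI_PS_INPUT_ADDR field. */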
803 unsigned num_input_vgprs = 0;
804 unsigned fragcoord_components = 0;
805
806 if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
807 num_input_vgprs += 2;
808 if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
809 num_input_vgprs += 2;
810 if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
811 num_input_vgprs += 2;
812 if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
813 num_input_vgprs += 3;
814 if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
815 num_input_vgprs += 2;
816 if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
817 num_input_vgprs += 2;
818 if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
819 num_input_vgprs += 2;
820 if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
821 num_input_vgprs += 1;
822 if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) {
823 num_input_vgprs += 1;
824 fragcoord_components++;
825 }
826 if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) {
827 num_input_vgprs += 1;
828 fragcoord_components++;
829 }
830 if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) {
831 num_input_vgprs += 1;
832 fragcoord_components++;
833 }
834 if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) {
835 num_input_vgprs += 1;
836 fragcoord_components++;
837 }
838 if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr))
839 num_input_vgprs += 1;
840 if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr))
841 num_input_vgprs += 1;
842 if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
843 num_input_vgprs += 1;
844 if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
845 num_input_vgprs += 1;
846
847 if (num_fragcoord_components)
848 *num_fragcoord_components = fragcoord_components;
849
850 return num_input_vgprs;
851 }
852
853 uint16_t ac_get_ps_iter_mask(unsigned ps_iter_samples)
854 {
855 /* The bit pattern matches that used by fixed function fragment
856 * processing.
857 */
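/* Bit i of the returned 16-bit mask is set iff (i % ps_iter_samples) == 0. */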
858 switch (ps_iter_samples) {
859 case 1: return 0xffff;
860 case 2: return 0x5555;
861 case 4: return 0x1111;
862 case 8: return 0x0101;
863 case 16: return 0x0001;
864 default:
865 unreachable("invalid sample count");
866 }
867 }
868
869 void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
870 bool is_depth, bool use_rbplus,
871 struct ac_spi_color_formats *formats)
872 {
873 /* Alpha is needed for alpha-to-coverage.
874 * Blending may be with or without alpha.
875 */
876 unsigned normal = 0; /* most optimal, may not support blending or export alpha */
877 unsigned alpha = 0; /* exports alpha, but may not support blending */
878 unsigned blend = 0; /* supports blending, but may not export alpha */
879 unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
880
881 /* Choose the SPI color formats. These are required values for RB+.
882 * Other chips have multiple choices, though they are not necessarily better.
883 */
884 switch (format) {
885 case V_028C70_COLOR_5_6_5:
886 case V_028C70_COLOR_1_5_5_5:
887 case V_028C70_COLOR_5_5_5_1:
888 case V_028C70_COLOR_4_4_4_4:
889 case V_028C70_COLOR_10_11_11:
890 case V_028C70_COLOR_11_11_10:
891 case V_028C70_COLOR_5_9_9_9:
892 case V_028C70_COLOR_8:
893 case V_028C70_COLOR_8_8:
894 case V_028C70_COLOR_8_8_8_8:
895 case V_028C70_COLOR_10_10_10_2:
896 case V_028C70_COLOR_2_10_10_10:
897 if (ntype == V_028C70_NUMBER_UINT)
898 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
899 else if (ntype == V_028C70_NUMBER_SINT)
900 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
901 else
902 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
903
904 if (!use_rbplus && format == V_028C70_COLOR_8 &&
905 ntype != V_028C70_NUMBER_SRGB && swap == V_028C70_SWAP_STD) /* R */ {
906 /* When RB+ is enabled, R8_UNORM should use FP16_ABGR for 2x
907 * exporting performance. Otherwise, use 32_R to remove useless
908 * instructions needed for 16-bit compressed exports.
909 */
910 blend = normal = V_028714_SPI_SHADER_32_R;
911 }
912 break;
913
914 case V_028C70_COLOR_16:
915 case V_028C70_COLOR_16_16:
916 case V_028C70_COLOR_16_16_16_16:
917 if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
918 /* UNORM16 and SNORM16 don't support blending */
919 if (ntype == V_028C70_NUMBER_UNORM)
920 normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
921 else
922 normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
923
924 /* Use 32 bits per channel for blending. */
925 if (format == V_028C70_COLOR_16) {
926 if (swap == V_028C70_SWAP_STD) { /* R */
927 blend = V_028714_SPI_SHADER_32_R;
928 blend_alpha = V_028714_SPI_SHADER_32_AR;
929 } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
930 blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
931 else
932 assert(0);
933 } else if (format == V_028C70_COLOR_16_16) {
934 if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) { /* RG or GR */
935 blend = V_028714_SPI_SHADER_32_GR;
936 blend_alpha = V_028714_SPI_SHADER_32_ABGR;
937 } else if (swap == V_028C70_SWAP_ALT) /* RA */
938 blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
939 else
940 assert(0);
941 } else /* 16_16_16_16 */
942 blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
943 } else if (ntype == V_028C70_NUMBER_UINT)
944 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
945 else if (ntype == V_028C70_NUMBER_SINT)
946 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
947 else if (ntype == V_028C70_NUMBER_FLOAT)
948 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
949 else
950 assert(0);
951 break;
952
953 case V_028C70_COLOR_32:
954 if (swap == V_028C70_SWAP_STD) { /* R */
955 blend = normal = V_028714_SPI_SHADER_32_R;
956 alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
957 } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
958 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
959 else
960 assert(0);
961 break;
962
963 case V_028C70_COLOR_32_32:
964 if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) { /* RG or GR */
965 blend = normal = V_028714_SPI_SHADER_32_GR;
966 alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
967 } else if (swap == V_028C70_SWAP_ALT) /* RA */
968 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
969 else
970 assert(0);
971 break;
972
973 case V_028C70_COLOR_32_32_32_32:
974 case V_028C70_COLOR_8_24:
975 case V_028C70_COLOR_24_8:
976 case V_028C70_COLOR_X24_8_32_FLOAT:
977 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
978 break;
979
980 default:
981 assert(0);
982 return;
983 }
984
985 /* The DB->CB copy needs 32_ABGR. */
986 if (is_depth)
987 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
988
989 formats->normal = normal;
990 formats->alpha = alpha;
991 formats->blend = blend;
992 formats->blend_alpha = blend_alpha;
993 }
994
995 void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
996 bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask)
997 {
998 *late_alloc_wave64 = 0; /* The limit is per SA. */
999 *cu_mask = 0xffff;
1000
1001 /* This should never be called on gfx12. Gfx12 doesn't need to mask CUs for late alloc. */
1002 assert(info->gfx_level < GFX12);
1003
1004 /* CU masking can decrease performance and cause a hang with <= 2 CUs per SA. */
1005 if (info->min_good_cu_per_sa <= 2)
1006 return;
1007
1008 /* If scratch is used with late alloc, the GPU could deadlock if PS uses scratch too. A more
1009 * complicated computation is needed to enable late alloc with scratch (see PAL).
1010 */
1011 if (uses_scratch)
1012 return;
1013
1014 /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
1015 if (ngg && info->family == CHIP_NAVI14)
1016 return;
1017
1018 if (info->gfx_level >= GFX10) {
1019 /* For Wave32, the hw will launch twice the number of late alloc waves, so 1 == 2x wave32.
1020 * These limits are estimated because they are all safe but they vary in performance.
1021 */
1022 if (ngg_culling)
1023 *late_alloc_wave64 = info->min_good_cu_per_sa * 10;
1024 else if (info->gfx_level >= GFX11)
1025 *late_alloc_wave64 = 63;
1026 else
1027 *late_alloc_wave64 = info->min_good_cu_per_sa * 4;
1028
1029 /* Limit LATE_ALLOC_GS to prevent a hang (hw bug) on gfx10. */
1030 if (info->gfx_level == GFX10 && ngg)
1031 *late_alloc_wave64 = MIN2(*late_alloc_wave64, 64);
1032
1033 /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock.
1034 * Others: CU1 must be disabled to prevent a hw deadlock.
1035 *
1036 * The deadlock is caused by late alloc, which usually increases performance.
1037 */
1038 *cu_mask &= info->gfx_level == GFX10 ? ~BITFIELD_RANGE(2, 2) :
1039 ~BITFIELD_RANGE(1, 1);
1040 } else {
1041 if (info->min_good_cu_per_sa <= 4) {
1042 /* Too few available compute units per SA. Disallowing VS to run on one CU could hurt us
1043 * more than late VS allocation would help.
1044 *
1045 * 2 is the highest safe number that allows us to keep all CUs enabled.
1046 */
1047 *late_alloc_wave64 = 2;
1048 } else {
1049 /* This is a good initial value, allowing 1 late_alloc wave per SIMD on num_cu - 2.
1050 */
1051 *late_alloc_wave64 = (info->min_good_cu_per_sa - 2) * 4;
1052 }
1053
1054 /* VS can't execute on one CU if the limit is > 2. */
1055 if (*late_alloc_wave64 > 2)
1056 *cu_mask = 0xfffe; /* 1 CU disabled */
1057 }
1058
1059 /* Max number that fits into the register field. */
1060 if (ngg) /* GS */
1061 *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(~0u));
1062 else /* VS */
1063 *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u));
1064 }
1065
1066 unsigned ac_compute_cs_workgroup_size(const uint16_t sizes[3], bool variable, unsigned max)
1067 {
1068 if (variable)
1069 return max;
1070
1071 return sizes[0] * sizes[1] * sizes[2];
1072 }
1073
1074 unsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, gl_shader_stage stage,
1075 unsigned tess_num_patches,
1076 unsigned tess_patch_in_vtx,
1077 unsigned tess_patch_out_vtx)
1078 {
1079 /* When tessellation is used, API VS runs on HW LS, API TCS runs on HW HS.
1080 * These two HW stages are merged on GFX9+.
1081 */
1082
1083 bool merged_shaders = gfx_level >= GFX9;
1084 unsigned ls_workgroup_size = tess_num_patches * tess_patch_in_vtx;
1085 unsigned hs_workgroup_size = tess_num_patches * tess_patch_out_vtx;
1086
1087 if (merged_shaders)
1088 return MAX2(ls_workgroup_size, hs_workgroup_size);
1089 else if (stage == MESA_SHADER_VERTEX)
1090 return ls_workgroup_size;
1091 else if (stage == MESA_SHADER_TESS_CTRL)
1092 return hs_workgroup_size;
1093 else
1094 unreachable("invalid LSHS shader stage");
1095 }
1096
1097 unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned wave_size,
1098 unsigned es_verts, unsigned gs_inst_prims)
1099 {
1100 /* ESGS may operate in workgroups if on-chip GS (LDS rings) are enabled.
1101 *
1102 * GFX6: Not possible in the HW.
1103 * GFX7-8 (unmerged): possible in the HW, but not implemented in Mesa.
1104 * GFX9+ (merged): implemented in Mesa.
1105 */
1106
1107 if (gfx_level <= GFX8)
1108 return wave_size;
1109
1110 unsigned workgroup_size = MAX2(es_verts, gs_inst_prims);
1111 return CLAMP(workgroup_size, 1, 256);
1112 }
1113
1114 unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
1115 unsigned max_vtx_out, unsigned prim_amp_factor)
1116 {
1117 /* NGG always operates in workgroups.
1118 *
1119 * For API VS/TES/GS:
1120 * - 1 invocation per input vertex
1121 * - 1 invocation per input primitive
1122 *
1123 * The same invocation can process both an input vertex and primitive,
1124 * however 1 invocation can only output up to 1 vertex and 1 primitive.
1125 */
1126
1127 unsigned max_vtx_in = es_verts < 256 ? es_verts : 3 * gs_inst_prims;
1128 unsigned max_prim_in = gs_inst_prims;
1129 unsigned max_prim_out = gs_inst_prims * prim_amp_factor;
1130 unsigned workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prim_in, max_prim_out);
1131
1132 return CLAMP(workgroup_size, 1, 256);
1133 }
1134
1135 uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
1136 uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
1137 uint32_t lds_per_patch, uint32_t wave_size,
1138 bool tess_uses_primid)
1139 {
1140 /* The VGT HS block increments the patch ID unconditionally
1141 * within a single threadgroup. This results in incorrect
1142 * patch IDs when instanced draws are used.
1143 *
1144 * The intended solution is to restrict threadgroups to
1145 * a single instance by setting SWITCH_ON_EOI, which
1146 * should cause IA to split instances up. However, this
1147 * doesn't work correctly on GFX6 when there is no other
1148 * SE to switch to.
1149 */
1150 const bool has_primid_instancing_bug = info->gfx_level == GFX6 && info->max_se == 1;
1151 if (has_primid_instancing_bug && tess_uses_primid)
1152 return 1;
1153
1154 /* Ensure that we only need 4 waves per CU, so that we don't need to check
1155 * resource usage (such as whether we have enough VGPRs to fit the whole
1156 * threadgroup into the CU). It also ensures that the number of tcs in and out
1157 * vertices per threadgroup are at most 256, which is the hw limit.
1158 */
1159 const unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
1160 unsigned num_patches = 256 / max_verts_per_patch;
1161
1162 /* Not necessary for correctness, but higher numbers are slower.
1163 * The hardware can do more, but we prefer fully occupied waves.
1164 * e.g. 64 triangle patches means 3 fully occupied Wave64 waves.
1165 */
1166 num_patches = MIN2(num_patches, 64);
1167
1168 /* When distributed tessellation is unsupported, switch between SEs
1169 * at a higher frequency to manually balance the workload between SEs.
1170 */
1171 if (!info->has_distributed_tess && info->max_se > 1)
1172 num_patches = MIN2(num_patches, 16); /* recommended */
1173
1174 /* Make sure the output data fits in the offchip buffer */
1175 if (vram_per_patch) {
1176 const uint32_t tess_offchip_block_dw_size = info->family == CHIP_HAWAII ? 4096 : 8192;
1177 num_patches =
1178 MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_per_patch);
1179 }
1180
1181 /* Make sure that the data fits in LDS. This assumes the shaders only
1182 * use LDS for the inputs and outputs.
1183 */
1184 if (lds_per_patch) {
1185 ASSERTED const unsigned max_lds_size = info->gfx_level >= GFX9 ? 64 * 1024 : 32 * 1024; /* hw limit */
1186 const unsigned target_lds_size = max_lds_size / 2; /* target at least 2 workgroups per CU */
1187 num_patches = MIN2(num_patches, target_lds_size / lds_per_patch);
1188 assert(num_patches * lds_per_patch <= max_lds_size);
1189 }
1190 num_patches = MAX2(num_patches, 1);
1191
1192 /* Make sure that vector lanes are fully occupied by cutting off the last wave
1193 * if it's only partially filled.
1194 */
1195 const unsigned temp_verts_per_tg = num_patches * max_verts_per_patch;
1196
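/* Only round num_patches down when the partial last wave would leave at least MAX2(max_verts_per_patch, 8) lanes unused. */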
1197 if (temp_verts_per_tg > wave_size &&
1198 (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))
1199 num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
1200
1201 if (info->gfx_level == GFX6) {
1202 /* GFX6 bug workaround, related to power management. Limit LS-HS
1203 * threadgroups to only one wave.
1204 */
1205 const unsigned one_wave = wave_size / max_verts_per_patch;
1206 num_patches = MIN2(num_patches, one_wave);
1207 }
1208
1209 return num_patches;
1210 }
1211
1212 uint32_t
1213 ac_compute_tess_lds_size(const struct radeon_info *info, uint32_t lds_per_patch, uint32_t num_patches)
1214 {
1215 const unsigned lds_size = lds_per_patch * num_patches;
1216
1217 assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
1218
1219 return align(lds_size, info->lds_encode_granularity) / info->lds_encode_granularity;
1220 }
1221
1222 uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
1223 const struct radeon_info *info)
1224 {
1225 /* Register field position and mask. */
1226 uint32_t cu_en_mask = ~clear_mask;
1227 unsigned cu_en_shift = ffs(cu_en_mask) - 1;
1228 /* The value being set. */
1229 uint32_t cu_en = (value & cu_en_mask) >> cu_en_shift;
1230
1231 uint32_t set_cu_en = info->spi_cu_en;
1232
1233 if (info->gfx_level >= GFX12 && clear_mask == 0) {
1234 /* The CU mask has 32 bits and is per SE, not per SA. This math doesn't work with
1235 * asymmetric WGP harvesting because SA0 doesn't always end on the same bit.
1236 */
1237 set_cu_en &= BITFIELD_MASK(info->max_good_cu_per_sa);
1238 set_cu_en |= set_cu_en << info->max_good_cu_per_sa;
1239 }
1240
1241 /* AND the field by spi_cu_en. */
1242 uint32_t spi_cu_en = info->spi_cu_en >> value_shift;
1243 return (value & ~cu_en_mask) |
1244 (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
1245 }
1246
1247 /* Return the register value and tune bytes_per_wave to increase scratch performance. */
1248 void ac_get_scratch_tmpring_size(const struct radeon_info *info,
1249 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
1250 uint32_t *tmpring_size)
1251 {
1252 /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
1253 * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
1254 * Thus, WAVESIZE must be constant while the scratch buffer is being used by the GPU.
1255 *
1256 * If you want to increase WAVESIZE without waiting for idle, you need to allocate a new
1257 * scratch buffer and use it instead. This will result in multiple scratch buffers being
1258 * used at the same time, each with a different WAVESIZE.
1259 *
1260 * If you want to decrease WAVESIZE, you don't have to. There is no advantage in decreasing
1261 * WAVESIZE after it's been increased.
1262 *
1263 * Shaders with SCRATCH_EN=0 don't allocate scratch space.
1264 */
1265 const unsigned size_shift = info->gfx_level >= GFX11 ? 8 : 10;
1266 const unsigned min_size_per_wave = BITFIELD_BIT(size_shift);
1267
1268 /* The LLVM shader backend should be reporting aligned scratch_sizes. */
1269 assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 &&
1270 "scratch size per wave should be aligned");
1271
1272 /* Add 1 scratch item to make the number of items odd. This should improve scratch
1273 * performance by more randomly distributing scratch waves among memory channels.
1274 */
1275 if (bytes_per_wave)
1276 bytes_per_wave |= min_size_per_wave;
1277
1278 *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
1279
1280 unsigned max_scratch_waves = info->max_scratch_waves;
1281 if (info->gfx_level >= GFX11)
1282 max_scratch_waves /= info->max_se; /* WAVES is per SE */
1283
1284 /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
1285 *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
1286 S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
1287 }
1288
1289 /* Get chip-agnostic memory instruction access flags (as opposed to chip-specific GLC/DLC/SLC)
1290 * from a NIR memory intrinsic.
1291 */
1292 enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr)
1293 {
1294 enum gl_access_qualifier access =
1295 nir_intrinsic_has_access(instr) ? nir_intrinsic_access(instr) : 0;
1296
1297 /* Determine ACCESS_MAY_STORE_SUBDWORD. (for the GFX6 TC L1 bug workaround) */
1298 if (!nir_intrinsic_infos[instr->intrinsic].has_dest) {
1299 switch (instr->intrinsic) {
1300 case nir_intrinsic_bindless_image_store:
1301 access |= ACCESS_MAY_STORE_SUBDWORD;
1302 break;
1303
1304 case nir_intrinsic_store_ssbo:
1305 case nir_intrinsic_store_buffer_amd:
1306 case nir_intrinsic_store_global:
1307 case nir_intrinsic_store_global_amd:
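/* Format stores, stores with sub-dword alignment, and stores whose total size is not a multiple of 4 bytes may write individual bytes or shorts. */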
1308 if (access & ACCESS_USES_FORMAT_AMD ||
1309 (nir_intrinsic_has_align_offset(instr) && nir_intrinsic_align(instr) % 4 != 0) ||
1310 ((instr->src[0].ssa->bit_size / 8) * instr->src[0].ssa->num_components) % 4 != 0)
1311 access |= ACCESS_MAY_STORE_SUBDWORD;
1312 break;
1313
1314 default:
1315 unreachable("unexpected store instruction");
1316 }
1317 }
1318
1319 return access;
1320 }
1321
1322 /* Convert chip-agnostic memory access flags into hw-specific cache flags.
1323 *
1324 * "access" must be a result of ac_get_mem_access_flags() with the appropriate ACCESS_TYPE_*
1325 * flags set.
1326 */
1327 union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level,
1328 enum gl_access_qualifier access)
1329 {
1330 union ac_hw_cache_flags result;
1331 result.value = 0;
1332
1333 assert(util_bitcount(access & (ACCESS_TYPE_LOAD | ACCESS_TYPE_STORE |
1334 ACCESS_TYPE_ATOMIC)) == 1);
1335 assert(!(access & ACCESS_TYPE_SMEM) || access & ACCESS_TYPE_LOAD);
1336 assert(!(access & ACCESS_IS_SWIZZLED_AMD) || !(access & ACCESS_TYPE_SMEM));
1337 assert(!(access & ACCESS_MAY_STORE_SUBDWORD) || access & ACCESS_TYPE_STORE);
1338
1339 bool scope_is_device = access & (ACCESS_COHERENT | ACCESS_VOLATILE);
1340
1341 if (gfx_level >= GFX12) {
1342 if (access & ACCESS_CP_GE_COHERENT_AMD) {
1343 bool cp_sdma_ge_use_system_memory_scope = gfx_level == GFX12;
1344 result.gfx12.scope = cp_sdma_ge_use_system_memory_scope ?
1345 gfx12_scope_memory : gfx12_scope_device;
1346 } else if (scope_is_device) {
1347 result.gfx12.scope = gfx12_scope_device;
1348 } else {
1349 result.gfx12.scope = gfx12_scope_cu;
1350 }
1351
1352 if (access & ACCESS_NON_TEMPORAL) {
1353 if (access & ACCESS_TYPE_LOAD) {
1354 /* Don't use non_temporal for SMEM because it can't set regular_temporal for MALL. */
1355 if (!(access & ACCESS_TYPE_SMEM))
1356 result.gfx12.temporal_hint = gfx12_load_near_non_temporal_far_regular_temporal;
1357 } else if (access & ACCESS_TYPE_STORE) {
1358 result.gfx12.temporal_hint = gfx12_store_near_non_temporal_far_regular_temporal;
1359 } else {
1360 result.gfx12.temporal_hint = gfx12_atomic_non_temporal;
1361 }
1362 }
1363 } else if (gfx_level >= GFX11) {
1364 /* GFX11 simplified it and exposes what is actually useful.
1365 *
1366 * GLC means device scope for loads only. (stores and atomics are always device scope)
1367 * SLC means non-temporal for GL1 and GL2 caches. (GL1 = hit-evict, GL2 = stream, unavailable in SMEM)
1368 * DLC means non-temporal for MALL. (noalloc, i.e. coherent bypass)
1369 *
1370 * GL0 doesn't have a non-temporal flag, so you always get LRU caching in CU scope.
1371 */
1372 if (access & ACCESS_TYPE_LOAD && scope_is_device)
1373 result.value |= ac_glc;
1374
1375 if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
1376 result.value |= ac_slc;
1377 } else if (gfx_level >= GFX10) {
1378 /* GFX10-10.3:
1379 *
1380 * VMEM and SMEM loads (SMEM only supports the first four):
1381 * !GLC && !DLC && !SLC means CU scope <== use for normal loads with CU scope
1382 * GLC && !DLC && !SLC means SA scope
1383 * !GLC && DLC && !SLC means CU scope, GL1 bypass
1384 * GLC && DLC && !SLC means device scope <== use for normal loads with device scope
1385 * !GLC && !DLC && SLC means CU scope, non-temporal (GL0 = GL1 = hit-evict, GL2 = stream) <== use for non-temporal loads with CU scope
1386 * GLC && !DLC && SLC means SA scope, non-temporal (GL1 = hit-evict, GL2 = stream)
1387 * !GLC && DLC && SLC means CU scope, GL0 non-temporal, GL1-GL2 coherent bypass (GL0 = hit-evict, GL1 = bypass, GL2 = noalloc)
1388 * GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc) <== use for non-temporal loads with device scope
1389 *
1390 * VMEM stores/atomics (stores are CU scope only if they overwrite the whole cache line,
1391 * atomics are always device scope, GL1 is always bypassed):
1392 * !GLC && !DLC && !SLC means CU scope <== use for normal stores with CU scope
1393 * GLC && !DLC && !SLC means device scope <== use for normal stores with device scope
1394 * !GLC && DLC && !SLC means CU scope, GL2 non-coherent bypass
1395 * GLC && DLC && !SLC means device scope, GL2 non-coherent bypass
1396 * !GLC && !DLC && SLC means CU scope, GL2 non-temporal (stream) <== use for non-temporal stores with CU scope
1397 * GLC && !DLC && SLC means device scope, GL2 non-temporal (stream) <== use for non-temporal stores with device scope
1398 * !GLC && DLC && SLC means CU scope, GL2 coherent bypass (noalloc)
1399 * GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc)
1400 *
1401 * "stream" allows write combining in GL2. "coherent bypass" doesn't.
1402 * "non-coherent bypass" doesn't guarantee ordering with any coherent stores.
1403 */
1404 if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC))
1405 result.value |= ac_glc | (access & ACCESS_TYPE_LOAD ? ac_dlc : 0);
1406
1407 if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
1408 result.value |= ac_slc;
1409 } else {
1410 /* GFX6-GFX9:
1411 *
1412 * VMEM loads:
1413 * !GLC && !SLC means CU scope
1414 * GLC && !SLC means (GFX6: device scope, GFX7-9: device scope [*])
1415 * !GLC && SLC means (GFX6: CU scope, GFX7: device scope, GFX8-9: CU scope), GL2 non-temporal (stream)
1416 * GLC && SLC means device scope, GL2 non-temporal (stream)
1417 *
1418 * VMEM stores (atomics don't have [*]):
1419 * !GLC && !SLC means (GFX6: CU scope, GFX7-9: device scope [*])
1420 * GLC && !SLC means (GFX6-7: device scope, GFX8-9: device scope [*])
1421 * !GLC && SLC means (GFX6: CU scope, GFX7-9: device scope [*]), GL2 non-temporal (stream)
1422 * GLC && SLC means device scope, GL2 non-temporal (stream)
1423 *
1424 * [*] data can be cached in GL1 for future CU scope
1425 *
1426 * SMEM loads:
1427 * GLC means device scope (available on GFX8+)
1428 */
1429 if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC)) {
1430 /* SMEM doesn't support the device scope on GFX6-7. */
1431 assert(gfx_level >= GFX8 || !(access & ACCESS_TYPE_SMEM));
1432 result.value |= ac_glc;
1433 }
1434
1435 if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
1436 result.value |= ac_slc;
1437
1438 /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All store opcodes not
1439 * aligned to a dword are affected.
1440 */
1441 if (gfx_level == GFX6 && access & ACCESS_MAY_STORE_SUBDWORD)
1442 result.value |= ac_glc;
1443 }
1444
1445 if (access & ACCESS_IS_SWIZZLED_AMD) {
1446 if (gfx_level >= GFX12)
1447 result.gfx12.swizzled = true;
1448 else
1449 result.value |= ac_swizzled;
1450 }
1451
1452 return result;
1453 }
1454
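/* Edge-flag bit positions (one per triangle vertex) in the packed primitive export. */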
1455 unsigned ac_get_all_edge_flag_bits(enum amd_gfx_level gfx_level)
1456 {
1457 return gfx_level >= GFX12 ?
1458 ((1u << 8) | (1u << 17) | (1u << 26)) :
1459 ((1u << 9) | (1u << 19) | (1u << 29));
1460 }
1461
1462 /**
1463 * Returns a unique index for a per-patch semantic name and index. The index
1464 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
1465 * can be calculated.
1466 */
1467 unsigned
1468 ac_shader_io_get_unique_index_patch(unsigned semantic)
1469 {
1470 switch (semantic) {
1471 case VARYING_SLOT_TESS_LEVEL_OUTER:
1472 return 0;
1473 case VARYING_SLOT_TESS_LEVEL_INNER:
1474 return 1;
1475 default:
1476 if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
1477 return 2 + (semantic - VARYING_SLOT_PATCH0);
1478
1479 assert(!"invalid semantic");
1480 return 0;
1481 }
1482 }
1483