xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_shaderlib_nir.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2018 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "gallium/auxiliary/nir/pipe_nir.h"
8 #define AC_SURFACE_INCLUDE_NIR
9 #include "ac_surface.h"
10 #include "si_pipe.h"
11 #include "si_query.h"
12 #include "aco_interface.h"
13 #include "nir_format_convert.h"
14 #include "ac_nir_helpers.h"
15 
/* Pass the NIR shader through the screen's finalize_nir hook and return
 * whatever pipe_shader_from_nir() creates for it on this context.
 */
void *si_create_shader_state(struct si_context *sctx, nir_shader *nir)
{
   struct pipe_screen *screen = sctx->b.screen;

   screen->finalize_nir(screen, (void*)nir);

   return pipe_shader_from_nir(&sctx->b, nir);
}
21 
22 /* unpack_2x16(src, x, y): x = src & 0xffff; y = src >> 16; */
unpack_2x16(nir_builder * b,nir_def * src,nir_def ** x,nir_def ** y)23 static void unpack_2x16(nir_builder *b, nir_def *src, nir_def **x, nir_def **y)
24 {
25    *x = nir_iand_imm(b, src, 0xffff);
26    *y = nir_ushr_imm(b, src, 16);
27 }
28 
si_create_dcc_retile_cs(struct si_context * sctx,struct radeon_surf * surf)29 void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
30 {
31    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
32                                                   "dcc_retile");
33    b.shader->info.workgroup_size[0] = 8;
34    b.shader->info.workgroup_size[1] = 8;
35    b.shader->info.workgroup_size[2] = 1;
36    b.shader->info.cs.user_data_components_amd = 3;
37    b.shader->info.num_ssbos = 1;
38 
39    /* Get user data SGPRs. */
40    nir_def *user_sgprs = nir_load_user_data_amd(&b);
41 
42    /* Relative offset from the displayable DCC to the non-displayable DCC in the same buffer. */
43    nir_def *src_dcc_offset = nir_channel(&b, user_sgprs, 0);
44 
45    nir_def *src_dcc_pitch, *dst_dcc_pitch, *src_dcc_height, *dst_dcc_height;
46    unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &src_dcc_pitch, &src_dcc_height);
47    unpack_2x16(&b, nir_channel(&b, user_sgprs, 2), &dst_dcc_pitch, &dst_dcc_height);
48 
49    /* Get the 2D coordinates. */
50    nir_def *coord = ac_get_global_ids(&b, 2, 32);
51    nir_def *zero = nir_imm_int(&b, 0);
52 
53    /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
54    coord = nir_imul(&b, coord, nir_imm_ivec2(&b, surf->u.gfx9.color.dcc_block_width,
55                                              surf->u.gfx9.color.dcc_block_height));
56 
57    nir_def *src_offset =
58       ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, surf->bpe, &surf->u.gfx9.color.dcc_equation,
59                                  src_dcc_pitch, src_dcc_height, zero, /* DCC slice size */
60                                  nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
61                                  zero, zero, zero); /* z, sample, pipe_xor */
62    src_offset = nir_iadd(&b, src_offset, src_dcc_offset);
63    nir_def *value = nir_load_ssbo(&b, 1, 8, zero, src_offset, .align_mul=1);
64 
65    nir_def *dst_offset =
66       ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, surf->bpe, &surf->u.gfx9.color.display_dcc_equation,
67                                  dst_dcc_pitch, dst_dcc_height, zero, /* DCC slice size */
68                                  nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
69                                  zero, zero, zero); /* z, sample, pipe_xor */
70    nir_store_ssbo(&b, value, zero, dst_offset, .write_mask=0x1, .align_mul=1);
71 
72    return si_create_shader_state(sctx, b.shader);
73 }
74 
gfx9_create_clear_dcc_msaa_cs(struct si_context * sctx,struct si_texture * tex)75 void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex)
76 {
77    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
78                                                   "clear_dcc_msaa");
79    b.shader->info.workgroup_size[0] = 8;
80    b.shader->info.workgroup_size[1] = 8;
81    b.shader->info.workgroup_size[2] = 1;
82    b.shader->info.cs.user_data_components_amd = 2;
83    b.shader->info.num_ssbos = 1;
84 
85    /* Get user data SGPRs. */
86    nir_def *user_sgprs = nir_load_user_data_amd(&b);
87    nir_def *dcc_pitch, *dcc_height, *clear_value, *pipe_xor;
88    unpack_2x16(&b, nir_channel(&b, user_sgprs, 0), &dcc_pitch, &dcc_height);
89    unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &clear_value, &pipe_xor);
90    clear_value = nir_u2u16(&b, clear_value);
91 
92    /* Get the 2D coordinates. */
93    nir_def *coord = ac_get_global_ids(&b, 3, 32);
94    nir_def *zero = nir_imm_int(&b, 0);
95 
96    /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
97    coord = nir_imul(&b, coord,
98                     nir_imm_ivec3(&b, tex->surface.u.gfx9.color.dcc_block_width,
99                                       tex->surface.u.gfx9.color.dcc_block_height,
100                                       tex->surface.u.gfx9.color.dcc_block_depth));
101 
102    nir_def *offset =
103       ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, tex->surface.bpe,
104                                  &tex->surface.u.gfx9.color.dcc_equation,
105                                  dcc_pitch, dcc_height, zero, /* DCC slice size */
106                                  nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
107                                  tex->buffer.b.b.array_size > 1 ? nir_channel(&b, coord, 2) : zero, /* z */
108                                  zero, pipe_xor); /* sample, pipe_xor */
109 
110    /* The trick here is that DCC elements for an even and the next odd sample are next to each other
111     * in memory, so we only need to compute the address for sample 0 and the next DCC byte is always
112     * sample 1. That's why the clear value has 2 bytes - we're clearing 2 samples at the same time.
113     */
114    nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2);
115 
116    return si_create_shader_state(sctx, b.shader);
117 }
118 
119 /* Create a compute shader implementing clear_buffer or copy_buffer. */
si_create_clear_buffer_rmw_cs(struct si_context * sctx)120 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx)
121 {
122    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
123                                                   "clear_buffer_rmw_cs");
124    b.shader->info.workgroup_size[0] = 64;
125    b.shader->info.workgroup_size[1] = 1;
126    b.shader->info.workgroup_size[2] = 1;
127    b.shader->info.cs.user_data_components_amd = 2;
128    b.shader->info.num_ssbos = 1;
129 
130    /* address = blockID * 64 + threadID; */
131    nir_def *address = ac_get_global_ids(&b, 1, 32);
132 
133    /* address = address * 16; (byte offset, loading one vec4 per thread) */
134    address = nir_ishl_imm(&b, address, 4);
135 
136    nir_def *zero = nir_imm_int(&b, 0);
137    nir_def *data = nir_load_ssbo(&b, 4, 32, zero, address, .align_mul = 4);
138 
139    /* Get user data SGPRs. */
140    nir_def *user_sgprs = nir_load_user_data_amd(&b);
141 
142    /* data &= inverted_writemask; */
143    data = nir_iand(&b, data, nir_channel(&b, user_sgprs, 1));
144    /* data |= clear_value_masked; */
145    data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0));
146 
147    nir_store_ssbo(&b, data, zero, address, .align_mul = 4);
148 
149    return si_create_shader_state(sctx, b.shader);
150 }
151 
152 /* This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
153  * VS passes its outputs to TES directly, so the fixed-function shader only
154  * has to write TESSOUTER and TESSINNER.
155  */
si_create_passthrough_tcs(struct si_context * sctx)156 void *si_create_passthrough_tcs(struct si_context *sctx)
157 {
158    unsigned locations[PIPE_MAX_SHADER_OUTPUTS];
159 
160    struct si_shader_info *info = &sctx->shader.vs.cso->info;
161    for (unsigned i = 0; i < info->num_outputs; i++) {
162       locations[i] = info->output_semantic[i];
163    }
164 
165    nir_shader *tcs = nir_create_passthrough_tcs_impl(sctx->screen->nir_options, locations,
166                                                      info->num_outputs, sctx->patch_vertices);
167 
168    return si_create_shader_state(sctx, tcs);
169 }
170 
171 /* Store the clear color at the beginning of every 256B block. This is required when we clear DCC
172  * to GFX11_DCC_CLEAR_SINGLE.
173  */
si_clear_image_dcc_single_shader(struct si_context * sctx,bool is_msaa,unsigned wg_dim)174 void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim)
175 {
176    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
177                                                   "write_clear_color_dcc_single");
178    b.shader->info.num_images = 1;
179    if (is_msaa)
180       BITSET_SET(b.shader->info.msaa_images, 0);
181    b.shader->info.workgroup_size[0] = 8;
182    b.shader->info.workgroup_size[1] = 8;
183    b.shader->info.cs.user_data_components_amd = 5;
184 
185    const struct glsl_type *img_type =
186       glsl_image_type(is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_FLOAT);
187    nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img");
188    output_img->data.binding = 0;
189 
190    nir_def *global_id = nir_pad_vector_imm_int(&b, ac_get_global_ids(&b, wg_dim, 32), 0, 3);
191    nir_def *clear_color = nir_trim_vector(&b, nir_load_user_data_amd(&b), 4);
192 
193    nir_def *dcc_block_width, *dcc_block_height;
194    unpack_2x16(&b, nir_channel(&b, nir_load_user_data_amd(&b), 4), &dcc_block_width,
195                &dcc_block_height);
196 
197    /* Compute the coordinates. */
198    nir_def *coord = nir_trim_vector(&b, global_id, 2);
199    coord = nir_imul(&b, coord, nir_vec2(&b, dcc_block_width, dcc_block_height));
200    coord = nir_vec4(&b, nir_channel(&b, coord, 0), nir_channel(&b, coord, 1),
201                     nir_channel(&b, global_id, 2), nir_undef(&b, 1, 32));
202 
203    /* Store the clear color. */
204    nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_imm_int(&b, 0),
205                          clear_color, nir_imm_int(&b, 0));
206 
207    return si_create_shader_state(sctx, b.shader);
208 }
209 
si_create_ubyte_to_ushort_compute_shader(struct si_context * sctx)210 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
211 {
212    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
213                                                   "ubyte_to_ushort");
214    b.shader->info.workgroup_size[0] = 64;
215    b.shader->info.workgroup_size[1] = 1;
216    b.shader->info.workgroup_size[2] = 1;
217    b.shader->info.num_ssbos = 2;
218 
219    nir_def *load_address = ac_get_global_ids(&b, 1, 32);
220    nir_def *store_address = nir_imul_imm(&b, load_address, 2);
221 
222    nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
223                                         load_address, .access = ACCESS_RESTRICT);
224    nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0),
225                   store_address, .access = ACCESS_RESTRICT);
226 
227    return si_create_shader_state(sctx, b.shader);
228 }
229 
230 /* Load samples from the image, and copy them to the same image. This looks like
231  * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
232  * reordered to match expanded FMASK.
233  *
234  * After the shader finishes, FMASK should be cleared to identity.
235  */
si_create_fmask_expand_cs(struct si_context * sctx,unsigned num_samples,bool is_array)236 void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array)
237 {
238    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
239                                                   "create_fmask_expand_cs");
240    b.shader->info.workgroup_size[0] = 8;
241    b.shader->info.workgroup_size[1] = 8;
242    b.shader->info.workgroup_size[2] = 1;
243 
244    /* Return an empty compute shader */
245    if (num_samples == 0)
246       return si_create_shader_state(sctx, b.shader);
247 
248    b.shader->info.num_images = 1;
249 
250    const struct glsl_type *img_type = glsl_image_type(GLSL_SAMPLER_DIM_MS, is_array, GLSL_TYPE_FLOAT);
251    nir_variable *img = nir_variable_create(b.shader, nir_var_image, img_type, "image");
252    img->data.access = ACCESS_RESTRICT;
253 
254    nir_def *z = nir_undef(&b, 1, 32);
255    if (is_array) {
256       z = nir_channel(&b, nir_load_workgroup_id(&b), 2);
257    }
258 
259    nir_def *zero = nir_imm_int(&b, 0);
260    nir_def *address = ac_get_global_ids(&b, 2, 32);
261 
262    nir_def *sample[8], *addresses[8];
263    assert(num_samples <= ARRAY_SIZE(sample));
264 
265    nir_def *img_def = &nir_build_deref_var(&b, img)->def;
266 
267    /* Load samples, resolving FMASK. */
268    for (unsigned i = 0; i < num_samples; i++) {
269       nir_def *it = nir_imm_int(&b, i);
270       sample[i] = nir_vec4(&b, nir_channel(&b, address, 0), nir_channel(&b, address, 1), z, it);
271       addresses[i] = nir_image_deref_load(&b, 4, 32, img_def, sample[i], it, zero,
272                                           .access = ACCESS_RESTRICT,
273                                           .image_dim = GLSL_SAMPLER_DIM_2D,
274                                           .image_array = is_array);
275    }
276 
277    /* Store samples, ignoring FMASK. */
278    for (unsigned i = 0; i < num_samples; i++) {
279       nir_image_deref_store(&b, img_def, sample[i], nir_imm_int(&b, i), addresses[i], zero,
280                             .access = ACCESS_RESTRICT,
281                             .image_dim = GLSL_SAMPLER_DIM_2D,
282                             .image_array = is_array);
283    }
284 
285    return si_create_shader_state(sctx, b.shader);
286 }
287 
288 /* This is just a pass-through shader with 1-3 MOV instructions. */
si_get_blitter_vs(struct si_context * sctx,enum blitter_attrib_type type,unsigned num_layers)289 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers)
290 {
291    unsigned vs_blit_property;
292    void **vs;
293 
294    switch (type) {
295    case UTIL_BLITTER_ATTRIB_NONE:
296       vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos;
297       vs_blit_property = SI_VS_BLIT_SGPRS_POS;
298       break;
299    case UTIL_BLITTER_ATTRIB_COLOR:
300       vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color;
301       vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
302       break;
303    case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
304    case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
305       assert(num_layers == 1);
306       vs = &sctx->vs_blit_texcoord;
307       vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
308       break;
309    default:
310       assert(0);
311       return NULL;
312    }
313 
314    if (*vs)
315       return *vs;
316 
317    /* Add 1 for the attribute ring address. */
318    if (sctx->gfx_level >= GFX11 && type != UTIL_BLITTER_ATTRIB_NONE)
319       vs_blit_property++;
320 
321    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, sctx->screen->nir_options,
322                                                   "get_blitter_vs");
323 
324    /* Tell the shader to load VS inputs from SGPRs: */
325    b.shader->info.vs.blit_sgprs_amd = vs_blit_property;
326    b.shader->info.vs.window_space_position = true;
327 
328    const struct glsl_type *vec4 = glsl_vec4_type();
329 
330    nir_copy_var(&b,
331                 nir_create_variable_with_location(b.shader, nir_var_shader_out,
332                                                   VARYING_SLOT_POS, vec4),
333                 nir_create_variable_with_location(b.shader, nir_var_shader_in,
334                                                   VERT_ATTRIB_GENERIC0, vec4));
335 
336    if (type != UTIL_BLITTER_ATTRIB_NONE) {
337       nir_copy_var(&b,
338                    nir_create_variable_with_location(b.shader, nir_var_shader_out,
339                                                      VARYING_SLOT_VAR0, vec4),
340                    nir_create_variable_with_location(b.shader, nir_var_shader_in,
341                                                      VERT_ATTRIB_GENERIC1, vec4));
342    }
343 
344    if (num_layers > 1) {
345       nir_variable *out_layer =
346          nir_create_variable_with_location(b.shader, nir_var_shader_out,
347                                            VARYING_SLOT_LAYER, glsl_int_type());
348       out_layer->data.interpolation = INTERP_MODE_NONE;
349 
350       nir_copy_var(&b, out_layer,
351                    nir_create_variable_with_location(b.shader, nir_var_system_value,
352                                                      SYSTEM_VALUE_INSTANCE_ID, glsl_int_type()));
353    }
354 
355    *vs = si_create_shader_state(sctx, b.shader);
356    return *vs;
357 }
358 
359 /* Create the compute shader that is used to collect the results.
360  *
361  * One compute grid with a single thread is launched for every query result
362  * buffer. The thread (optionally) reads a previous summary buffer, then
363  * accumulates data from the query result buffer, and writes the result either
364  * to a summary buffer to be consumed by the next grid invocation or to the
365  * user-supplied buffer.
366  *
367  * Data layout:
368  *
369  * CONST
370  *  0.x = end_offset
371  *  0.y = result_stride
372  *  0.z = result_count
373  *  0.w = bit field:
374  *          1: read previously accumulated values
375  *          2: write accumulated values for chaining
376  *          4: write result available
377  *          8: convert result to boolean (0/1)
378  *         16: only read one dword and use that as result
379  *         32: apply timestamp conversion
380  *         64: store full 64 bits result
381  *        128: store signed 32 bits result
382  *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
383  *  1.x = fence_offset
384  *  1.y = pair_stride
385  *  1.z = pair_count
386  *
387  */
si_create_query_result_cs(struct si_context * sctx)388 void *si_create_query_result_cs(struct si_context *sctx)
389 {
390    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
391                                                   "create_query_result_cs");
392    b.shader->info.workgroup_size[0] = 1;
393    b.shader->info.workgroup_size[1] = 1;
394    b.shader->info.workgroup_size[2] = 1;
395    b.shader->info.num_ubos = 1;
396    b.shader->info.num_ssbos = 3;
397    b.shader->num_uniforms = 2;
398 
399    nir_def *var_undef = nir_undef(&b, 1, 32);
400    nir_def *zero = nir_imm_int(&b, 0);
401    nir_def *one = nir_imm_int(&b, 1);
402    nir_def *two = nir_imm_int(&b, 2);
403    nir_def *four = nir_imm_int(&b, 4);
404    nir_def *eight = nir_imm_int(&b, 8);
405    nir_def *sixteen = nir_imm_int(&b, 16);
406    nir_def *thirty_one = nir_imm_int(&b, 31);
407    nir_def *sixty_four = nir_imm_int(&b, 64);
408 
409    /* uint32_t x, y, z = 0; */
410    nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
411    nir_variable *x = nir_local_variable_create(e, glsl_uint_type(), "x");
412    nir_store_var(&b, x, var_undef, 0x1);
413    nir_variable *y = nir_local_variable_create(e, glsl_uint_type(), "y");
414    nir_store_var(&b, y, var_undef, 0x1);
415    nir_variable *z = nir_local_variable_create(e, glsl_uint_type(), "z");
416    nir_store_var(&b, z, zero, 0x1);
417 
418    /* uint32_t buff_0[4] = load_ubo(0, 0); */
419    nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
420    /* uint32_t buff_1[4] = load_ubo(1, 16); */
421    nir_def *buff_1 = nir_load_ubo(&b, 4, 32, zero, sixteen, .range_base = 16, .range = 16);
422 
423    /* uint32_t b0_bitfield = buff_0.w; */
424    nir_def *b0_bitfield = nir_channel(&b, buff_0, 3);
425 
426    /* Check result availability.
427     *    if (b0_bitfield & (1u << 4)) {
428     *       ...
429     */
430    nir_def *is_one_dword_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixteen));
431    nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); {
432 
433       /*   int32_t value = load_ssbo(0, fence_offset);
434        *   z = ~(value >> 31);
435        */
436       nir_def *value = nir_load_ssbo(&b, 1, 32, zero, nir_channel(&b, buff_1, 0));
437       nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
438       nir_store_var(&b, z, bitmask, 0x1);
439 
440       /* Load result if available.
441        *    if (value < 0) {
442        *       uint32_t result[2] = load_ssbo(0, 0);
443        *       x = result[0];
444        *       y = result[1];
445        *    }
446        */
447       nir_if *if_negative = nir_push_if(&b, nir_ilt(&b, value, zero)); {
448          nir_def *result = nir_load_ssbo(&b, 2, 32, zero, zero);
449          nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
450          nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
451       }
452       nir_pop_if(&b, if_negative);
453    } nir_push_else(&b, if_one_dword_result); {
454 
455       /* } else {
456        *    x = 0; y = 0;
457        */
458       nir_store_var(&b, x, zero, 0x1);
459       nir_store_var(&b, y, zero, 0x1);
460 
461       /* Load previously accumulated result if requested.
462        *    if (b0_bitfield & (1u << 0)) {
463        *       uint32_t result[3] = load_ssbo(1, 0);
464        *       x = result[0];
465        *       y = result[1];
466        *       z = result[2];
467        *    }
468        */
469       nir_def *is_prev_acc_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, one));
470       nir_if *if_prev_acc_result = nir_push_if(&b, is_prev_acc_result); {
471          nir_def *result = nir_load_ssbo(&b, 3, 32, one, zero);
472          nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
473          nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
474          nir_store_var(&b, z, nir_channel(&b, result, 2), 0x1);
475       }
476       nir_pop_if(&b, if_prev_acc_result);
477 
478       /* if (!z) {
479        *    uint32_t result_index = 0;
480        *    uint32_t pitch = 0;
481        *    ...
482        */
483       nir_def *z_value = nir_load_var(&b, z);
484       nir_if *if_not_z = nir_push_if(&b, nir_ieq(&b, z_value, zero)); {
485          nir_variable *outer_loop_iter =
486             nir_local_variable_create(e, glsl_uint_type(), "outer_loop_iter");
487          nir_store_var(&b, outer_loop_iter, zero, 0x1);
488          nir_variable *pitch = nir_local_variable_create(e, glsl_uint_type(), "pitch");
489          nir_store_var(&b, pitch, zero, 0x1);
490 
491          /* Outer loop.
492           *   while (result_index <= result_count) {
493           *      ...
494           */
495          nir_loop *loop_outer = nir_push_loop(&b); {
496             nir_def *result_index = nir_load_var(&b, outer_loop_iter);
497             nir_def *is_result_index_out_of_bound =
498                nir_uge(&b, result_index, nir_channel(&b, buff_0, 2));
499             nir_if *if_out_of_bound = nir_push_if(&b, is_result_index_out_of_bound); {
500                nir_jump(&b, nir_jump_break);
501             }
502             nir_pop_if(&b, if_out_of_bound);
503 
504             /* Load fence and check result availability.
505              *    pitch = i * result_stride;
506              *    uint32_t address = fence_offset + pitch;
507              *    int32_t value = load_ssbo(0, address);
508              *    z = ~(value >> 31);
509              */
510             nir_def *pitch_outer_loop = nir_imul(&b, result_index, nir_channel(&b, buff_0, 1));
511             nir_store_var(&b, pitch, pitch_outer_loop, 0x1);
512             nir_def *address = nir_iadd(&b, pitch_outer_loop, nir_channel(&b, buff_1, 0));
513             nir_def *value = nir_load_ssbo(&b, 1, 32, zero, address);
514             nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
515             nir_store_var(&b, z, bitmask, 0x1);
516 
517             /*    if (z) {
518              *       break;
519              *    }
520              */
521             nir_if *if_result_available = nir_push_if(&b, nir_i2b(&b, bitmask)); {
522                nir_jump(&b, nir_jump_break);
523             }
524             nir_pop_if(&b, if_result_available);
525 
526             /* Inner loop iterator.
527              *    uint32_t i = 0;
528              */
529             nir_variable *inner_loop_iter =
530                nir_local_variable_create(e, glsl_uint_type(), "inner_loop_iter");
531             nir_store_var(&b, inner_loop_iter, zero, 0x1);
532 
533             /* Inner loop.
534              *    do {
535              *       ...
536              */
537             nir_loop *loop_inner = nir_push_loop(&b); {
538                nir_def *pitch_inner_loop = nir_load_var(&b, pitch);
539                nir_def *i = nir_load_var(&b, inner_loop_iter);
540 
541                /* Load start and end.
542                 *    uint64_t first = load_ssbo(0, pitch);
543                 *    uint64_t second = load_ssbo(0, pitch + end_offset);
544                 *    uint64_t start_half_pair = second - first;
545                 */
546                nir_def *first = nir_load_ssbo(&b, 1, 64, zero, pitch_inner_loop);
547                nir_def *new_pitch = nir_iadd(&b, pitch_inner_loop, nir_channel(&b, buff_0, 0));
548                nir_def *second = nir_load_ssbo(&b, 1, 64, zero, new_pitch);
549                nir_def *start_half_pair = nir_isub(&b, second, first);
550 
551                /* Load second start/end half-pair and take the difference.
552                 *    if (b0_bitfield & (1u << 8)) {
553                 *       uint64_t first = load_ssbo(0, pitch + 8);
554                 *       uint64_t second = load_ssbo(0, pitch + end_offset + 8);
555                 *       uint64_t end_half_pair = second - first;
556                 *       uint64_t difference = start_half_pair - end_half_pair;
557                 *    }
558                 */
559                nir_def *difference;
560                nir_def *is_so_overflow_mode = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 256));
561                nir_if *if_so_overflow_mode = nir_push_if(&b, is_so_overflow_mode); {
562                   first = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, pitch_inner_loop, eight));
563                   second = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, new_pitch, eight));
564                   nir_def *end_half_pair = nir_isub(&b, second, first);
565                   difference = nir_isub(&b, start_half_pair, end_half_pair);
566                }
567                nir_pop_if(&b, if_so_overflow_mode);
568 
569                /* uint64_t sum = (x | (uint64_t) y << 32) + difference; */
570                nir_def *sum = nir_iadd(&b,
571                                        nir_pack_64_2x32_split(&b,
572                                                               nir_load_var(&b, x),
573                                                               nir_load_var(&b, y)),
574                                        nir_if_phi(&b, difference, start_half_pair));
575                sum = nir_unpack_64_2x32(&b, sum);
576 
577                /* Increment inner loop iterator.
578                 *    i++;
579                 */
580                i = nir_iadd(&b, i, one);
581                nir_store_var(&b, inner_loop_iter, i, 0x1);
582 
583                /* Update pitch value.
584                 *    pitch = i * pair_stride + pitch;
585                 */
586                nir_def *incremented_pitch = nir_iadd(&b,
587                                              nir_imul(&b, i, nir_channel(&b, buff_1, 1)),
588                                              pitch_outer_loop);
589                nir_store_var(&b, pitch, incremented_pitch, 0x1);
590 
591                /* Update x and y.
592                 *    x = sum.x;
593                 *    y = sum.x >> 32;
594                 */
595                nir_store_var(&b, x, nir_channel(&b, sum, 0), 0x1);
596                nir_store_var(&b, y, nir_channel(&b, sum, 1), 0x1);
597 
598                /* } while (i < pair_count);
599                */
600                nir_def *is_pair_count_exceeded = nir_uge(&b, i, nir_channel(&b, buff_1, 2));
601                nir_if *if_pair_count_exceeded = nir_push_if(&b, is_pair_count_exceeded); {
602                   nir_jump(&b, nir_jump_break);
603                }
604                nir_pop_if(&b, if_pair_count_exceeded);
605             }
606             nir_pop_loop(&b, loop_inner);
607 
608             /* Increment pair iterator.
609              *    result_index++;
610              */
611             nir_store_var(&b, outer_loop_iter, nir_iadd(&b, result_index, one), 0x1);
612          }
613          nir_pop_loop(&b, loop_outer);
614       }
615       nir_pop_if(&b, if_not_z);
616    }
617    nir_pop_if(&b, if_one_dword_result);
618 
619    nir_def *x_value = nir_load_var(&b, x);
620    nir_def *y_value = nir_load_var(&b, y);
621    nir_def *z_value = nir_load_var(&b, z);
622 
623    /* Store accumulated data for chaining.
624     *    if (b0_bitfield & (1u << 1)) {
625     *       store_ssbo(<x, y, z>, 2, 0);
626     */
627    nir_def *is_acc_chaining = nir_i2b(&b, nir_iand(&b, b0_bitfield, two));
628    nir_if *if_acc_chaining = nir_push_if(&b, is_acc_chaining); {
629       nir_store_ssbo(&b, nir_vec3(&b, x_value, y_value, z_value), two, zero);
630    } nir_push_else(&b, if_acc_chaining); {
631 
632       /* Store result availability.
633        *    } else {
634        *       if (b0_bitfield & (1u << 2)) {
635        *          store_ssbo((~z & 1), 2, 0);
636        *          ...
637        */
638       nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, b0_bitfield, four));
639       nir_if *if_result_available = nir_push_if(&b, is_result_available); {
640          nir_store_ssbo(&b, nir_iand(&b, nir_inot(&b, z_value), one), two, zero);
641 
642          /* Store full 64 bits result.
643           *    if (b0_bitfield & (1u << 6)) {
644           *       store_ssbo(<0, 0>, 2, 0);
645           *    }
646           */
647          nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
648          nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
649             nir_store_ssbo(&b, nir_imm_ivec2(&b, 0, 0), two, zero,
650                            .write_mask = (1u << 1));
651          }
652          nir_pop_if(&b, if_result_64_bits);
653       } nir_push_else(&b, if_result_available); {
654 
655          /* } else {
656           *    if (~z) {
657           *       ...
658           */
659          nir_def *is_bitwise_not_z = nir_i2b(&b, nir_inot(&b, z_value));
660          nir_if *if_bitwise_not_z = nir_push_if(&b, is_bitwise_not_z); {
661             nir_def *ts_x, *ts_y;
662 
663             /* Apply timestamp conversion.
664              *    if (b0_bitfield & (1u << 5)) {
665              *       uint64_t xy_million = (x | (uint64_t) y << 32) * (uint64_t) 1000000;
666              *       uint64_t ts_converted = xy_million / (uint64_t) clock_crystal_frequency;
667              *       x = ts_converted.x;
668              *       y = ts_converted.x >> 32;
669              *    }
670              */
671             nir_def *is_apply_timestamp = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 32));
672             nir_if *if_apply_timestamp = nir_push_if(&b, is_apply_timestamp); {
673                /* Add the frequency into the shader for timestamp conversion
674                 * so that the backend can use the full range of optimizations
675                 * for divide-by-constant.
676                 */
677                nir_def *clock_crystal_frequency =
678                   nir_imm_int64(&b, sctx->screen->info.clock_crystal_freq);
679 
680                nir_def *xy_million = nir_imul(&b,
681                                            nir_pack_64_2x32_split(&b, x_value, y_value),
682                                            nir_imm_int64(&b, 1000000));
683                nir_def *ts_converted = nir_udiv(&b, xy_million, clock_crystal_frequency);
684                ts_converted = nir_unpack_64_2x32(&b, ts_converted);
685                ts_x = nir_channel(&b, ts_converted, 0);
686                ts_y = nir_channel(&b, ts_converted, 1);
687             }
688             nir_pop_if(&b, if_apply_timestamp);
689 
690             nir_def *nx = nir_if_phi(&b, ts_x, x_value);
691             nir_def *ny = nir_if_phi(&b, ts_y, y_value);
692 
693             /* x = b0_bitfield & (1u << 3) ? ((x | (uint64_t) y << 32) != 0) : x;
694              * y = b0_bitfield & (1u << 3) ? 0 : y;
695              */
696             nir_def *is_convert_to_bool = nir_i2b(&b, nir_iand(&b, b0_bitfield, eight));
697             nir_def *xy = nir_pack_64_2x32_split(&b, nx, ny);
698             nir_def *is_xy = nir_b2i32(&b, nir_ine(&b, xy, nir_imm_int64(&b, 0)));
699             nx = nir_bcsel(&b, is_convert_to_bool, is_xy, nx);
700             ny = nir_bcsel(&b, is_convert_to_bool, zero, ny);
701 
702             /* if (b0_bitfield & (1u << 6)) {
703              *    store_ssbo(<x, y>, 2, 0);
704              * }
705              */
706             nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
707             nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
708                nir_store_ssbo(&b, nir_vec2(&b, nx, ny), two, zero);
709             } nir_push_else(&b, if_result_64_bits); {
710 
711                /* Clamping.
712                 *    } else {
713                 *       x = y ? UINT32_MAX : x;
714                 *       x = b0_bitfield & (1u << 7) ? min(x, INT_MAX) : x;
715                 *       store_ssbo(x, 2, 0);
716                 *    }
717                 */
718                nir_def *is_y = nir_ine(&b, ny, zero);
719                nx = nir_bcsel(&b, is_y, nir_imm_int(&b, UINT32_MAX), nx);
720                nir_def *is_signed_32bit_result = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 128));
721                nir_def *min = nir_umin(&b, nx, nir_imm_int(&b, INT_MAX));
722                nx = nir_bcsel(&b, is_signed_32bit_result, min, nx);
723                nir_store_ssbo(&b, nx, two, zero);
724             }
725             nir_pop_if(&b, if_result_64_bits);
726          }
727          nir_pop_if(&b, if_bitwise_not_z);
728       }
729       nir_pop_if(&b, if_result_available);
730    }
731    nir_pop_if(&b, if_acc_chaining);
732 
733    return si_create_shader_state(sctx, b.shader);
734 }
735 
736 /* Create the compute shader that is used to collect the results of gfx10+
737  * shader queries.
738  *
739  * One compute grid with a single thread is launched for every query result
740  * buffer. The thread (optionally) reads a previous summary buffer, then
741  * accumulates data from the query result buffer, and writes the result either
742  * to a summary buffer to be consumed by the next grid invocation or to the
743  * user-supplied buffer.
744  *
745  * Data layout:
746  *
747  * CONST
748  *  0.x = config;
749  *          [0:2] the low 3 bits indicate the mode:
750  *             0: sum up counts
751  *             1: determine result availability and write it as a boolean
752  *             2: SO_OVERFLOW
753  *          3: SO_ANY_OVERFLOW
754  *        the remaining bits form a bitfield:
755  *          8: write result as a 64-bit value
756  *  0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
757  *  0.z = chain bit field:
758  *          1: have previous summary buffer
759  *          2: write next summary buffer
760  *  0.w = result_count
761  */
gfx11_create_sh_query_result_cs(struct si_context * sctx)762 void *gfx11_create_sh_query_result_cs(struct si_context *sctx)
763 {
764    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
765                                                   "gfx11_create_sh_query_result_cs");
766    b.shader->info.workgroup_size[0] = 1;
767    b.shader->info.workgroup_size[1] = 1;
768    b.shader->info.workgroup_size[2] = 1;
769    b.shader->info.num_ubos = 1;
770    b.shader->info.num_ssbos = 3;
771    b.shader->num_uniforms = 1;
772 
773    nir_def *zero = nir_imm_int(&b, 0);
774    nir_def *one = nir_imm_int(&b, 1);
775    nir_def *two = nir_imm_int(&b, 2);
776    nir_def *four = nir_imm_int(&b, 4);
777    nir_def *minus_one = nir_imm_int(&b, 0xffffffff);
778 
779    /* uint32_t acc_result = 0, acc_missing = 0; */
780    nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
781    nir_variable *acc_result = nir_local_variable_create(e, glsl_uint_type(), "acc_result");
782    nir_store_var(&b, acc_result, zero, 0x1);
783    nir_variable *acc_missing = nir_local_variable_create(e, glsl_uint_type(), "acc_missing");
784    nir_store_var(&b, acc_missing, zero, 0x1);
785 
786    /* uint32_t buff_0[4] = load_ubo(0, 0); */
787    nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
788 
789    /* if((chain & 1) {
790     *    uint32_t result[2] = load_ssbo(1, 0);
791     *    acc_result = result[0];
792     *    acc_missing = result[1];
793     * }
794     */
795    nir_def *is_prev_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), one));
796    nir_if *if_prev_summary_buffer = nir_push_if(&b, is_prev_summary_buffer); {
797       nir_def *result = nir_load_ssbo(&b, 2, 32, one, zero);
798          nir_store_var(&b, acc_result, nir_channel(&b, result, 0), 0x1);
799          nir_store_var(&b, acc_missing, nir_channel(&b, result, 1), 0x1);
800    }
801    nir_pop_if(&b, if_prev_summary_buffer);
802 
803    /* uint32_t mode = config & 0b111;
804     * bool is_overflow = mode >= 2;
805     */
806    nir_def *mode = nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 0b111);
807    nir_def *is_overflow = nir_uge(&b, mode, two);
808 
809    /* uint32_t result_remaining = (is_overflow && acc_result) ? 0 : result_count; */
810    nir_variable *result_remaining = nir_local_variable_create(e, glsl_uint_type(), "result_remaining");
811    nir_variable *base_offset = nir_local_variable_create(e, glsl_uint_type(), "base_offset");
812    nir_def *state = nir_iand(&b,
813                              nir_isub(&b, zero, nir_b2i32(&b, is_overflow)),
814                              nir_load_var(&b, acc_result));
815    nir_def *value = nir_bcsel(&b, nir_i2b(&b, state), zero, nir_channel(&b, buff_0, 3));
816    nir_store_var(&b, result_remaining, value, 0x1);
817 
818    /* uint32_t base_offset = 0; */
819    nir_store_var(&b, base_offset, zero, 0x1);
820 
821    /* Outer loop begin.
822     *   while (!result_remaining) {
823     *      ...
824     */
825    nir_loop *loop_outer = nir_push_loop(&b); {
826       nir_def *condition = nir_load_var(&b, result_remaining);
827       nir_if *if_not_condition = nir_push_if(&b, nir_ieq(&b, condition, zero)); {
828          nir_jump(&b, nir_jump_break);
829       }
830       nir_pop_if(&b, if_not_condition);
831 
832       /* result_remaining--; */
833       condition = nir_iadd(&b, condition, minus_one);
834       nir_store_var(&b, result_remaining, condition, 0x1);
835 
836       /* uint32_t fence = load_ssbo(0, base_offset + sizeof(gfx11_sh_query_buffer_mem.stream)); */
837       nir_def *b_offset = nir_load_var(&b, base_offset);
838       uint64_t buffer_mem_stream_size = sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream);
839       nir_def *fence = nir_load_ssbo(&b, 1, 32, zero,
840                                     nir_iadd_imm(&b, b_offset, buffer_mem_stream_size));
841 
842       /* if (!fence) {
843        *    acc_missing = ~0u;
844        *    break;
845        * }
846        */
847       nir_def *is_zero = nir_ieq(&b, fence, zero);
848       nir_def *y_value = nir_isub(&b, zero, nir_b2i32(&b, is_zero));
849       nir_store_var(&b, acc_missing, y_value, 0x1);
850       nir_if *if_ssbo_zero = nir_push_if(&b, is_zero); {
851          nir_jump(&b, nir_jump_break);
852       }
853       nir_pop_if(&b, if_ssbo_zero);
854 
855       /* stream_offset = base_offset + offset; */
856       nir_def *s_offset = nir_iadd(&b, b_offset, nir_channel(&b, buff_0, 1));
857 
858       /* if (!(config & 7)) {
859        *    acc_result += buffer[0]@stream_offset;
860        * }
861        */
862       nir_if *if_sum_up_counts = nir_push_if(&b, nir_ieq(&b, mode, zero)); {
863          nir_def *x_value = nir_load_ssbo(&b, 1, 32, zero, s_offset);
864          x_value = nir_iadd(&b, nir_load_var(&b, acc_result), x_value);
865          nir_store_var(&b, acc_result, x_value, 0x1);
866       }
867       nir_pop_if(&b, if_sum_up_counts);
868 
869       /* if (is_overflow) {
870        *    uint32_t count = (config & 1) ? 4 : 1;
871        *    ...
872        */
873       nir_if *if_overflow = nir_push_if(&b, is_overflow); {
874          nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, mode, one));
875          nir_def *initial_count = nir_bcsel(&b, is_result_available, four, one);
876 
877          nir_variable *count =
878             nir_local_variable_create(e, glsl_uint_type(), "count");
879          nir_store_var(&b, count, initial_count, 0x1);
880 
881          nir_variable *stream_offset =
882             nir_local_variable_create(e, glsl_uint_type(), "stream_offset");
883          nir_store_var(&b, stream_offset, s_offset, 0x1);
884 
885          /* Inner loop begin.
886           *    do {
887           *       ...
888           */
889          nir_loop *loop_inner = nir_push_loop(&b); {
890             /* uint32_t buffer[4] = load_ssbo(0, stream_offset + 2 * sizeof(uint64_t)); */
891             nir_def *stream_offset_value = nir_load_var(&b, stream_offset);
892             nir_def *buffer =
893                nir_load_ssbo(&b, 4, 32, zero,
894                              nir_iadd_imm(&b, stream_offset_value, 2 * sizeof(uint64_t)));
895 
896             /* if (generated != emitted) {
897              *    acc_result = 1;
898              *    base_offset = 0;
899              *    break;
900              * }
901              */
902             nir_def *generated = nir_channel(&b, buffer, 0);
903             nir_def *emitted = nir_channel(&b, buffer, 2);
904             nir_if *if_not_equal = nir_push_if(&b, nir_ine(&b, generated, emitted)); {
905                nir_store_var(&b, acc_result, one, 0x1);
906                nir_store_var(&b, base_offset, zero, 0x1);
907                nir_jump(&b, nir_jump_break);
908             }
909             nir_pop_if(&b, if_not_equal);
910 
911             /* stream_offset += sizeof(gfx11_sh_query_buffer_mem.stream[0]); */
912             uint64_t buffer_mem_stream0_size =
913                sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream[0]);
914             stream_offset_value = nir_iadd_imm(&b, stream_offset_value, buffer_mem_stream0_size);
915             nir_store_var(&b, stream_offset, stream_offset_value, 0x1);
916 
917             /* } while(count--); */
918             nir_def *loop_count = nir_load_var(&b, count);
919             loop_count = nir_iadd(&b, loop_count, minus_one);
920             nir_store_var(&b, count, loop_count, 0x1);
921 
922             nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, loop_count, zero)); {
923                nir_jump(&b, nir_jump_break);
924             }
925             nir_pop_if(&b, if_zero);
926          }
927          nir_pop_loop(&b, loop_inner); /* Inner loop end */
928       }
929       nir_pop_if(&b, if_overflow);
930 
931       /* base_offset += sizeof(gfx11_sh_query_buffer_mem); */
932       nir_def *buffer_mem_size = nir_imm_int(&b, sizeof(struct gfx11_sh_query_buffer_mem));
933       nir_store_var(&b, base_offset, nir_iadd(&b, nir_load_var(&b, base_offset), buffer_mem_size), 0x1);
934    }
935    nir_pop_loop(&b, loop_outer); /* Outer loop end */
936 
937    nir_def *acc_result_value = nir_load_var(&b, acc_result);
938    nir_def *y_value = nir_load_var(&b, acc_missing);
939 
940    /* if ((chain & 2)) {
941     *    store_ssbo(<acc_result, acc_missing>, 2, 0);
942     *    ...
943     */
944    nir_def *is_write_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), two));
945    nir_if *if_write_summary_buffer = nir_push_if(&b, is_write_summary_buffer); {
946       nir_store_ssbo(&b, nir_vec2(&b, acc_result_value, y_value), two, zero);
947    } nir_push_else(&b, if_write_summary_buffer); {
948 
949       /* } else {
950        *    if ((config & 7) == 1) {
951        *       acc_result = acc_missing ? 0 : 1;
952        *       acc_missing = 0;
953        *    }
954        *    ...
955        */
956       nir_def *is_result_available = nir_ieq(&b, mode, one);
957       nir_def *is_zero = nir_ieq(&b, y_value, zero);
958       acc_result_value = nir_bcsel(&b, is_result_available, nir_b2i32(&b, is_zero), acc_result_value);
959       nir_def *ny = nir_bcsel(&b, is_result_available, zero, y_value);
960 
961       /* if (!acc_missing) {
962        *    store_ssbo(acc_result, 2, 0);
963        *    if (config & 8)) {
964        *       store_ssbo(0, 2, 4)
965        *    }
966        * }
967        */
968       nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, ny, zero)); {
969          nir_store_ssbo(&b, acc_result_value, two, zero);
970 
971          nir_def *is_so_any_overflow = nir_i2b(&b, nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 8));
972          nir_if *if_so_any_overflow = nir_push_if(&b, is_so_any_overflow); {
973             nir_store_ssbo(&b, zero, two, four);
974          }
975          nir_pop_if(&b, if_so_any_overflow);
976       }
977       nir_pop_if(&b, if_zero);
978    }
979    nir_pop_if(&b, if_write_summary_buffer);
980 
981    return si_create_shader_state(sctx, b.shader);
982 }
983